diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 4a03b68..5451542 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -11,7 +11,7 @@ jobs: build: strategy: matrix: - go-version: [1.15.x, 1.16.x, 1.17.x] + go-version: [1.16.x, 1.17.x, 1.18.x] os: [ubuntu-latest, macos-latest, windows-latest] env: CGO_ENABLED: 0 @@ -47,7 +47,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v2 with: - go-version: 1.17.x + go-version: 1.18.x - name: Checkout code uses: actions/checkout@v2 @@ -86,3 +86,5 @@ jobs: CGO_ENABLED: 1 run: go test -no-avx512 -no-avx2 -no-ssse3 -short -race . + - name: Test Microarch v4 + run: go run testlevel.go 4;if [ $? -eq 0 ]; then GOAMD64=v4 go test -no-avx512 ./...; else true; fi diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index fdd619c..0000000 --- a/.travis.yml +++ /dev/null @@ -1,65 +0,0 @@ -language: go - -os: - - linux - - osx - - windows - -arch: - - amd64 - - arm64 - - ppc64le - - s390x - -go: - - 1.14.x - - 1.15.x - - 1.16.x - - master - -env: - - GO111MODULE=off CGO_ENABLED=0 - -install: - - go get ./... - -script: - - go vet ./... - - go test -cpu=1,2 . - - go test -tags=noasm -cpu=1,2 . - - go build examples/simple-decoder.go - - go build examples/simple-encoder.go - - go build examples/stream-decoder.go - - go build examples/stream-encoder.go - -jobs: - allow_failures: - - go: 'master' - - arch: s390x - fast_finish: true - include: - - stage: other - go: 1.16.x - os: linux - arch: amd64 - script: - - diff <(gofmt -d .) <(printf "") - - diff <(gofmt -d ./examples) <(printf "") - - go get github.com/klauspost/asmfmt&&go install github.com/klauspost/asmfmt/cmd/asmfmt - - diff <(asmfmt -d .) <(printf "") - - CGO_ENABLED=1 go test -cpu=1 -short -race . - - CGO_ENABLED=1 go test -cpu=2 -short -race . - - CGO_ENABLED=1 go test -tags=noasm -cpu=1 -short -race . - - CGO_ENABLED=1 go test -tags=noasm -cpu=4 -short -race . - - CGO_ENABLED=1 go test -no-avx512 -short -race . - - CGO_ENABLED=1 go test -no-avx512 -no-avx2 -short -race . - - CGO_ENABLED=1 go test -no-avx512 -no-avx2 -no-ssse3 -short -race . - - GOOS=linux GOARCH=386 go test -short . - - stage: other - go: 1.15.x - os: linux - arch: amd64 - script: - - go test -no-avx512 - - go test -no-avx512 -no-avx2 - - go test -no-avx512 -no-avx2 -no-ssse3 diff --git a/README.md b/README.md index ee8f2ae..22c5a62 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ Using Go modules recommended. # Changes ## 2021 +* Use `GOAMD64=v4` to enable faster AVX2. * Add progressive shard encoding. * Wider AVX2 loops * Limit concurrency on AVX2, since we are likely memory bound. 
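Context for the new "Test Microarch v4" workflow step above: it gates the `GOAMD64=v4` test run on whether the CI host can actually execute v4-level instructions, using the repository's testlevel.go helper. A rough, hypothetical sketch of such a probe follows (this is not the actual testlevel.go, it ignores the level argument for brevity, and it assumes golang.org/x/sys/cpu for feature detection):

package main

import (
	"os"

	"golang.org/x/sys/cpu"
)

func main() {
	// GOAMD64=v4 binaries assume the AVX-512 feature set (F/BW/CD/DQ/VL).
	// Exit non-zero so the workflow can skip the v4 test run on older CPUs.
	if !(cpu.X86.HasAVX512F && cpu.X86.HasAVX512BW && cpu.X86.HasAVX512CD &&
		cpu.X86.HasAVX512DQ && cpu.X86.HasAVX512VL) {
		os.Exit(1)
	}
}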
diff --git a/_gen/cleanup.go b/_gen/cleanup.go new file mode 100644 index 0000000..bcd7964 --- /dev/null +++ b/_gen/cleanup.go @@ -0,0 +1,34 @@ +//go:build custom +// +build custom + +package main + +import ( + "bytes" + "flag" + "io/ioutil" + "log" + "os" + + "github.com/klauspost/asmfmt" +) + +func main() { + flag.Parse() + args := flag.Args() + for _, file := range args { + data, err := ioutil.ReadFile(file) + if err != nil { + log.Fatalln(err) + } + data = bytes.Replace(data, []byte("\t// #"), []byte("#"), -1) + data, err = asmfmt.Format(bytes.NewBuffer(data)) + if err != nil { + log.Fatalln(err) + } + err = ioutil.WriteFile(file, data, os.ModePerm) + if err != nil { + log.Fatalln(err) + } + } +} diff --git a/_gen/gen.go b/_gen/gen.go index 36709e8..40f1227 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -4,6 +4,7 @@ //go:generate go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon //go:generate go fmt ../galois_gen_switch_amd64.go //go:generate go fmt ../galois_gen_amd64.go +//go:generate go run cleanup.go ../galois_gen_amd64.s package main @@ -120,6 +121,17 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { Generate() } +// VPXOR3way will 3-way xor a and b and dst. +func VPXOR3way(a, b, dst reg.VecVirtual) { + Comment("#ifdef GOAMD64_v4") + // AVX512F and AVX512VL required + VPTERNLOGD(U8(0x96), a, b, dst) + Comment("#else") + VPXOR(a, dst, dst) // dst = a^dst + VPXOR(b, dst, dst) // dst = (a^dst)^b + Comment("#endif") +} + func genMulAvx2(name string, inputs int, outputs int, xor bool) { const perLoopBits = 5 const perLoop = 1 << perLoopBits @@ -342,8 +354,7 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { // We don't have any existing data, write directly. 
VPXOR(lookLow, lookHigh, dst[j]) } else { - VPXOR(lookLow, lookHigh, lookLow) - VPXOR(lookLow, dst[j], dst[j]) + VPXOR3way(lookLow, lookHigh, dst[j]) } } } @@ -587,9 +598,9 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow) VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh) VPSHUFB(in2Low, lookLow, lookLow2) - VPSHUFB(inLow, lookLow, lookLow) + VPSHUFB(inLow, lookLow, lookLow) // Reuse lookLow to save a reg VPSHUFB(in2High, lookHigh, lookHigh2) - VPSHUFB(inHigh, lookHigh, lookHigh) + VPSHUFB(inHigh, lookHigh, lookHigh) // Reuse lookHigh to save a reg } else { VPSHUFB(inLow, inLo[i*outputs+j], lookLow) VPSHUFB(in2Low, inLo[i*outputs+j], lookLow2) @@ -601,10 +612,8 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { VPXOR(lookLow, lookHigh, dst[j]) VPXOR(lookLow2, lookHigh2, dst2[j]) } else { - VPXOR(lookLow, lookHigh, lookLow) - VPXOR(lookLow2, lookHigh2, lookLow2) - VPXOR(lookLow, dst[j], dst[j]) - VPXOR(lookLow2, dst2[j], dst2[j]) + VPXOR3way(lookLow, lookHigh, dst[j]) + VPXOR3way(lookLow2, lookHigh2, dst2[j]) } } } diff --git a/_gen/go.mod b/_gen/go.mod index d496f66..913c07d 100644 --- a/_gen/go.mod +++ b/_gen/go.mod @@ -1,5 +1,8 @@ module github.com/klauspost/reedsolomon/_gen -go 1.14 +go 1.16 -require github.com/mmcloughlin/avo v0.2.0 +require ( + github.com/klauspost/asmfmt v1.3.1 + github.com/mmcloughlin/avo v0.4.0 +) diff --git a/_gen/go.sum b/_gen/go.sum index dae4777..111e553 100644 --- a/_gen/go.sum +++ b/_gen/go.sum @@ -1,29 +1,32 @@ -github.com/mmcloughlin/avo v0.2.0 h1:6vhoSaKtxb6f4RiH+LK2qL6GSMpFzhEwJYTTSZNy09w= -github.com/mmcloughlin/avo v0.2.0/go.mod h1:5tidO2Z9Z7N6X7UMcGg+1KTj51O8OxYDCMHxCZTVpEA= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -golang.org/x/arch v0.0.0-20210405154355-08b684f594a5/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4= +github.com/klauspost/asmfmt v1.3.1 h1:7xZi1N7s9gTLbqiM8KUv8TLyysavbTRGBT5/ly0bRtw= +github.com/klauspost/asmfmt v1.3.1/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= +github.com/mmcloughlin/avo v0.4.0 h1:jeHDRktVD+578ULxWpQHkilor6pkdLF7u7EiTzDbfcU= +github.com/mmcloughlin/avo v0.4.0/go.mod h1:RW9BfYA3TgO9uCdNrKU2h6J8cPD8ZLznvfgHAeszb1s= +github.com/yuin/goldmark v1.4.0/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.4.2 h1:Gz96sIWK3OalVv/I/qNygP42zyoKp3xptRVCWRFEBvo= +golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod 
h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57 h1:F5Gozwx4I1xtr/sr/8CFbb57iKi3297KFs0QDbGN60A= -golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021 h1:giLT+HuUP/gXYrG2Plg9WTjj4qhfgaW424ZIFog3rlk= +golang.org/x/sys v0.0.0-20211030160813-b3129d9d1021/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.1.0 h1:po9/4sTYwZU9lPhi1tOrb4hCv3qrhiQ77LZfGa2OjwY= -golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= +golang.org/x/tools v0.1.7 h1:6j8CgantCy3yc8JGBqkDLMKWqZ0RDU2g1HVgacojGWQ= +golang.org/x/tools v0.1.7/go.mod h1:LGqMHiF4EqQNHR1JncWGqT5BVaXmza+X+BDGol+dOxo= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 9bb067f..0000000 --- a/appveyor.yml +++ /dev/null @@ -1,20 +0,0 @@ -os: Visual Studio 2015 - -platform: x64 - -clone_folder: c:\gopath\src\github.com\klauspost\reedsolomon - -# environment variables -environment: - GOPATH: c:\gopath - -install: - - echo %PATH% - - echo %GOPATH% - - go version - - go env - - go get -d ./... - -build_script: - - go test -v -cpu=2 ./... - - go test -cpu=1,2,4 -short -race ./... 
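Throughout the regenerated assembly below, each pair of dependent `VPXOR`s on the Xor paths is collapsed into a single `VPTERNLOGD $0x96` when the file is assembled with `GOAMD64_v4` defined (requiring AVX512F and AVX512VL, as the updated "Requires:" headers note). A minimal, hypothetical pure-Go model of that instruction — illustration only, not part of the patch — shows why immediate 0x96 computes the three-way XOR; since XOR is symmetric, the operand order fed into the truth-table index does not matter for this particular immediate:

package main

import "fmt"

// ternlog models VPTERNLOGD bit-for-bit: the 8-bit immediate is a 3-input
// truth table indexed by the corresponding bits of the three operands.
// 0x96 = 0b10010110 is the parity of the three inputs, i.e. a ^ b ^ dst,
// which is why one VPTERNLOGD can replace the two dependent VPXORs that
// the non-v4 path emits.
func ternlog(imm uint8, a, b, dst uint32) uint32 {
	var out uint32
	for i := uint(0); i < 32; i++ {
		idx := (dst>>i&1)<<2 | (a>>i&1)<<1 | (b >> i & 1)
		out |= uint32(imm>>idx&1) << i
	}
	return out
}

func main() {
	a, b, dst := uint32(0x12345678), uint32(0x9abcdef0), uint32(0x0f0f0f0f)
	fmt.Printf("%#x %#x\n", ternlog(0x96, a, b, dst), a^b^dst) // identical
}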
diff --git a/galoisAvx512_amd64.go b/galoisAvx512_amd64.go index 79207e6..9a249d2 100644 --- a/galoisAvx512_amd64.go +++ b/galoisAvx512_amd64.go @@ -104,7 +104,7 @@ func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[ // Invoke AVX512 routine for single output row in parallel func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) { done := stop - start - if done <= 0 { + if done <= 0 || len(in) == 0 || len(out) == 0 { return } @@ -139,7 +139,7 @@ func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, // Invoke AVX512 routine for 2 output rows in parallel func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) { done := stop - start - if done <= 0 { + if done <= 0 || len(in) == 0 || len(out) == 0 { return } @@ -174,7 +174,7 @@ func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, // Invoke AVX512 routine for 4 output rows in parallel func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix84 *[matrixSize84]byte) { done := stop - start - if done <= 0 { + if done <= 0 || len(in) == 0 || len(out) == 0 { return } diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index 36e885f..5d24082 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -1,9 +1,7 @@ // Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. -// +build !appengine -// +build !noasm -// +build !nogen -// +build gc +//go:build !appengine && !noasm && !nogen && gc +// +build !appengine,!noasm,!nogen,gc #include "textflag.h" @@ -119,7 +117,7 @@ mulAvxTwo_1x1_64_end: RET // func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -156,9 +154,15 @@ mulAvxTwo_1x1Xor_loop: VMOVDQU (DX), Y2 VPSHUFB Y4, Y0, Y4 VPSHUFB Y5, Y1, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 1 outputs VMOVDQU Y2, (DX) ADDQ $0x20, DX @@ -172,7 +176,7 @@ mulAvxTwo_1x1Xor_end: RET // func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -219,11 +223,23 @@ mulAvxTwo_1x1_64Xor_loop: VPSHUFB Y7, Y0, Y7 VPSHUFB Y6, Y1, Y6 VPSHUFB Y8, Y1, Y8 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 1 outputs VMOVDQU Y2, (DX) VMOVDQU Y3, 32(DX) @@ -371,7 +387,7 @@ mulAvxTwo_1x2_64_end: RET // func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x2Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -412,14 +428,27 @@ 
mulAvxTwo_1x2Xor_loop: VMOVDQU (BX), Y4 VPSHUFB Y9, Y0, Y7 VPSHUFB Y10, Y1, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU (DX), Y5 VPSHUFB Y9, Y2, Y7 VPSHUFB Y10, Y3, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 2 outputs VMOVDQU Y4, (BX) ADDQ $0x20, BX @@ -435,7 +464,7 @@ mulAvxTwo_1x2Xor_end: RET // func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -486,21 +515,46 @@ mulAvxTwo_1x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (SI) VMOVDQU Y1, 32(SI) @@ -673,7 +727,7 @@ mulAvxTwo_1x3_64_end: RET // func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -718,19 +772,39 @@ mulAvxTwo_1x3Xor_loop: VMOVDQU (BX), Y6 VPSHUFB Y12, Y0, Y10 VPSHUFB Y13, Y1, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU (SI), Y7 VPSHUFB Y12, Y2, Y10 VPSHUFB Y13, Y3, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU (DX), Y8 VPSHUFB Y12, Y4, Y10 VPSHUFB Y13, Y5, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 3 outputs VMOVDQU Y6, (BX) ADDQ $0x20, BX @@ -748,7 +822,7 @@ mulAvxTwo_1x3Xor_end: RET // func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -803,31 +877,69 @@ mulAvxTwo_1x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, 
Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (SI) VMOVDQU Y1, 32(SI) @@ -926,7 +1038,7 @@ mulAvxTwo_1x4_end: RET // func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -969,30 +1081,57 @@ mulAvxTwo_1x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU (BX), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -1099,7 +1238,7 @@ mulAvxTwo_1x5_end: RET // func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -1144,37 +1283,71 @@ mulAvxTwo_1x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU (BX), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -1292,7 +1465,7 @@ mulAvxTwo_1x6_end: RET // func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -1339,44 +1512,85 @@ mulAvxTwo_1x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU (BX), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -1505,7 +1719,7 @@ mulAvxTwo_1x7_end: RET // func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -1554,51 +1768,99 @@ mulAvxTwo_1x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU (R11), Y5 
VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU (BX), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -1738,7 +2000,7 @@ mulAvxTwo_1x8_end: RET // func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -1789,58 +2051,113 @@ mulAvxTwo_1x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU (BX), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -1991,7 +2308,7 @@ mulAvxTwo_1x9_end: RET // func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -2044,65 +2361,127 @@ mulAvxTwo_1x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 
VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU (BX), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -2264,7 +2643,7 @@ mulAvxTwo_1x10_end: RET // func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -2319,72 +2698,141 @@ mulAvxTwo_1x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - 
VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU (R14), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU (BX), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -2416,7 +2864,7 @@ mulAvxTwo_1x10Xor_end: RET // func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2466,9 +2914,15 @@ mulAvxTwo_2x1_loop: VPAND Y5, Y7, Y7 VPSHUFB Y6, Y2, Y6 VPSHUFB Y7, Y3, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 1 outputs VMOVDQU Y4, (BX) ADDQ $0x20, BX @@ -2482,7 +2936,7 @@ mulAvxTwo_2x1_end: RET // func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2546,11 +3000,23 @@ mulAvxTwo_2x1_64_loop: VPSHUFB Y9, Y2, Y9 VPSHUFB Y8, Y3, Y8 VPSHUFB Y10, Y3, Y10 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 1 outputs VMOVDQU Y4, (BX) VMOVDQU Y5, 32(BX) @@ -2565,7 +3031,7 @@ mulAvxTwo_2x1_64_end: RET // func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2606,9 +3072,15 @@ mulAvxTwo_2x1Xor_loop: VMOVDQU (BX), Y4 VPSHUFB Y6, Y0, Y6 VPSHUFB Y7, Y1, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (CX), Y6 ADDQ $0x20, CX @@ -2617,9 +3089,15 @@ 
mulAvxTwo_2x1Xor_loop: VPAND Y5, Y7, Y7 VPSHUFB Y6, Y2, Y6 VPSHUFB Y7, Y3, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 1 outputs VMOVDQU Y4, (BX) ADDQ $0x20, BX @@ -2633,7 +3111,7 @@ mulAvxTwo_2x1Xor_end: RET // func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2684,11 +3162,23 @@ mulAvxTwo_2x1_64Xor_loop: VPSHUFB Y9, Y0, Y9 VPSHUFB Y8, Y1, Y8 VPSHUFB Y10, Y1, Y10 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y9 @@ -2703,11 +3193,23 @@ mulAvxTwo_2x1_64Xor_loop: VPSHUFB Y9, Y2, Y9 VPSHUFB Y8, Y3, Y8 VPSHUFB Y10, Y3, Y10 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 1 outputs VMOVDQU Y4, (BX) VMOVDQU Y5, 32(BX) @@ -2722,7 +3224,7 @@ mulAvxTwo_2x1_64Xor_end: RET // func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2781,13 +3283,26 @@ mulAvxTwo_2x2_loop: VPAND Y10, Y14, Y14 VPSHUFB Y13, Y4, Y11 VPSHUFB Y14, Y5, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VPSHUFB Y13, Y6, Y11 VPSHUFB Y14, Y7, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 2 outputs VMOVDQU Y8, (SI) ADDQ $0x20, SI @@ -2803,7 +3318,7 @@ mulAvxTwo_2x2_end: RET // func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -2877,21 +3392,46 @@ mulAvxTwo_2x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ 
-2909,7 +3449,7 @@ mulAvxTwo_2x2_64_end: RET // func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2956,14 +3496,27 @@ mulAvxTwo_2x2Xor_loop: VMOVDQU (SI), Y8 VPSHUFB Y13, Y0, Y11 VPSHUFB Y14, Y1, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU (BX), Y9 VPSHUFB Y13, Y2, Y11 VPSHUFB Y14, Y3, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (CX), Y13 ADDQ $0x20, CX @@ -2972,13 +3525,26 @@ mulAvxTwo_2x2Xor_loop: VPAND Y10, Y14, Y14 VPSHUFB Y13, Y4, Y11 VPSHUFB Y14, Y5, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VPSHUFB Y13, Y6, Y11 VPSHUFB Y14, Y7, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 2 outputs VMOVDQU Y8, (SI) ADDQ $0x20, SI @@ -2994,7 +3560,7 @@ mulAvxTwo_2x2Xor_end: RET // func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3047,21 +3613,46 @@ mulAvxTwo_2x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -3078,21 +3669,46 @@ mulAvxTwo_2x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -3110,7 +3726,7 @@ mulAvxTwo_2x2_64Xor_end: RET // func 
mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3174,21 +3790,41 @@ mulAvxTwo_2x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -3206,7 +3842,7 @@ mulAvxTwo_2x3_end: RET // func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3290,31 +3926,69 @@ mulAvxTwo_2x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -3335,7 +4009,7 @@ mulAvxTwo_2x3_64_end: RET // func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3378,23 +4052,43 @@ mulAvxTwo_2x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU (SI), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, 
Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -3405,21 +4099,41 @@ mulAvxTwo_2x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -3437,7 +4151,7 @@ mulAvxTwo_2x3Xor_end: RET // func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3494,31 +4208,69 @@ mulAvxTwo_2x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -3535,31 +4287,69 @@ mulAvxTwo_2x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR 
Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -3580,7 +4370,7 @@ mulAvxTwo_2x3_64Xor_end: RET // func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3651,27 +4441,54 @@ mulAvxTwo_2x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -3691,7 +4508,7 @@ mulAvxTwo_2x4_end: RET // func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3736,30 +4553,57 @@ mulAvxTwo_2x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU (SI), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -3770,27 +4614,54 @@ mulAvxTwo_2x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 
VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -3810,7 +4681,7 @@ mulAvxTwo_2x4Xor_end: RET // func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3888,33 +4759,67 @@ mulAvxTwo_2x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -3936,7 +4841,7 @@ mulAvxTwo_2x5_end: RET // func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3983,37 +4888,71 @@ mulAvxTwo_2x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU (SI), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 5 outputs 
VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -4024,33 +4963,67 @@ mulAvxTwo_2x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4072,7 +5045,7 @@ mulAvxTwo_2x5Xor_end: RET // func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4157,39 +5130,80 @@ mulAvxTwo_2x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4213,7 +5227,7 @@ mulAvxTwo_2x6_end: RET // func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4262,44 +5276,85 @@ mulAvxTwo_2x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif 
VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU (SI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -4310,39 +5365,80 @@ mulAvxTwo_2x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4366,7 +5462,7 @@ mulAvxTwo_2x6Xor_end: RET // func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4458,45 +5554,93 @@ mulAvxTwo_2x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 
VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4522,7 +5666,7 @@ mulAvxTwo_2x7_end: RET // func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4573,51 +5717,99 @@ mulAvxTwo_2x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU (SI), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -4628,45 +5820,93 @@ mulAvxTwo_2x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 
512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4692,7 +5932,7 @@ mulAvxTwo_2x7Xor_end: RET // func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4791,51 +6031,106 @@ mulAvxTwo_2x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + 
+#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4863,7 +6158,7 @@ mulAvxTwo_2x8_end: RET // func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4916,58 +6211,113 @@ mulAvxTwo_2x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU (SI), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -4978,51 +6328,106 @@ mulAvxTwo_2x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 
VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -5050,7 +6455,7 @@ mulAvxTwo_2x8Xor_end: RET // func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -5156,57 +6561,119 @@ mulAvxTwo_2x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -5236,7 +6703,7 @@ mulAvxTwo_2x9_end: RET // func 
mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -5291,65 +6758,127 @@ mulAvxTwo_2x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU (SI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -5360,57 +6889,119 @@ mulAvxTwo_2x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR 
Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -5440,7 +7031,7 @@ mulAvxTwo_2x9Xor_end: RET // func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -5553,63 +7144,132 @@ mulAvxTwo_2x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -5641,7 +7301,7 @@ mulAvxTwo_2x10_end: RET // func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -5698,72 +7358,141 @@ mulAvxTwo_2x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU (R15), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU (SI), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 
+ +#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -5774,63 +7503,132 @@ mulAvxTwo_2x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -5862,7 +7660,7 @@ mulAvxTwo_2x10Xor_end: RET // func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -5916,9 +7714,15 @@ mulAvxTwo_3x1_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y2, Y8 VPSHUFB Y9, Y3, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (CX), Y8 ADDQ $0x20, CX @@ -5927,9 +7731,15 @@ mulAvxTwo_3x1_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y4, Y8 VPSHUFB Y9, Y5, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 1 outputs VMOVDQU Y6, (SI) ADDQ $0x20, SI @@ -5943,7 
+7753,7 @@ mulAvxTwo_3x1_end: RET // func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6009,11 +7819,23 @@ mulAvxTwo_3x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -6030,11 +7852,23 @@ mulAvxTwo_3x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -6049,7 +7883,7 @@ mulAvxTwo_3x1_64_end: RET // func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -6094,9 +7928,15 @@ mulAvxTwo_3x1Xor_loop: VMOVDQU (SI), Y6 VPSHUFB Y8, Y0, Y8 VPSHUFB Y9, Y1, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -6105,9 +7945,15 @@ mulAvxTwo_3x1Xor_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y2, Y8 VPSHUFB Y9, Y3, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (CX), Y8 ADDQ $0x20, CX @@ -6116,9 +7962,15 @@ mulAvxTwo_3x1Xor_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y4, Y8 VPSHUFB Y9, Y5, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 1 outputs VMOVDQU Y6, (SI) ADDQ $0x20, SI @@ -6132,7 +7984,7 @@ mulAvxTwo_3x1Xor_end: RET // func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6183,11 +8035,23 @@ mulAvxTwo_3x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -6204,11 +8068,23 @@ mulAvxTwo_3x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -6225,11 +8101,23 @@ mulAvxTwo_3x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -6244,7 +8132,7 @@ mulAvxTwo_3x1_64Xor_end: RET // func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6303,15 +8191,28 @@ mulAvxTwo_3x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -6322,15 +8223,28 @@ mulAvxTwo_3x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -6346,7 +8260,7 @@ mulAvxTwo_3x2_end: RET // func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6422,21 +8336,46 @@ mulAvxTwo_3x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -6453,21 +8392,46 @@ mulAvxTwo_3x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -6485,7 +8449,7 @@ mulAvxTwo_3x2_64_end: RET // func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6528,16 +8492,29 @@ mulAvxTwo_3x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -6548,15 +8525,28 @@ mulAvxTwo_3x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -6567,15 +8557,28 @@ mulAvxTwo_3x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -6591,7 +8594,7 @@ mulAvxTwo_3x2Xor_end: RET // func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6646,21 +8649,46 @@ mulAvxTwo_3x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, 
Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -6677,21 +8705,46 @@ mulAvxTwo_3x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -6708,21 +8761,46 @@ mulAvxTwo_3x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -6740,7 +8818,7 @@ mulAvxTwo_3x2_64Xor_end: RET // func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6806,21 +8884,41 @@ mulAvxTwo_3x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -6831,21 +8929,41 @@ mulAvxTwo_3x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -6863,7 +8981,7 @@ mulAvxTwo_3x3_end: RET // func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6949,31 +9067,69 @@ mulAvxTwo_3x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -6990,31 +9146,69 @@ mulAvxTwo_3x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -7035,7 +9229,7 @@ mulAvxTwo_3x3_64_end: RET // func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3Xor(SB), 
NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7080,23 +9274,43 @@ mulAvxTwo_3x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU (DI), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -7107,21 +9321,41 @@ mulAvxTwo_3x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -7132,21 +9366,41 @@ mulAvxTwo_3x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -7164,7 +9418,7 @@ mulAvxTwo_3x3Xor_end: RET // func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7223,31 +9477,69 @@ mulAvxTwo_3x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + 
VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -7264,31 +9556,69 @@ mulAvxTwo_3x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -7305,31 +9635,69 @@ mulAvxTwo_3x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -7350,7 +9718,7 @@ mulAvxTwo_3x3_64Xor_end: RET // func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7423,27 +9791,54 @@ mulAvxTwo_3x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, 
Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -7454,27 +9849,54 @@ mulAvxTwo_3x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -7494,7 +9916,7 @@ mulAvxTwo_3x4_end: RET // func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7541,30 +9963,57 @@ mulAvxTwo_3x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU (DI), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -7575,27 +10024,54 @@ mulAvxTwo_3x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 
320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -7606,27 +10082,54 @@ mulAvxTwo_3x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -7646,7 +10149,7 @@ mulAvxTwo_3x4Xor_end: RET // func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7726,33 +10229,67 @@ mulAvxTwo_3x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -7763,33 +10300,67 @@ mulAvxTwo_3x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 
VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -7811,7 +10382,7 @@ mulAvxTwo_3x5_end: RET // func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7860,37 +10431,71 @@ mulAvxTwo_3x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU (DI), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -7901,33 +10506,67 @@ mulAvxTwo_3x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB 
Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -7938,33 +10577,67 @@ mulAvxTwo_3x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -7986,7 +10659,7 @@ mulAvxTwo_3x5Xor_end: RET // func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -8073,39 +10746,80 @@ mulAvxTwo_3x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -8116,39 +10830,80 @@ mulAvxTwo_3x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, 
Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -8172,7 +10927,7 @@ mulAvxTwo_3x6_end: RET // func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -8223,44 +10978,85 @@ mulAvxTwo_3x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU (DI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -8271,39 +11067,80 @@ mulAvxTwo_3x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR 
Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -8314,39 +11151,80 @@ mulAvxTwo_3x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -8370,7 +11248,7 @@ mulAvxTwo_3x6Xor_end: RET // func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -8464,45 +11342,93 @@ mulAvxTwo_3x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR 
Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -8513,45 +11439,93 @@ mulAvxTwo_3x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -8577,7 +11551,7 @@ mulAvxTwo_3x7_end: RET // func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -8630,51 +11604,99 @@ mulAvxTwo_3x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 
(R11), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU (DI), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -8685,45 +11707,93 @@ mulAvxTwo_3x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -8734,45 +11804,93 @@ mulAvxTwo_3x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - 
VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -8798,7 +11916,7 @@ mulAvxTwo_3x7Xor_end: RET // func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -8899,51 +12017,106 @@ mulAvxTwo_3x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -8954,51 +12127,106 @@ mulAvxTwo_3x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, 
Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -9026,7 +12254,7 @@ mulAvxTwo_3x8_end: RET // func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -9081,58 +12309,113 @@ mulAvxTwo_3x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU (DI), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -9143,51 +12426,106 @@ mulAvxTwo_3x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -9198,51 +12536,106 @@ mulAvxTwo_3x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -9270,7 +12663,7 @@ mulAvxTwo_3x8Xor_end: RET // func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -9378,57 +12771,119 @@ mulAvxTwo_3x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -9439,57 +12894,119 @@ mulAvxTwo_3x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR 
Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -9519,7 +13036,7 @@ mulAvxTwo_3x9_end: RET // func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -9576,65 +13093,127 @@ mulAvxTwo_3x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 
288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU (R15), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU (DI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -9645,57 +13224,119 @@ mulAvxTwo_3x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 
bytes from input 2 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -9706,57 +13347,119 @@ mulAvxTwo_3x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -9786,7 +13489,7 @@ mulAvxTwo_3x9Xor_end: RET // func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -9903,63 +13606,132 @@ mulAvxTwo_3x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y13 ADDQ $0x20, AX @@ -9970,63 +13742,132 @@ mulAvxTwo_3x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + 
VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -10058,7 +13899,7 @@ mulAvxTwo_3x10_end: RET // func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -10119,72 +13960,141 @@ mulAvxTwo_3x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU (R15), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU (SI), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 1 to 10 
outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -10195,63 +14105,132 @@ mulAvxTwo_3x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y13 ADDQ $0x20, AX @@ -10262,63 +14241,132 @@ mulAvxTwo_3x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR 
Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -10350,7 +14398,7 @@ mulAvxTwo_3x10Xor_end: RET // func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -10408,9 +14456,15 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y2, Y10 VPSHUFB Y11, Y3, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -10419,9 +14473,15 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y4, Y10 VPSHUFB Y11, Y5, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (CX), Y10 ADDQ $0x20, CX @@ -10430,9 +14490,15 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y6, Y10 VPSHUFB Y11, Y7, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 1 outputs VMOVDQU Y8, (DI) ADDQ $0x20, DI @@ -10446,7 +14512,7 @@ mulAvxTwo_4x1_end: RET // func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10514,11 +14580,23 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, 
Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -10535,11 +14613,23 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -10556,11 +14646,23 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -10575,7 +14677,7 @@ mulAvxTwo_4x1_64_end: RET // func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -10624,9 +14726,15 @@ mulAvxTwo_4x1Xor_loop: VMOVDQU (DI), Y8 VPSHUFB Y10, Y0, Y10 VPSHUFB Y11, Y1, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -10635,9 +14743,15 @@ mulAvxTwo_4x1Xor_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y2, Y10 VPSHUFB Y11, Y3, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -10646,9 +14760,15 @@ mulAvxTwo_4x1Xor_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y4, Y10 VPSHUFB Y11, Y5, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (CX), Y10 ADDQ $0x20, CX @@ -10657,9 +14777,15 @@ mulAvxTwo_4x1Xor_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y6, Y10 VPSHUFB Y11, Y7, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 1 outputs VMOVDQU Y8, (DI) ADDQ $0x20, DI @@ -10673,7 +14799,7 @@ mulAvxTwo_4x1Xor_end: RET // func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10726,11 +14852,23 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ 
-10747,11 +14885,23 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -10768,11 +14918,23 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -10789,11 +14951,23 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -10808,7 +14982,7 @@ mulAvxTwo_4x1_64Xor_end: RET // func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10869,15 +15043,28 @@ mulAvxTwo_4x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -10888,15 +15075,28 @@ mulAvxTwo_4x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -10907,15 +15107,28 @@ mulAvxTwo_4x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -10931,7 +15144,7 @@ mulAvxTwo_4x2_end: RET // func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, 
AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11009,21 +15222,46 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -11040,21 +15278,46 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -11071,21 +15334,46 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -11103,7 +15391,7 @@ mulAvxTwo_4x2_64_end: RET // func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11148,16 +15436,29 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -11168,15 +15469,28 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -11187,15 +15501,28 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -11206,15 +15533,28 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -11230,7 +15570,7 @@ mulAvxTwo_4x2Xor_end: RET // func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11287,21 +15627,46 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -11318,21 +15683,46 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 
VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -11349,21 +15739,46 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -11380,21 +15795,46 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -11412,7 +15852,7 @@ mulAvxTwo_4x2_64Xor_end: RET // func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11480,21 +15920,41 @@ mulAvxTwo_4x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -11505,21 +15965,41 @@ mulAvxTwo_4x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif 
VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -11530,21 +16010,41 @@ mulAvxTwo_4x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -11562,7 +16062,7 @@ mulAvxTwo_4x3_end: RET // func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11650,31 +16150,69 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -11691,31 +16229,69 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -11732,31 +16308,69 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -11777,7 +16391,7 @@ mulAvxTwo_4x3_64_end: RET // func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11824,23 +16438,43 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -11851,21 +16485,41 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, 
Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -11876,21 +16530,41 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -11901,21 +16575,41 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -11933,7 +16627,7 @@ mulAvxTwo_4x3Xor_end: RET // func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11994,31 +16688,69 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 
VMOVDQU 32(SI), Y13 @@ -12035,31 +16767,69 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -12076,31 +16846,69 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -12117,31 +16925,69 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB 
Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -12162,7 +17008,7 @@ mulAvxTwo_4x3_64Xor_end: RET // func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -12237,27 +17083,54 @@ mulAvxTwo_4x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -12268,27 +17141,54 @@ mulAvxTwo_4x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -12299,27 +17199,54 @@ mulAvxTwo_4x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR 
Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -12339,7 +17266,7 @@ mulAvxTwo_4x4_end: RET // func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -12388,30 +17315,57 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU (R8), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -12422,27 +17376,54 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -12453,27 +17434,54 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from 
input 3 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -12484,27 +17492,54 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -12524,7 +17559,7 @@ mulAvxTwo_4x4Xor_end: RET // func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -12606,33 +17641,67 @@ mulAvxTwo_4x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -12643,33 +17712,67 @@ mulAvxTwo_4x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 
VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -12680,33 +17783,67 @@ mulAvxTwo_4x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -12728,7 +17865,7 @@ mulAvxTwo_4x5_end: RET // func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -12779,37 +17916,71 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU (R8), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -12820,33 +17991,67 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, 
Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -12857,33 +18062,67 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -12894,33 +18133,67 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -12942,7 +18215,7 @@ mulAvxTwo_4x5Xor_end: RET // func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in 
GP registers @@ -13031,39 +18304,80 @@ mulAvxTwo_4x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -13074,39 +18388,80 @@ mulAvxTwo_4x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -13117,39 +18472,80 @@ mulAvxTwo_4x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -13173,7 +18569,7 @@ mulAvxTwo_4x6_end: RET // func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -13226,44 +18622,85 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU (R8), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -13274,39 +18711,80 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -13317,39 +18795,80 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -13360,39 +18879,80 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 
@@ -13416,7 +18976,7 @@ mulAvxTwo_4x6Xor_end: RET // func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -13512,45 +19072,93 @@ mulAvxTwo_4x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -13561,45 +19169,93 @@ mulAvxTwo_4x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -13610,45 +19266,93 @@ mulAvxTwo_4x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -13674,7 +19378,7 @@ mulAvxTwo_4x7_end: RET // func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -13729,51 +19433,99 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, 
Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU (R8), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -13784,45 +19536,93 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -13833,45 +19633,93 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 
1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -13882,45 +19730,93 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -13946,7 +19842,7 @@ mulAvxTwo_4x7Xor_end: RET // func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -14049,51 +19945,106 @@ mulAvxTwo_4x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, 
Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -14104,51 +20055,106 @@ mulAvxTwo_4x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -14159,51 +20165,106 @@ mulAvxTwo_4x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, 
Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -14231,7 +20292,7 @@ mulAvxTwo_4x8_end: RET // func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -14288,58 +20349,113 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU (R15), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU (R8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI 
@@ -14350,51 +20466,106 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -14405,51 +20576,106 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -14460,51 +20686,106 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -14532,7 +20813,7 @@ mulAvxTwo_4x8Xor_end: RET // func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -14644,57 +20925,119 @@ mulAvxTwo_4x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, 
Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -14705,57 +21048,119 @@ mulAvxTwo_4x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y12 ADDQ $0x20, AX @@ -14766,57 +21171,119 @@ mulAvxTwo_4x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -14846,7 +21313,7 @@ mulAvxTwo_4x9_end: RET // func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -14907,65 +21374,127 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 
(R11), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU (R15), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU (DI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -14976,57 +21505,119 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, 
Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -15037,57 +21628,119 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y12 ADDQ $0x20, AX @@ -15098,57 +21751,119 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 
+ +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -15178,7 +21893,7 @@ mulAvxTwo_4x9Xor_end: RET // func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -15273,63 +21988,132 @@ mulAvxTwo_4x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -15340,63 +22124,132 @@ mulAvxTwo_4x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -15407,63 +22260,132 @@ mulAvxTwo_4x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 
VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R8), R10 VMOVDQU Y0, (R10)(R9*1) @@ -15496,7 +22418,7 @@ mulAvxTwo_4x10_end: RET // func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -15536,81 +22458,150 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif MOVQ 24(R8), R10 VMOVDQU (R10)(R9*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif MOVQ 48(R8), R10 VMOVDQU (R10)(R9*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif MOVQ 72(R8), R10 VMOVDQU (R10)(R9*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + 
+#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif MOVQ 96(R8), R10 VMOVDQU (R10)(R9*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif MOVQ 120(R8), R10 VMOVDQU (R10)(R9*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif MOVQ 144(R8), R10 VMOVDQU (R10)(R9*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif MOVQ 168(R8), R10 VMOVDQU (R10)(R9*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif MOVQ 192(R8), R10 VMOVDQU (R10)(R9*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif MOVQ 216(R8), R10 VMOVDQU (R10)(R9*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -15621,63 +22612,132 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + 
+#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -15688,63 +22748,132 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -15755,63 +22884,132 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R8), R10 VMOVDQU Y0, (R10)(R9*1) @@ -15844,7 +23042,7 @@ mulAvxTwo_4x10Xor_end: RET // func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -15906,9 +23104,15 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y2, Y12 VPSHUFB Y13, Y3, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y12, Y13, Y10 + +#else + VPXOR Y12, Y10, Y10 + VPXOR Y13, Y10, Y10 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -15917,9 +23121,15 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y4, Y12 VPSHUFB Y13, Y5, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y12, Y13, Y10 + +#else + VPXOR Y12, Y10, Y10 + VPXOR Y13, Y10, Y10 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -15928,9 +23138,15 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y6, Y12 VPSHUFB Y13, Y7, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y12, Y13, Y10 + +#else + VPXOR Y12, Y10, Y10 + VPXOR Y13, Y10, Y10 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX @@ -15939,9 +23155,15 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y8, Y12 VPSHUFB Y13, Y9, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y12, Y13, Y10 + +#else + VPXOR Y12, Y10, Y10 + VPXOR Y13, Y10, Y10 + +#endif // Store 1 outputs VMOVDQU Y10, (R8) ADDQ $0x20, R8 @@ -15955,7 +23177,7 @@ mulAvxTwo_5x1_end: RET // func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16025,11 +23247,23 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -16046,11 +23280,23 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -16067,11 +23313,23 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -16088,11 +23346,23 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -16107,7 +23377,7 @@ mulAvxTwo_5x1_64_end: RET // func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -16160,9 +23430,15 @@ mulAvxTwo_5x1Xor_loop: VMOVDQU (R8), Y10 VPSHUFB Y12, Y0, Y12 VPSHUFB Y13, Y1, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y12, Y13, Y10 + +#else + VPXOR Y12, Y10, Y10 + VPXOR Y13, Y10, Y10 + +#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -16171,9 +23447,15 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y2, Y12 VPSHUFB Y13, Y3, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, 
Y10 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y12, Y13, Y10 + +#else + VPXOR Y12, Y10, Y10 + VPXOR Y13, Y10, Y10 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -16182,9 +23464,15 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y4, Y12 VPSHUFB Y13, Y5, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y12, Y13, Y10 + +#else + VPXOR Y12, Y10, Y10 + VPXOR Y13, Y10, Y10 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -16193,9 +23481,15 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y6, Y12 VPSHUFB Y13, Y7, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y12, Y13, Y10 + +#else + VPXOR Y12, Y10, Y10 + VPXOR Y13, Y10, Y10 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX @@ -16204,9 +23498,15 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y8, Y12 VPSHUFB Y13, Y9, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y12, Y13, Y10 + +#else + VPXOR Y12, Y10, Y10 + VPXOR Y13, Y10, Y10 + +#endif // Store 1 outputs VMOVDQU Y10, (R8) ADDQ $0x20, R8 @@ -16220,7 +23520,7 @@ mulAvxTwo_5x1Xor_end: RET // func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16275,11 +23575,23 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -16296,11 +23608,23 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -16317,11 +23641,23 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -16338,11 +23674,23 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -16359,11 +23707,23 
@@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -16378,7 +23738,7 @@ mulAvxTwo_5x1_64Xor_end: RET // func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16441,15 +23801,28 @@ mulAvxTwo_5x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -16460,15 +23833,28 @@ mulAvxTwo_5x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -16479,15 +23865,28 @@ mulAvxTwo_5x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -16498,15 +23897,28 @@ mulAvxTwo_5x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -16522,7 +23934,7 @@ mulAvxTwo_5x2_end: RET // func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16602,21 +24014,46 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR 
Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -16633,21 +24070,46 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -16664,21 +24126,46 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -16695,21 +24182,46 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -16727,7 +24239,7 @@ mulAvxTwo_5x2_64_end: RET // func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2Xor(SB), NOSPLIT, $0-88 
// Loading no tables to registers // Destination kept in GP registers @@ -16774,16 +24286,29 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -16794,15 +24319,28 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -16813,15 +24351,28 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -16832,15 +24383,28 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -16851,15 +24415,28 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -16875,7 +24452,7 @@ mulAvxTwo_5x2Xor_end: RET // func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16934,21 +24511,46 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + 
VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -16965,21 +24567,46 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -16996,21 +24623,46 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -17027,21 +24679,46 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -17058,21 +24735,46 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -17090,7 +24792,7 @@ mulAvxTwo_5x2_64Xor_end: RET // func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -17160,21 +24862,41 @@ mulAvxTwo_5x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -17185,21 +24907,41 @@ mulAvxTwo_5x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -17210,21 +24952,41 @@ mulAvxTwo_5x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -17235,21 +24997,41 @@ mulAvxTwo_5x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB 
Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -17267,7 +25049,7 @@ mulAvxTwo_5x3_end: RET // func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -17357,31 +25139,69 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -17398,31 +25218,69 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -17439,31 +25297,69 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - 
VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -17480,31 +25376,69 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -17525,7 +25459,7 @@ mulAvxTwo_5x3_64_end: RET // func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -17574,23 +25508,43 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // 
Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -17601,21 +25555,41 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -17626,21 +25600,41 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -17651,21 +25645,41 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -17676,21 +25690,41 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -17708,7 +25742,7 @@ mulAvxTwo_5x3Xor_end: RET // func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -17771,31 +25805,69 
@@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -17812,31 +25884,69 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -17853,31 +25963,69 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, 
Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -17894,31 +26042,69 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -17935,31 +26121,69 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -17980,7 +26204,7 @@ mulAvxTwo_5x3_64Xor_end: RET // func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -18057,27 +26281,54 @@ mulAvxTwo_5x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, 
Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -18088,27 +26339,54 @@ mulAvxTwo_5x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -18119,27 +26397,54 @@ mulAvxTwo_5x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -18150,27 +26455,54 @@ mulAvxTwo_5x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif 
VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -18190,7 +26522,7 @@ mulAvxTwo_5x4_end: RET // func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -18241,30 +26573,57 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -18275,27 +26634,54 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -18306,27 +26692,54 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + 
+#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -18337,27 +26750,54 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -18368,27 +26808,54 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -18408,7 +26875,7 @@ mulAvxTwo_5x4Xor_end: RET // func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -18492,33 +26959,67 @@ mulAvxTwo_5x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, 
Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -18529,33 +27030,67 @@ mulAvxTwo_5x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -18566,33 +27101,67 @@ mulAvxTwo_5x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -18603,33 +27172,67 @@ mulAvxTwo_5x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + 
+#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -18651,7 +27254,7 @@ mulAvxTwo_5x5_end: RET // func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -18704,37 +27307,71 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU (R9), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -18745,33 +27382,67 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -18782,33 +27453,67 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -18819,33 +27524,67 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -18856,33 +27595,67 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -18904,7 +27677,7 @@ mulAvxTwo_5x5Xor_end: RET // func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x6(SB), NOSPLIT, 
$0-88 // Loading no tables to registers // Destination kept in GP registers @@ -18995,39 +27768,80 @@ mulAvxTwo_5x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -19038,39 +27852,80 @@ mulAvxTwo_5x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -19081,39 +27936,80 @@ mulAvxTwo_5x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, 
Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -19124,39 +28020,80 @@ mulAvxTwo_5x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -19180,7 +28117,7 @@ mulAvxTwo_5x6_end: RET // func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -19235,44 +28172,85 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR 
Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU (R9), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -19283,39 +28261,80 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -19326,39 +28345,80 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, 
Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -19369,39 +28429,80 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -19412,39 +28513,80 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -19468,7 +28610,7 @@ mulAvxTwo_5x6Xor_end: RET // func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -19566,45 +28708,93 @@ mulAvxTwo_5x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else 
+ VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -19615,45 +28805,93 @@ mulAvxTwo_5x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -19664,45 +28902,93 @@ mulAvxTwo_5x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, 
Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -19713,45 +28999,93 @@ mulAvxTwo_5x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -19777,7 +29111,7 @@ mulAvxTwo_5x7_end: RET // func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -19834,51 +29168,99 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU (R15), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU (R9), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -19889,45 +29271,93 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -19938,45 +29368,93 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, 
Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -19987,45 +29465,93 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -20036,45 +29562,93 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - 
VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -20100,7 +29674,7 @@ mulAvxTwo_5x7Xor_end: RET // func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -20207,51 +29781,106 @@ mulAvxTwo_5x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // 
Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -20262,51 +29891,106 @@ mulAvxTwo_5x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -20317,51 +30001,106 @@ mulAvxTwo_5x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), 
Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (AX), Y11 ADDQ $0x20, AX @@ -20372,51 +30111,106 @@ mulAvxTwo_5x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -20444,7 +30238,7 @@ mulAvxTwo_5x8_end: RET // func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -20505,58 +30299,113 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + 
+#endif VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU (R15), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU (R8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -20567,51 +30416,106 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -20622,51 +30526,106 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, 
Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -20677,51 +30636,106 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR 
Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (AX), Y11 ADDQ $0x20, AX @@ -20732,51 +30746,106 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -20804,7 +30873,7 @@ mulAvxTwo_5x8Xor_end: RET // func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -20896,57 +30965,119 @@ mulAvxTwo_5x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, 
Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -20957,57 +31088,119 @@ mulAvxTwo_5x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -21018,57 +31211,119 @@ 
mulAvxTwo_5x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -21079,57 +31334,119 @@ mulAvxTwo_5x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 
VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) @@ -21160,7 +31477,7 @@ mulAvxTwo_5x9_end: RET // func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -21202,73 +31519,135 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif 
MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -21279,57 +31658,119 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -21340,57 +31781,119 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -21401,57 +31904,119 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, 
Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -21462,57 +32027,119 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) @@ -21543,7 +32170,7 @@ mulAvxTwo_5x9Xor_end: RET // func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -21640,63 +32267,132 @@ mulAvxTwo_5x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 
VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -21707,63 +32403,132 @@ mulAvxTwo_5x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 
- VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -21774,63 +32539,132 @@ mulAvxTwo_5x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -21841,63 +32675,132 @@ mulAvxTwo_5x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 
2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) @@ -21930,7 +32833,7 @@ mulAvxTwo_5x10_end: RET // func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -21972,81 +32875,150 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif MOVQ 216(R9), R11 VMOVDQU (R11)(R10*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -22057,63 +33029,132 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + 
+#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -22124,63 +33165,132 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -22191,63 +33301,132 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, 
Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -22258,63 +33437,132 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) @@ -22347,7 +33595,7 @@ mulAvxTwo_5x10Xor_end: RET // func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -22413,9 +33661,15 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y2, Y14 VPSHUFB Y15, Y3, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI @@ -22424,9 +33678,15 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y4, Y14 VPSHUFB Y15, Y5, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI @@ -22435,9 +33695,15 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y6, Y14 VPSHUFB Y15, Y7, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 @@ -22446,9 +33712,15 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y8, Y14 VPSHUFB Y15, Y9, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (CX), Y14 ADDQ $0x20, CX @@ -22457,9 +33729,15 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y10, Y14 VPSHUFB Y15, Y11, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Store 1 outputs VMOVDQU Y12, (R9) ADDQ $0x20, R9 @@ -22473,7 +33751,7 @@ mulAvxTwo_6x1_end: RET // func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, 
start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -22545,11 +33823,23 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -22566,11 +33856,23 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -22587,11 +33889,23 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -22608,11 +33922,23 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -22629,11 +33955,23 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -22648,7 +33986,7 @@ mulAvxTwo_6x1_64_end: RET // func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -22705,9 +34043,15 @@ mulAvxTwo_6x1Xor_loop: VMOVDQU (R9), Y12 VPSHUFB Y14, Y0, Y14 VPSHUFB Y15, Y1, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX @@ -22716,9 +34060,15 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y2, Y14 VPSHUFB Y15, Y3, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + 
+#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI @@ -22727,9 +34077,15 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y4, Y14 VPSHUFB Y15, Y5, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI @@ -22738,9 +34094,15 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y6, Y14 VPSHUFB Y15, Y7, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 @@ -22749,9 +34111,15 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y8, Y14 VPSHUFB Y15, Y9, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (CX), Y14 ADDQ $0x20, CX @@ -22760,9 +34128,15 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y10, Y14 VPSHUFB Y15, Y11, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y14, Y15, Y12 + +#else + VPXOR Y14, Y12, Y12 + VPXOR Y15, Y12, Y12 + +#endif // Store 1 outputs VMOVDQU Y12, (R9) ADDQ $0x20, R9 @@ -22776,7 +34150,7 @@ mulAvxTwo_6x1Xor_end: RET // func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -22833,11 +34207,23 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -22854,11 +34240,23 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -22875,11 +34273,23 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -22896,11 +34306,23 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + 
VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -22917,11 +34339,23 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -22938,11 +34372,23 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -22957,7 +34403,7 @@ mulAvxTwo_6x1_64Xor_end: RET // func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -23022,15 +34468,28 @@ mulAvxTwo_6x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -23041,15 +34500,28 @@ mulAvxTwo_6x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -23060,15 +34532,28 @@ mulAvxTwo_6x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -23079,15 +34564,28 @@ mulAvxTwo_6x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -23098,15 +34596,28 @@ mulAvxTwo_6x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -23122,7 +34633,7 @@ mulAvxTwo_6x2_end: RET // func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -23204,21 +34715,46 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -23235,21 +34771,46 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -23266,21 +34827,46 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -23297,21 +34883,46 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -23328,21 +34939,46 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -23360,7 +34996,7 @@ mulAvxTwo_6x2_64_end: RET // func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -23409,16 +35045,29 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -23429,15 +35078,28 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -23448,15 +35110,28 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, 
Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -23467,15 +35142,28 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -23486,15 +35174,28 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -23505,15 +35206,28 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -23529,7 +35243,7 @@ mulAvxTwo_6x2Xor_end: RET // func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -23590,21 +35304,46 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -23621,21 +35360,46 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + 
+#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -23652,21 +35416,46 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -23683,21 +35472,46 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -23714,21 +35528,46 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -23745,21 +35584,46 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, 
Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -23777,7 +35641,7 @@ mulAvxTwo_6x2_64Xor_end: RET // func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -23849,21 +35713,41 @@ mulAvxTwo_6x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -23874,21 +35758,41 @@ mulAvxTwo_6x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -23899,21 +35803,41 @@ mulAvxTwo_6x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -23924,21 +35848,41 @@ mulAvxTwo_6x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, 
Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -23949,21 +35893,41 @@ mulAvxTwo_6x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -23981,7 +35945,7 @@ mulAvxTwo_6x3_end: RET // func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -24073,31 +36037,69 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -24114,31 +36116,69 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR 
Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -24155,31 +36195,69 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -24196,31 +36274,69 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -24237,31 +36353,69 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, 
Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -24282,7 +36436,7 @@ mulAvxTwo_6x3_64_end: RET // func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -24333,23 +36487,43 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -24360,21 +36534,41 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -24385,21 +36579,41 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, 
Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -24410,21 +36624,41 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -24435,21 +36669,41 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -24460,21 +36714,41 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -24492,7 +36766,7 @@ mulAvxTwo_6x3Xor_end: RET // func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -24557,31 +36831,69 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - 
VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -24598,31 +36910,69 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -24639,31 +36989,69 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -24680,31 +37068,69 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, 
Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -24721,31 +37147,69 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -24762,31 +37226,69 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, 
Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -24807,7 +37309,7 @@ mulAvxTwo_6x3_64Xor_end: RET // func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -24886,27 +37388,54 @@ mulAvxTwo_6x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -24917,27 +37446,54 @@ mulAvxTwo_6x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -24948,27 +37504,54 @@ mulAvxTwo_6x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, 
Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -24979,27 +37562,54 @@ mulAvxTwo_6x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -25010,27 +37620,54 @@ mulAvxTwo_6x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -25050,7 +37687,7 @@ mulAvxTwo_6x4_end: RET // func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -25103,30 +37740,57 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -25137,27 +37801,54 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 
288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -25168,27 +37859,54 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -25199,27 +37917,54 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -25230,27 +37975,54 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR 
Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -25261,27 +38033,54 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -25301,7 +38100,7 @@ mulAvxTwo_6x4Xor_end: RET // func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -25387,33 +38186,67 @@ mulAvxTwo_6x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -25424,33 +38257,67 @@ mulAvxTwo_6x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - 
VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -25461,33 +38328,67 @@ mulAvxTwo_6x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -25498,33 +38399,67 @@ mulAvxTwo_6x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -25535,33 +38470,67 @@ mulAvxTwo_6x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -25583,7 +38552,7 @@ mulAvxTwo_6x5_end: RET // func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -25638,37 +38607,71 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -25679,33 +38682,67 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, 
Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -25716,33 +38753,67 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -25753,33 +38824,67 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -25790,33 +38895,67 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, 
Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -25827,33 +38966,67 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -25875,7 +39048,7 @@ mulAvxTwo_6x5Xor_end: RET // func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -25968,39 +39141,80 @@ mulAvxTwo_6x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -26011,39 +39225,80 @@ mulAvxTwo_6x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 
+ VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -26054,39 +39309,80 @@ mulAvxTwo_6x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -26097,39 +39393,80 @@ mulAvxTwo_6x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 
+ +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -26140,39 +39477,80 @@ mulAvxTwo_6x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -26196,7 +39574,7 @@ mulAvxTwo_6x6_end: RET // func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -26253,44 +39631,85 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU (R15), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU (R10), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -26301,39 +39720,80 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -26344,39 +39804,80 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -26387,39 +39888,80 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -26430,39 +39972,80 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -26473,39 +40056,80 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 
VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -26529,7 +40153,7 @@ mulAvxTwo_6x6Xor_end: RET // func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -26631,45 +40255,93 @@ mulAvxTwo_6x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -26680,45 +40352,93 @@ mulAvxTwo_6x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -26729,45 +40449,93 @@ mulAvxTwo_6x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -26778,45 +40546,93 @@ mulAvxTwo_6x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), 
Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y10 ADDQ $0x20, AX @@ -26827,45 +40643,93 @@ mulAvxTwo_6x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -26891,7 +40755,7 @@ mulAvxTwo_6x7_end: RET // func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -26952,51 +40816,99 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, 
Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU (R15), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU (R9), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -27007,45 +40919,93 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -27056,45 +41016,93 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -27105,45 +41113,93 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -27154,45 +41210,93 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 
2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y10 ADDQ $0x20, AX @@ -27203,45 +41307,93 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -27267,7 +41419,7 @@ mulAvxTwo_6x7Xor_end: RET // func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -27356,51 +41508,106 @@ mulAvxTwo_6x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - 
VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -27411,51 +41618,106 @@ mulAvxTwo_6x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -27466,51 +41728,106 @@ mulAvxTwo_6x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -27521,51 +41838,106 @@ mulAvxTwo_6x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 5 to 8 
outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -27576,51 +41948,106 @@ mulAvxTwo_6x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -27649,7 +42076,7 @@ mulAvxTwo_6x8_end: RET // func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -27693,65 +42120,120 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, 
Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -27762,51 +42244,106 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -27817,51 +42354,106 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 
VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -27872,51 +42464,106 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -27927,51 +42574,106 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, 
Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -27982,51 +42684,106 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 
+ VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -28055,7 +42812,7 @@ mulAvxTwo_6x8Xor_end: RET // func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -28149,57 +42906,119 @@ mulAvxTwo_6x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -28210,57 +43029,119 @@ mulAvxTwo_6x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR 
Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -28271,57 +43152,119 @@ mulAvxTwo_6x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -28332,57 +43275,119 @@ mulAvxTwo_6x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -28393,57 +43398,119 @@ mulAvxTwo_6x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, 
Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -28474,7 +43541,7 @@ mulAvxTwo_6x9_end: RET // func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -28518,73 +43585,135 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB 
Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif MOVQ 192(R10), R12 VMOVDQU (R12)(R11*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -28595,57 +43724,119 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -28656,57 +43847,119 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, 
Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -28717,57 +43970,119 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, 
Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -28778,57 +44093,119 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -28839,57 +44216,119 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -28920,7 +44359,7 @@ mulAvxTwo_6x9Xor_end: RET // func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -29019,63 +44458,132 @@ mulAvxTwo_6x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -29086,63 +44594,132 @@ mulAvxTwo_6x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -29153,63 +44730,132 @@ mulAvxTwo_6x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, 
Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -29220,63 +44866,132 @@ mulAvxTwo_6x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR 
Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -29287,63 +45002,132 @@ mulAvxTwo_6x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -29376,7 +45160,7 @@ mulAvxTwo_6x10_end: RET // func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -29420,81 +45204,150 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif MOVQ 192(R10), R12 VMOVDQU (R12)(R11*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif MOVQ 216(R10), R12 VMOVDQU (R12)(R11*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + 
+#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -29505,63 +45358,132 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -29572,63 +45494,132 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -29639,63 +45630,132 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR 
Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -29706,63 +45766,132 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -29773,63 +45902,132 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR 
Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -29862,7 +46060,7 @@ mulAvxTwo_6x10Xor_end: RET // func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -29922,9 +46120,15 @@ mulAvxTwo_7x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -29935,9 +46139,15 @@ mulAvxTwo_7x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -29948,9 +46158,15 @@ mulAvxTwo_7x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -29961,9 +46177,15 @@ mulAvxTwo_7x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, 
Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -29974,9 +46196,15 @@ mulAvxTwo_7x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -29987,9 +46215,15 @@ mulAvxTwo_7x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Store 1 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -30003,7 +46237,7 @@ mulAvxTwo_7x1_end: RET // func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -30077,11 +46311,23 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -30098,11 +46344,23 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -30119,11 +46377,23 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -30140,11 +46410,23 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -30161,11 +46443,23 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 
64 bytes from input 6 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -30182,11 +46476,23 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -30201,7 +46507,7 @@ mulAvxTwo_7x1_64_end: RET // func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -30250,9 +46556,15 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI @@ -30263,9 +46575,15 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -30276,9 +46594,15 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -30289,9 +46613,15 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -30302,9 +46632,15 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -30315,9 +46651,15 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -30328,9 +46670,15 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Store 1 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -30344,7 +46692,7 @@ mulAvxTwo_7x1Xor_end: RET // func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -30403,11 +46751,23 @@ 
mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -30424,11 +46784,23 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -30445,11 +46817,23 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -30466,11 +46850,23 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -30487,11 +46883,23 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -30508,11 +46916,23 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -30529,11 +46949,23 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -30548,7 +46980,7 @@ mulAvxTwo_7x1_64Xor_end: RET // func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: 
AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -30615,15 +47047,28 @@ mulAvxTwo_7x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -30634,15 +47079,28 @@ mulAvxTwo_7x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -30653,15 +47111,28 @@ mulAvxTwo_7x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -30672,15 +47143,28 @@ mulAvxTwo_7x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -30691,15 +47175,28 @@ mulAvxTwo_7x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -30710,15 +47207,28 @@ mulAvxTwo_7x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -30734,7 +47244,7 @@ mulAvxTwo_7x2_end: RET // func 
mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -30818,21 +47328,46 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -30849,21 +47384,46 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -30880,21 +47440,46 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -30911,21 +47496,46 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, 
Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -30942,21 +47552,46 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -30973,21 +47608,46 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -31005,7 +47665,7 @@ mulAvxTwo_7x2_64_end: RET // func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -31056,16 +47716,29 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -31076,15 +47749,28 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ 
-31095,15 +47781,28 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -31114,15 +47813,28 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -31133,15 +47845,28 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -31152,15 +47877,28 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -31171,15 +47909,28 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -31195,7 +47946,7 @@ mulAvxTwo_7x2Xor_end: RET // func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -31258,21 +48009,46 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 
VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -31289,21 +48065,46 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -31320,21 +48121,46 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -31351,21 +48177,46 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -31382,21 +48233,46 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR 
Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -31413,21 +48289,46 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -31444,21 +48345,46 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -31476,7 +48402,7 @@ mulAvxTwo_7x2_64Xor_end: RET // func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -31550,21 +48476,41 @@ mulAvxTwo_7x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -31575,21 +48521,41 @@ mulAvxTwo_7x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -31600,21 +48566,41 @@ mulAvxTwo_7x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -31625,21 +48611,41 @@ mulAvxTwo_7x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -31650,21 +48656,41 @@ mulAvxTwo_7x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -31675,21 +48701,41 @@ mulAvxTwo_7x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB 
Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -31707,7 +48753,7 @@ mulAvxTwo_7x3_end: RET // func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -31801,31 +48847,69 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -31842,31 +48926,69 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -31883,31 +49005,69 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 
+ VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -31924,31 +49084,69 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -31965,31 +49163,69 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes 
from input 6 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -32006,31 +49242,69 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -32051,7 +49325,7 @@ mulAvxTwo_7x3_64_end: RET // func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -32104,23 +49378,43 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -32131,21 +49425,41 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -32156,21 +49470,41 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif 
VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -32181,21 +49515,41 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -32206,21 +49560,41 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -32231,21 +49605,41 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -32256,21 +49650,41 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, 
Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -32288,7 +49702,7 @@ mulAvxTwo_7x3Xor_end: RET // func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -32355,31 +49769,69 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -32396,31 +49848,69 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -32437,31 +49927,69 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB 
Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -32478,31 +50006,69 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -32519,31 +50085,69 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -32560,31 
+50164,69 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -32601,31 +50243,69 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -32646,7 +50326,7 @@ mulAvxTwo_7x3_64Xor_end: RET // func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -32727,27 +50407,54 @@ mulAvxTwo_7x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -32758,27 +50465,54 @@ mulAvxTwo_7x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -32789,27 +50523,54 @@ mulAvxTwo_7x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -32820,27 +50581,54 @@ mulAvxTwo_7x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -32851,27 +50639,54 @@ mulAvxTwo_7x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, 
Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -32882,27 +50697,54 @@ mulAvxTwo_7x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -32922,7 +50764,7 @@ mulAvxTwo_7x4_end: RET // func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -32977,30 +50819,57 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -33011,27 +50880,54 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 
320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -33042,27 +50938,54 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -33073,27 +50996,54 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -33104,27 +51054,54 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 
VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -33135,27 +51112,54 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -33166,27 +51170,54 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -33206,7 +51237,7 @@ mulAvxTwo_7x4Xor_end: RET // func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -33294,33 +51325,67 @@ mulAvxTwo_7x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 
VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -33331,33 +51396,67 @@ mulAvxTwo_7x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -33368,33 +51467,67 @@ mulAvxTwo_7x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -33405,33 +51538,67 @@ mulAvxTwo_7x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 
- VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -33442,33 +51609,67 @@ mulAvxTwo_7x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -33479,33 +51680,67 @@ mulAvxTwo_7x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -33527,7 +51762,7 @@ mulAvxTwo_7x5_end: RET // func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -33584,37 +51819,71 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 
VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU (R15), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -33625,33 +51894,67 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -33662,33 +51965,67 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -33699,33 +52036,67 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB 
Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -33736,33 +52107,67 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -33773,33 +52178,67 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + 
+#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -33810,33 +52249,67 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -33858,7 +52331,7 @@ mulAvxTwo_7x5Xor_end: RET // func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -33955,39 +52428,80 @@ mulAvxTwo_7x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -33998,39 +52512,80 @@ mulAvxTwo_7x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR 
Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -34041,39 +52596,80 @@ mulAvxTwo_7x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -34084,39 +52680,80 @@ mulAvxTwo_7x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -34127,39 +52764,80 @@ mulAvxTwo_7x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y9 ADDQ $0x20, AX @@ -34170,39 +52848,80 @@ mulAvxTwo_7x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -34226,7 +52945,7 @@ mulAvxTwo_7x6_end: RET // func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 
TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -34287,44 +53006,85 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU (R15), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU (R10), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -34335,39 +53095,80 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -34378,39 +53179,80 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, 
Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -34421,39 +53263,80 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -34464,39 +53347,80 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB 
Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -34507,39 +53431,80 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y9 ADDQ $0x20, AX @@ -34550,39 +53515,80 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -34606,7 +53612,7 @@ mulAvxTwo_7x6Xor_end: RET // func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, 
AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -34692,45 +53698,93 @@ mulAvxTwo_7x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -34741,45 +53795,93 @@ mulAvxTwo_7x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU 
(R8), Y10 ADDQ $0x20, R8 @@ -34790,45 +53892,93 @@ mulAvxTwo_7x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -34839,45 +53989,93 @@ mulAvxTwo_7x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -34888,45 +54086,93 @@ mulAvxTwo_7x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, 
Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -34937,45 +54183,93 @@ mulAvxTwo_7x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -35002,7 +54296,7 @@ mulAvxTwo_7x7_end: RET // func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 
// Loading no tables to registers // Destination kept on stack @@ -35048,57 +54342,105 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -35109,45 +54451,93 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -35158,45 +54548,93 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -35207,45 +54645,93 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from 
input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -35256,45 +54742,93 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -35305,45 +54839,93 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -35354,45 +54936,93 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 
2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -35419,7 +55049,7 @@ mulAvxTwo_7x7Xor_end: RET // func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -35510,51 +55140,106 @@ mulAvxTwo_7x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else 
+ VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -35565,51 +55250,106 @@ mulAvxTwo_7x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -35620,51 +55360,106 @@ mulAvxTwo_7x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + 
+#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -35675,51 +55470,106 @@ mulAvxTwo_7x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -35730,51 +55580,106 @@ mulAvxTwo_7x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 
2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -35785,51 +55690,106 @@ mulAvxTwo_7x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -35858,7 +55818,7 @@ mulAvxTwo_7x8_end: RET // func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -35904,65 +55864,120 @@ 
mulAvxTwo_7x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -35973,51 +55988,106 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - 
VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -36028,51 +56098,106 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -36083,51 +56208,106 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 
+ VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -36138,51 +56318,106 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -36193,51 +56428,106 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 
+ +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -36248,51 +56538,106 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -36321,7 +56666,7 @@ 
mulAvxTwo_7x8Xor_end: RET // func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -36417,57 +56762,119 @@ mulAvxTwo_7x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -36478,57 +56885,119 @@ mulAvxTwo_7x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, 
Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -36539,57 +57008,119 @@ mulAvxTwo_7x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 
bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -36600,57 +57131,119 @@ mulAvxTwo_7x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -36661,57 +57254,119 @@ mulAvxTwo_7x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else 
+ VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -36722,57 +57377,119 @@ mulAvxTwo_7x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -36803,7 +57520,7 @@ mulAvxTwo_7x9_end: RET // func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: 
AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -36849,73 +57566,135 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -36926,57 +57705,119 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 
800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -36987,57 +57828,119 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB 
Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -37048,57 +57951,119 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -37109,57 +58074,119 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + 
+#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -37170,57 +58197,119 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes 
from input 6 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -37231,57 +58320,119 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -37312,7 +58463,7 @@ mulAvxTwo_7x9Xor_end: RET // func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -37413,63 +58564,132 @@ mulAvxTwo_7x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -37480,63 +58700,132 @@ mulAvxTwo_7x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + 
VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -37547,63 +58836,132 @@ mulAvxTwo_7x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -37614,63 +58972,132 @@ mulAvxTwo_7x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR 
Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -37681,63 +59108,132 @@ mulAvxTwo_7x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -37748,63 +59244,132 @@ mulAvxTwo_7x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + 
VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -37837,7 +59402,7 @@ mulAvxTwo_7x10_end: RET // func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -37883,81 +59448,150 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif MOVQ 216(R11), R13 VMOVDQU (R13)(R12*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -37968,63 +59602,132 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, 
Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -38035,63 +59738,132 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, 
Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -38102,63 +59874,132 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), 
Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -38169,63 +60010,132 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -38236,63 +60146,132 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, 
Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -38303,63 +60282,132 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 
+ VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -38392,7 +60440,7 @@ mulAvxTwo_7x10Xor_end: RET // func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -38454,9 +60502,15 @@ mulAvxTwo_8x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -38467,9 +60521,15 @@ mulAvxTwo_8x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -38480,9 +60540,15 @@ mulAvxTwo_8x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -38493,9 +60559,15 @@ mulAvxTwo_8x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -38506,9 +60578,15 @@ mulAvxTwo_8x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -38519,9 +60597,15 @@ mulAvxTwo_8x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -38532,9 +60616,15 @@ mulAvxTwo_8x1_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Store 1 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -38548,7 +60638,7 @@ mulAvxTwo_8x1_end: RET // func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, 
out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -38624,11 +60714,23 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -38645,11 +60747,23 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -38666,11 +60780,23 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -38687,11 +60813,23 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -38708,11 +60846,23 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -38729,11 +60879,23 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -38750,11 +60912,23 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 
outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -38769,7 +60943,7 @@ mulAvxTwo_8x1_64_end: RET // func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -38820,9 +60994,15 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI @@ -38833,9 +61013,15 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -38846,9 +61032,15 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -38859,9 +61051,15 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -38872,9 +61070,15 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -38885,9 +61089,15 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -38898,9 +61108,15 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -38911,9 +61127,15 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Store 1 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -38927,7 +61149,7 @@ mulAvxTwo_8x1Xor_end: RET // func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -38988,11 +61210,23 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -39009,11 +61243,23 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -39030,11 +61276,23 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -39051,11 +61309,23 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -39072,11 +61342,23 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -39093,11 +61375,23 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -39114,11 +61408,23 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -39135,11 +61441,23 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -39154,7 +61472,7 @@ mulAvxTwo_8x1_64Xor_end: RET // func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -39223,15 +61541,28 @@ mulAvxTwo_8x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -39242,15 +61573,28 @@ mulAvxTwo_8x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -39261,15 +61605,28 @@ mulAvxTwo_8x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -39280,15 +61637,28 @@ mulAvxTwo_8x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -39299,15 +61669,28 @@ mulAvxTwo_8x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -39318,15 +61701,28 @@ mulAvxTwo_8x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 
832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -39337,15 +61733,28 @@ mulAvxTwo_8x2_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -39361,7 +61770,7 @@ mulAvxTwo_8x2_end: RET // func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -39447,21 +61856,46 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -39478,21 +61912,46 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -39509,21 +61968,46 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -39540,21 +62024,46 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -39571,21 +62080,46 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -39602,21 +62136,46 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -39633,21 +62192,46 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, 
Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -39665,7 +62249,7 @@ mulAvxTwo_8x2_64_end: RET // func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -39718,16 +62302,29 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -39738,15 +62335,28 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -39757,15 +62367,28 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -39776,15 +62399,28 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -39795,15 +62431,28 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -39814,15 +62463,28 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 672(CX), Y4 
VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -39833,15 +62495,28 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -39852,15 +62527,28 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -39876,7 +62564,7 @@ mulAvxTwo_8x2Xor_end: RET // func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -39941,21 +62629,46 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -39972,21 +62685,46 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -40003,21 +62741,46 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -40034,21 +62797,46 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -40065,21 +62853,46 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -40096,21 +62909,46 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, 
Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -40127,21 +62965,46 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -40158,21 +63021,46 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -40190,7 +63078,7 @@ mulAvxTwo_8x2_64Xor_end: RET // func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -40266,21 +63154,41 @@ mulAvxTwo_8x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -40291,21 +63199,41 @@ mulAvxTwo_8x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR 
Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -40316,21 +63244,41 @@ mulAvxTwo_8x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -40341,21 +63289,41 @@ mulAvxTwo_8x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -40366,21 +63334,41 @@ mulAvxTwo_8x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -40391,21 +63379,41 @@ mulAvxTwo_8x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -40416,21 +63424,41 @@ mulAvxTwo_8x3_loop: 
VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -40448,7 +63476,7 @@ mulAvxTwo_8x3_end: RET // func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -40544,31 +63572,69 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -40585,31 +63651,69 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, 
Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -40626,31 +63730,69 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -40667,31 +63809,69 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -40708,31 +63888,69 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + 
+#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -40749,31 +63967,69 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -40790,31 +64046,69 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -40835,7 +64129,7 @@ mulAvxTwo_8x3_64_end: RET // func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -40890,23 +64184,43 @@ 
mulAvxTwo_8x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -40917,21 +64231,41 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -40942,21 +64276,41 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -40967,21 +64321,41 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -40992,21 +64366,41 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 
+ VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -41017,21 +64411,41 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -41042,21 +64456,41 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -41067,21 +64501,41 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -41099,7 +64553,7 @@ mulAvxTwo_8x3Xor_end: RET // func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -41168,31 +64622,69 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, 
Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -41209,31 +64701,69 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -41250,31 +64780,69 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -41291,31 +64859,69 @@ 
mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -41332,31 +64938,69 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -41373,31 +65017,69 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB 
Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -41414,31 +65096,69 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -41455,31 +65175,69 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -41500,7 +65258,7 @@ mulAvxTwo_8x3_64Xor_end: RET // func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -41583,27 +65341,54 @@ mulAvxTwo_8x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -41614,27 +65399,54 @@ mulAvxTwo_8x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -41645,27 +65457,54 @@ mulAvxTwo_8x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -41676,27 +65515,54 @@ mulAvxTwo_8x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + 
+#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -41707,27 +65573,54 @@ mulAvxTwo_8x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -41738,27 +65631,54 @@ mulAvxTwo_8x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -41769,27 +65689,54 @@ mulAvxTwo_8x4_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -41809,7 +65756,7 @@ mulAvxTwo_8x4_end: RET // func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 
TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -41866,30 +65813,57 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU (R15), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -41900,27 +65874,54 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -41931,27 +65932,54 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -41962,27 +65990,54 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, 
Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -41993,27 +66048,54 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -42024,27 +66106,54 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -42055,27 +66164,54 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -42086,27 +66222,54 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -42126,7 +66289,7 @@ mulAvxTwo_8x4Xor_end: RET // func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -42218,33 +66381,67 @@ mulAvxTwo_8x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -42255,33 +66452,67 @@ mulAvxTwo_8x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -42292,33 +66523,67 @@ mulAvxTwo_8x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -42329,33 +66594,67 @@ mulAvxTwo_8x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -42366,33 +66665,67 @@ mulAvxTwo_8x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 
+ +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -42403,33 +66736,67 @@ mulAvxTwo_8x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y8 ADDQ $0x20, AX @@ -42440,33 +66807,67 @@ mulAvxTwo_8x5_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -42488,7 +66889,7 @@ mulAvxTwo_8x5_end: RET // func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -42549,37 +66950,71 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU (R15), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -42590,33 +67025,67 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -42627,33 +67096,67 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 
bytes from input 3 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -42664,33 +67167,67 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -42701,33 +67238,67 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -42738,33 +67309,67 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, 
Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -42775,33 +67380,67 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y8 ADDQ $0x20, AX @@ -42812,33 +67451,67 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -42860,7 +67533,7 @@ mulAvxTwo_8x5Xor_end: RET // func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -42943,39 +67616,80 @@ mulAvxTwo_8x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 
VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -42986,39 +67700,80 @@ mulAvxTwo_8x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -43029,39 +67784,80 @@ mulAvxTwo_8x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, 
Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -43072,39 +67868,80 @@ mulAvxTwo_8x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -43115,39 +67952,80 @@ mulAvxTwo_8x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -43158,39 +68036,80 @@ mulAvxTwo_8x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + 
+#endif VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -43201,39 +68120,80 @@ mulAvxTwo_8x6_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -43258,7 +68218,7 @@ mulAvxTwo_8x6_end: RET // func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -43306,49 +68266,90 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif MOVQ 48(R12), R14 VMOVDQU 
(R14)(R13*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -43359,39 +68360,80 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -43402,39 +68444,80 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 
1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -43445,39 +68528,80 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -43488,39 +68612,80 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -43531,39 +68696,80 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 
1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -43574,39 +68780,80 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -43617,39 +68864,80 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, 
Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -43674,7 +68962,7 @@ mulAvxTwo_8x6Xor_end: RET // func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -43762,45 +69050,93 @@ mulAvxTwo_8x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -43811,45 +69147,93 @@ mulAvxTwo_8x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 
VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -43860,45 +69244,93 @@ mulAvxTwo_8x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -43909,45 +69341,93 @@ mulAvxTwo_8x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -43958,45 +69438,93 @@ mulAvxTwo_8x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -44007,45 +69535,93 @@ mulAvxTwo_8x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y8 VMOVDQU 
2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -44056,45 +69632,93 @@ mulAvxTwo_8x7_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -44121,7 +69745,7 @@ mulAvxTwo_8x7_end: RET // func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -44169,57 +69793,105 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif MOVQ 72(R12), 
R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -44230,45 +69902,93 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -44279,45 +69999,93 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, 
Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -44328,45 +70096,93 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -44377,45 +70193,93 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - 
VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -44426,45 +70290,93 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -44475,45 +70387,93 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 
+ +#endif VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -44524,45 +70484,93 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -44589,7 +70597,7 @@ mulAvxTwo_8x7Xor_end: RET // func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -44682,51 +70690,106 @@ mulAvxTwo_8x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 
VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -44737,51 +70800,106 @@ mulAvxTwo_8x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -44792,51 +70910,106 @@ mulAvxTwo_8x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - 
VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -44847,51 +71020,106 @@ mulAvxTwo_8x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // 
Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -44902,51 +71130,106 @@ mulAvxTwo_8x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -44957,51 +71240,106 @@ mulAvxTwo_8x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3456(CX), Y9 VMOVDQU 
3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -45012,51 +71350,106 @@ mulAvxTwo_8x8_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -45085,7 +71478,7 @@ mulAvxTwo_8x8_end: RET // func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -45133,65 +71526,120 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, 
Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -45202,51 +71650,106 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -45257,51 +71760,106 @@ 
mulAvxTwo_8x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -45312,51 +71870,106 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -45367,51 +71980,106 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -45422,51 +72090,106 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + 
+#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -45477,51 +72200,106 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -45532,51 +72310,106 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + 
VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -45605,7 +72438,7 @@ mulAvxTwo_8x8Xor_end: RET // func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -45703,57 +72536,119 @@ mulAvxTwo_8x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 
VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -45764,57 +72659,119 @@ mulAvxTwo_8x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -45825,57 +72782,119 @@ mulAvxTwo_8x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + 
VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -45886,57 +72905,119 @@ mulAvxTwo_8x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + 
+#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -45947,57 +73028,119 @@ mulAvxTwo_8x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -46008,57 +73151,119 @@ mulAvxTwo_8x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -46069,57 +73274,119 @@ mulAvxTwo_8x9_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -46150,7 +73417,7 @@ mulAvxTwo_8x9_end: RET // func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// 
Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -46198,73 +73465,135 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -46275,57 +73604,119 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + 
+#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -46336,57 +73727,119 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 
1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -46397,57 +73850,119 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -46458,57 +73973,119 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + 
VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -46519,57 +74096,119 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 
+ +#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -46580,57 +74219,119 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -46641,57 +74342,119 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -46722,7 +74485,7 @@ mulAvxTwo_8x9Xor_end: RET // func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -46825,63 +74588,132 @@ mulAvxTwo_8x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -46892,63 +74724,132 @@ mulAvxTwo_8x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -46959,63 +74860,132 @@ mulAvxTwo_8x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, 
Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -47026,63 +74996,132 @@ mulAvxTwo_8x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR 
Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -47093,63 +75132,132 @@ mulAvxTwo_8x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -47160,63 +75268,132 @@ mulAvxTwo_8x10_loop: VMOVDQU 3872(CX), 
Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -47227,63 +75404,132 @@ mulAvxTwo_8x10_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, 
Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -47316,7 +75562,7 @@ mulAvxTwo_8x10_end: RET // func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -47364,81 +75610,150 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 
VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif MOVQ 216(R12), R14 VMOVDQU (R14)(R13*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -47449,63 +75764,132 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 
+ +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -47516,63 +75900,132 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -47583,63 +76036,132 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 
+ VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -47650,63 +76172,132 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + 
+#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -47717,63 +76308,132 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -47784,63 +76444,132 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, 
Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -47851,63 +76580,132 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -47940,7 +76738,7 @@ mulAvxTwo_8x10Xor_end: RET // func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -48004,9 +76802,15 @@ mulAvxTwo_9x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -48017,9 +76821,15 @@ mulAvxTwo_9x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -48030,9 +76840,15 @@ mulAvxTwo_9x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -48043,9 +76859,15 @@ mulAvxTwo_9x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -48056,9 +76878,15 @@ mulAvxTwo_9x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -48069,9 +76897,15 @@ mulAvxTwo_9x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 @@ -48082,9 +76916,15 @@ mulAvxTwo_9x1_loop: VMOVDQU 480(CX), Y3 VPSHUFB 
Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -48095,9 +76935,15 @@ mulAvxTwo_9x1_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Store 1 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -48111,7 +76957,7 @@ mulAvxTwo_9x1_end: RET // func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -48189,11 +77035,23 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -48210,11 +77068,23 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -48231,11 +77101,23 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -48252,11 +77134,23 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -48273,11 +77167,23 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -48294,11 +77200,23 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + 
+#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 @@ -48315,11 +77233,23 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -48336,11 +77266,23 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -48355,7 +77297,7 @@ mulAvxTwo_9x1_64_end: RET // func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -48408,9 +77350,15 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI @@ -48421,9 +77369,15 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -48434,9 +77388,15 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -48447,9 +77407,15 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -48460,9 +77426,15 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -48473,9 +77445,15 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ 
-48486,9 +77464,15 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 @@ -48499,9 +77483,15 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -48512,9 +77502,15 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Store 1 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -48528,7 +77524,7 @@ mulAvxTwo_9x1Xor_end: RET // func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -48591,11 +77587,23 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -48612,11 +77620,23 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -48633,11 +77653,23 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -48654,11 +77686,23 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -48675,11 +77719,23 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -48696,11 +77752,23 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -48717,11 +77785,23 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 @@ -48738,11 +77818,23 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -48759,11 +77851,23 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -48778,7 +77882,7 @@ mulAvxTwo_9x1_64Xor_end: RET // func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -48849,15 +77953,28 @@ mulAvxTwo_9x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -48868,15 +77985,28 @@ mulAvxTwo_9x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU 
(R8), Y5 ADDQ $0x20, R8 @@ -48887,15 +78017,28 @@ mulAvxTwo_9x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -48906,15 +78049,28 @@ mulAvxTwo_9x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -48925,15 +78081,28 @@ mulAvxTwo_9x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -48944,15 +78113,28 @@ mulAvxTwo_9x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 @@ -48963,15 +78145,28 @@ mulAvxTwo_9x2_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -48982,15 +78177,28 @@ mulAvxTwo_9x2_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -49006,7 +78214,7 @@ mulAvxTwo_9x2_end: RET // func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, 
AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -49094,21 +78302,46 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -49125,21 +78358,46 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -49156,21 +78414,46 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -49187,21 +78470,46 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -49218,21 +78526,46 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -49249,21 +78582,46 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 @@ -49280,21 +78638,46 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -49311,21 +78694,46 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, 
Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -49343,7 +78751,7 @@ mulAvxTwo_9x2_64_end: RET // func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -49398,16 +78806,29 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -49418,15 +78839,28 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -49437,15 +78871,28 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -49456,15 +78903,28 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -49475,15 +78935,28 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -49494,15 +78967,28 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, 
Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -49513,15 +78999,28 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 @@ -49532,15 +79031,28 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -49551,15 +79063,28 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -49575,7 +79100,7 @@ mulAvxTwo_9x2Xor_end: RET // func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -49642,21 +79167,46 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -49673,21 +79223,46 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + 
+#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -49704,21 +79279,46 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -49735,21 +79335,46 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -49766,21 +79391,46 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -49797,21 +79447,46 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -49828,21 +79503,46 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 @@ -49859,21 +79559,46 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -49890,21 +79615,46 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -49922,7 +79672,7 @@ mulAvxTwo_9x2_64Xor_end: RET // func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, 
AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -50000,21 +79750,41 @@ mulAvxTwo_9x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -50025,21 +79795,41 @@ mulAvxTwo_9x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -50050,21 +79840,41 @@ mulAvxTwo_9x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -50075,21 +79885,41 @@ mulAvxTwo_9x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -50100,21 +79930,41 @@ mulAvxTwo_9x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), 
Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -50125,21 +79975,41 @@ mulAvxTwo_9x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 @@ -50150,21 +80020,41 @@ mulAvxTwo_9x3_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -50175,21 +80065,41 @@ mulAvxTwo_9x3_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -50207,7 +80117,7 @@ mulAvxTwo_9x3_end: RET // func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -50305,31 +80215,69 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 
+ VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -50346,31 +80294,69 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -50387,31 +80373,69 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from 
input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -50428,31 +80452,69 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -50469,31 +80531,69 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -50510,31 +80610,69 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 
1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 @@ -50551,31 +80689,69 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -50592,31 +80768,69 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -50637,7 +80851,7 @@ mulAvxTwo_9x3_64_end: RET // func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -50694,23 +80908,43 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB 
Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU (R15), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -50721,21 +80955,41 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -50746,21 +81000,41 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -50771,21 +81045,41 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -50796,21 +81090,41 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR 
Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -50821,21 +81135,41 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -50846,21 +81180,41 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 @@ -50871,21 +81225,41 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -50896,21 +81270,41 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -50928,7 +81322,7 @@ mulAvxTwo_9x3Xor_end: RET // func 
mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -50999,31 +81393,69 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -51040,31 +81472,69 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -51081,31 +81551,69 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -51122,31 +81630,69 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -51163,31 +81709,69 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -51204,31 +81788,69 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, 
Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -51245,31 +81867,69 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 @@ -51286,31 +81946,69 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, 
Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -51327,31 +82025,69 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -51372,7 +82108,7 @@ mulAvxTwo_9x3_64Xor_end: RET // func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -51459,27 +82195,54 @@ mulAvxTwo_9x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -51490,27 +82253,54 @@ mulAvxTwo_9x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + 
+#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -51521,27 +82311,54 @@ mulAvxTwo_9x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -51552,27 +82369,54 @@ mulAvxTwo_9x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -51583,27 +82427,54 @@ mulAvxTwo_9x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -51614,27 +82485,54 @@ mulAvxTwo_9x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, 
Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -51645,27 +82543,54 @@ mulAvxTwo_9x4_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (AX), Y7 ADDQ $0x20, AX @@ -51676,27 +82601,54 @@ mulAvxTwo_9x4_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -51716,7 +82668,7 @@ mulAvxTwo_9x4_end: RET // func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -51777,30 +82729,57 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - 
VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU (R15), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -51811,27 +82790,54 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -51842,27 +82848,54 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -51873,27 +82906,54 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR 
Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -51904,27 +82964,54 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -51935,27 +83022,54 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -51966,27 +83080,54 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -51997,27 +83138,54 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y5 
VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (AX), Y7 ADDQ $0x20, AX @@ -52028,27 +83196,54 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -52068,7 +83263,7 @@ mulAvxTwo_9x4Xor_end: RET // func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -52148,33 +83343,67 @@ mulAvxTwo_9x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -52185,33 +83414,67 @@ mulAvxTwo_9x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 
VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -52222,33 +83485,67 @@ mulAvxTwo_9x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -52259,33 +83556,67 @@ mulAvxTwo_9x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -52296,33 +83627,67 @@ mulAvxTwo_9x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, 
Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 @@ -52333,33 +83698,67 @@ mulAvxTwo_9x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 @@ -52370,33 +83769,67 @@ mulAvxTwo_9x5_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif 
// Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -52407,33 +83840,67 @@ mulAvxTwo_9x5_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -52456,7 +83923,7 @@ mulAvxTwo_9x5_end: RET // func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -52506,41 +83973,75 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -52551,33 +84052,67 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 
VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -52588,33 +84123,67 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -52625,33 +84194,67 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -52662,33 +84265,67 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, 
Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -52699,33 +84336,67 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 @@ -52736,33 +84407,67 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 @@ -52773,33 +84478,67 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -52810,33 +84549,67 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -52859,7 +84632,7 @@ mulAvxTwo_9x5Xor_end: RET // func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -52944,39 +84717,80 @@ mulAvxTwo_9x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 
+ +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -52987,39 +84801,80 @@ mulAvxTwo_9x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -53030,39 +84885,80 @@ mulAvxTwo_9x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -53073,39 +84969,80 @@ mulAvxTwo_9x6_loop: VMOVDQU 
1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -53116,39 +85053,80 @@ mulAvxTwo_9x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -53159,39 +85137,80 @@ mulAvxTwo_9x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, 
Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 @@ -53202,39 +85221,80 @@ mulAvxTwo_9x6_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -53245,39 +85305,80 @@ mulAvxTwo_9x6_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 
- VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -53302,7 +85403,7 @@ mulAvxTwo_9x6_end: RET // func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -53352,49 +85453,90 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -53405,39 +85547,80 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, 
Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -53448,39 +85631,80 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -53491,39 +85715,80 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -53534,39 +85799,80 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + 
+#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -53577,39 +85883,80 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -53620,39 +85967,80 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 @@ -53663,39 +86051,80 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -53706,39 +86135,80 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -53763,7 +86233,7 @@ mulAvxTwo_9x6Xor_end: RET // func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, 
AVX512VL, SSE2 TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -53853,45 +86323,93 @@ mulAvxTwo_9x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -53902,45 +86420,93 @@ mulAvxTwo_9x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -53951,45 +86517,93 
@@ mulAvxTwo_9x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -54000,45 +86614,93 @@ mulAvxTwo_9x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -54049,45 +86711,93 @@ mulAvxTwo_9x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -54098,45 +86808,93 @@ mulAvxTwo_9x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 @@ -54147,45 +86905,93 @@ mulAvxTwo_9x7_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 
3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -54196,45 +87002,93 @@ mulAvxTwo_9x7_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -54261,7 +87115,7 @@ mulAvxTwo_9x7_end: RET // func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -54311,57 +87165,105 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB 
Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -54372,45 +87274,93 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 
ADDQ $0x20, DI @@ -54421,45 +87371,93 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -54470,45 +87468,93 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -54519,45 +87565,93 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, 
Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -54568,45 +87662,93 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -54617,45 +87759,93 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, 
Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 @@ -54666,45 +87856,93 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -54715,45 +87953,93 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 
- VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -54780,7 +88066,7 @@ mulAvxTwo_9x7Xor_end: RET // func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -54875,51 +88161,106 @@ mulAvxTwo_9x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif 
// Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -54930,51 +88271,106 @@ mulAvxTwo_9x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -54985,51 +88381,106 @@ mulAvxTwo_9x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), 
Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -55040,51 +88491,106 @@ mulAvxTwo_9x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -55095,51 +88601,106 @@ mulAvxTwo_9x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, 
Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -55150,51 +88711,106 @@ mulAvxTwo_9x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 @@ -55205,51 +88821,106 @@ mulAvxTwo_9x8_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR 
Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -55260,51 +88931,106 @@ mulAvxTwo_9x8_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -55333,7 +89059,7 @@ mulAvxTwo_9x8_end: RET // func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, 
SSE2 TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -55383,65 +89109,120 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -55452,51 +89233,106 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, 
Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -55507,51 +89343,106 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -55562,51 +89453,106 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 
1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -55617,51 +89563,106 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -55672,51 +89673,106 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), 
Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -55727,51 +89783,106 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + 
VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 @@ -55782,51 +89893,106 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -55837,51 +90003,106 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, 
Y5, Y5 + +#endif VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -55910,7 +90131,7 @@ mulAvxTwo_9x8Xor_end: RET // func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -56010,57 +90231,119 @@ mulAvxTwo_9x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -56071,57 +90354,119 @@ mulAvxTwo_9x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -56132,57 +90477,119 @@ mulAvxTwo_9x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, 
Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -56193,57 +90600,119 @@ mulAvxTwo_9x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -56254,57 +90723,119 @@ mulAvxTwo_9x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 
- VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -56315,57 +90846,119 @@ mulAvxTwo_9x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 @@ -56376,57 +90969,119 @@ mulAvxTwo_9x9_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -56437,57 +91092,119 @@ mulAvxTwo_9x9_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 
VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -56518,7 +91235,7 @@ mulAvxTwo_9x9_end: RET // func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -56568,73 +91285,135 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, 
Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -56645,57 +91424,119 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -56706,57 +91547,119 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -56767,57 +91670,119 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + 
+#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -56828,57 +91793,119 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -56889,57 +91916,119 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, 
Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -56950,57 +92039,119 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 @@ -57011,57 +92162,119 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -57072,57 +92285,119 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, 
Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -57153,7 +92428,7 @@ mulAvxTwo_9x9Xor_end: RET // func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -57258,63 +92533,132 @@ mulAvxTwo_9x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 
VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -57325,63 +92669,132 @@ mulAvxTwo_9x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -57392,63 +92805,132 @@ mulAvxTwo_9x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + 
+#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -57459,63 +92941,132 @@ mulAvxTwo_9x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 
2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -57526,63 +93077,132 @@ mulAvxTwo_9x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 
VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -57593,63 +93213,132 @@ mulAvxTwo_9x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 @@ -57660,63 +93349,132 @@ mulAvxTwo_9x10_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, 
Y2 + +#endif VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -57727,63 +93485,132 @@ mulAvxTwo_9x10_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 5568(CX), Y11 
VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -57816,7 +93643,7 @@ mulAvxTwo_9x10_end: RET // func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -57866,81 +93693,150 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD 
$0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif MOVQ 216(R13), R15 VMOVDQU (R15)(R14*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -57951,63 +93847,132 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -58018,63 +93983,132 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB 
Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -58085,63 +94119,132 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 
- VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -58152,63 +94255,132 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -58219,63 +94391,132 @@ 
mulAvxTwo_9x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -58286,63 +94527,132 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4096(CX), Y11 VMOVDQU 
4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 @@ -58353,63 +94663,132 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, 
Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -58420,63 +94799,132 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -58509,7 +94957,7 @@ mulAvxTwo_9x10Xor_end: RET // func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -58575,9 +95023,15 @@ mulAvxTwo_10x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + 
VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -58588,9 +95042,15 @@ mulAvxTwo_10x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -58601,9 +95061,15 @@ mulAvxTwo_10x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -58614,9 +95080,15 @@ mulAvxTwo_10x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -58627,9 +95099,15 @@ mulAvxTwo_10x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -58640,9 +95118,15 @@ mulAvxTwo_10x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 @@ -58653,9 +95137,15 @@ mulAvxTwo_10x1_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (R13), Y4 ADDQ $0x20, R13 @@ -58666,9 +95156,15 @@ mulAvxTwo_10x1_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 9 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -58679,9 +95175,15 @@ mulAvxTwo_10x1_loop: VMOVDQU 608(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Store 1 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -58695,7 +95197,7 @@ mulAvxTwo_10x1_end: RET // func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -58775,11 +95277,23 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 
1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -58796,11 +95310,23 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -58817,11 +95343,23 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -58838,11 +95376,23 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -58859,11 +95409,23 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -58880,11 +95442,23 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 @@ -58901,11 +95475,23 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (R13), Y6 VMOVDQU 32(R13), Y5 @@ -58922,11 +95508,23 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 9 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -58943,11 +95541,23 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, 
Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -58962,7 +95572,7 @@ mulAvxTwo_10x1_64_end: RET // func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -59017,9 +95627,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI @@ -59030,9 +95646,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -59043,9 +95665,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -59056,9 +95684,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -59069,9 +95703,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -59082,9 +95722,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -59095,9 +95741,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 @@ -59108,9 +95760,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (R13), Y4 ADDQ $0x20, R13 @@ -59121,9 +95779,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, 
Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Load and process 32 bytes from input 9 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -59134,9 +95798,15 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 608(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y2, Y3, Y0 + +#else + VPXOR Y2, Y0, Y0 + VPXOR Y3, Y0, Y0 + +#endif // Store 1 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -59150,7 +95820,7 @@ mulAvxTwo_10x1Xor_end: RET // func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -59215,11 +95885,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -59236,11 +95918,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -59257,11 +95951,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -59278,11 +95984,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -59299,11 +96017,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -59320,11 +96050,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 
+ VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -59341,11 +96083,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 @@ -59362,11 +96116,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (R13), Y6 VMOVDQU 32(R13), Y5 @@ -59383,11 +96149,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Load and process 64 bytes from input 9 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -59404,11 +96182,23 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif // Store 1 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -59423,7 +96213,7 @@ mulAvxTwo_10x1_64Xor_end: RET // func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -59496,15 +96286,28 @@ mulAvxTwo_10x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -59515,15 +96318,28 @@ mulAvxTwo_10x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 
bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -59534,15 +96350,28 @@ mulAvxTwo_10x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -59553,15 +96382,28 @@ mulAvxTwo_10x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -59572,15 +96414,28 @@ mulAvxTwo_10x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -59591,15 +96446,28 @@ mulAvxTwo_10x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 @@ -59610,15 +96478,28 @@ mulAvxTwo_10x2_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (R13), Y5 ADDQ $0x20, R13 @@ -59629,15 +96510,28 @@ mulAvxTwo_10x2_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 9 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -59648,15 +96542,28 @@ mulAvxTwo_10x2_loop: VMOVDQU 1184(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - 
VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y3 VMOVDQU 1248(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R15) ADDQ $0x20, R15 @@ -59672,7 +96579,7 @@ mulAvxTwo_10x2_end: RET // func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -59762,21 +96669,46 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -59793,21 +96725,46 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -59824,21 +96781,46 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -59855,21 +96837,46 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, 
Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -59886,21 +96893,46 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -59917,21 +96949,46 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 @@ -59948,21 +97005,46 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (R13), Y9 VMOVDQU 32(R13), Y11 @@ -59979,21 +97061,46 @@ 
mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -60010,21 +97117,46 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R15) VMOVDQU Y1, 32(R15) @@ -60042,7 +97174,7 @@ mulAvxTwo_10x2_64_end: RET // func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -60099,16 +97231,29 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -60119,15 +97264,28 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -60138,15 +97296,28 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 
- VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -60157,15 +97328,28 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -60176,15 +97360,28 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -60195,15 +97392,28 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -60214,15 +97424,28 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 @@ -60233,15 +97456,28 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (R13), Y5 ADDQ $0x20, R13 @@ -60252,15 +97488,28 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Load 
and process 32 bytes from input 9 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -60271,15 +97520,28 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 1184(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y0 + +#else + VPXOR Y3, Y0, Y0 + VPXOR Y4, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y3 VMOVDQU 1248(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y3, Y4, Y1 + +#else + VPXOR Y3, Y1, Y1 + VPXOR Y4, Y1, Y1 + +#endif // Store 2 outputs VMOVDQU Y0, (R15) ADDQ $0x20, R15 @@ -60295,7 +97557,7 @@ mulAvxTwo_10x2Xor_end: RET // func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -60364,21 +97626,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -60395,21 +97682,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -60426,21 +97738,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + 
+#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -60457,21 +97794,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -60488,21 +97850,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -60519,21 +97906,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -60550,21 +97962,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 @@ -60581,21 +98018,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (R13), Y9 VMOVDQU 32(R13), Y11 @@ -60612,21 +98074,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -60643,21 +98130,46 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif // Store 2 outputs VMOVDQU Y0, (R15) VMOVDQU Y1, 32(R15) @@ -60675,7 +98187,7 @@ mulAvxTwo_10x2_64Xor_end: RET // func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -60757,21 +98269,41 @@ mulAvxTwo_10x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, 
Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -60782,21 +98314,41 @@ mulAvxTwo_10x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -60807,21 +98359,41 @@ mulAvxTwo_10x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -60832,21 +98404,41 @@ mulAvxTwo_10x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -60857,21 +98449,41 @@ mulAvxTwo_10x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU 
(R10), Y6 ADDQ $0x20, R10 @@ -60882,21 +98494,41 @@ mulAvxTwo_10x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -60907,21 +98539,41 @@ mulAvxTwo_10x3_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 @@ -60932,21 +98584,41 @@ mulAvxTwo_10x3_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 9 to 3 outputs VMOVDQU (AX), Y6 ADDQ $0x20, AX @@ -60957,21 +98629,41 @@ mulAvxTwo_10x3_loop: VMOVDQU 1760(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y4 VMOVDQU 1824(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y4 VMOVDQU 1888(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -60989,7 +98681,7 @@ mulAvxTwo_10x3_end: RET // func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -61093,31 +98785,69 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 
VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -61134,31 +98864,69 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -61175,31 +98943,69 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - 
VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -61216,31 +99022,69 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -61257,31 +99101,69 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -61298,31 +99180,69 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, 
Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -61339,31 +99259,69 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 @@ -61380,31 +99338,69 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 @@ -61421,31 +99417,69 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB 
Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -61466,7 +99500,7 @@ mulAvxTwo_10x3_64_end: RET // func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -61527,23 +99561,43 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU (R15), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -61554,21 +99608,41 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -61579,21 +99653,41 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 
+ VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -61604,21 +99698,41 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -61629,21 +99743,41 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -61654,21 +99788,41 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -61679,21 +99833,41 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R11), Y6 
ADDQ $0x20, R11 @@ -61704,21 +99878,41 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 @@ -61729,21 +99923,41 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Load and process 32 bytes from input 9 to 3 outputs VMOVDQU (AX), Y6 ADDQ $0x20, AX @@ -61754,21 +99968,41 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1760(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y0 + +#else + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y4 VMOVDQU 1824(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y1 + +#else + VPXOR Y4, Y1, Y1 + VPXOR Y5, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y4 VMOVDQU 1888(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y4, Y5, Y2 + +#else + VPXOR Y4, Y2, Y2 + VPXOR Y5, Y2, Y2 + +#endif // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -61786,7 +100020,7 @@ mulAvxTwo_10x3Xor_end: RET // func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -61863,31 +100097,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 128(CX), Y7 VMOVDQU 
160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 @@ -61904,31 +100176,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -61945,31 +100255,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -61986,31 +100334,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif 
VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -62027,31 +100413,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -62068,31 +100492,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 6 to 3 outputs 
VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -62109,31 +100571,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -62150,31 +100650,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 @@ -62191,31 +100729,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 
1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 @@ -62232,31 +100808,69 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -62277,7 +100891,7 @@ mulAvxTwo_10x3_64Xor_end: RET // func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -62354,27 +100968,54 @@ mulAvxTwo_10x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -62385,27 +101026,54 @@ mulAvxTwo_10x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, 
Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -62416,27 +101084,54 @@ mulAvxTwo_10x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -62447,27 +101142,54 @@ mulAvxTwo_10x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -62478,27 +101200,54 @@ mulAvxTwo_10x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + 
+#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -62509,27 +101258,54 @@ mulAvxTwo_10x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y7 ADDQ $0x20, R12 @@ -62540,27 +101316,54 @@ mulAvxTwo_10x4_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y7 ADDQ $0x20, R13 @@ -62571,27 +101374,54 @@ mulAvxTwo_10x4_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -62602,27 +101432,54 @@ mulAvxTwo_10x4_loop: VMOVDQU 2336(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y5 VMOVDQU 2400(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 
+ VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y5 VMOVDQU 2464(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y5 VMOVDQU 2528(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -62643,7 +101500,7 @@ mulAvxTwo_10x4_end: RET // func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -62695,33 +101552,60 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -62732,27 +101616,54 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -62763,27 +101674,54 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif 
VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -62794,27 +101732,54 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -62825,27 +101790,54 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -62856,27 +101848,54 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 6 
to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -62887,27 +101906,54 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y7 ADDQ $0x20, R12 @@ -62918,27 +101964,54 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y7 ADDQ $0x20, R13 @@ -62949,27 +102022,54 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + +#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -62980,27 +102080,54 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 2336(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y0 + +#else + VPXOR Y5, Y0, Y0 + VPXOR Y6, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y5 VMOVDQU 2400(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y1 + 
+#else + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y5 VMOVDQU 2464(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +#else + VPXOR Y5, Y2, Y2 + VPXOR Y6, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y5 VMOVDQU 2528(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y5, Y6, Y3 + +#else + VPXOR Y5, Y3, Y3 + VPXOR Y6, Y3, Y3 + +#endif // Store 4 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -63021,7 +102148,7 @@ mulAvxTwo_10x4Xor_end: RET // func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -63103,33 +102230,67 @@ mulAvxTwo_10x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -63140,33 +102301,67 @@ mulAvxTwo_10x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -63177,33 +102372,67 @@ mulAvxTwo_10x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 
+ +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -63214,33 +102443,67 @@ mulAvxTwo_10x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -63251,33 +102514,67 @@ mulAvxTwo_10x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 @@ 
-63288,33 +102585,67 @@ mulAvxTwo_10x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 @@ -63325,33 +102656,67 @@ mulAvxTwo_10x5_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y8 ADDQ $0x20, R13 @@ -63362,33 +102727,67 @@ mulAvxTwo_10x5_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -63399,33 +102798,67 @@ mulAvxTwo_10x5_loop: VMOVDQU 2912(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y6 VMOVDQU 2976(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y6 VMOVDQU 3040(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y6 VMOVDQU 3104(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y6 VMOVDQU 3168(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -63448,7 +102881,7 @@ mulAvxTwo_10x5_end: RET // func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -63500,41 +102933,75 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -63545,33 +103012,67 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, 
Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -63582,33 +103083,67 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -63619,33 +103154,67 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -63656,33 +103225,67 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, 
Y0, Y0 + +#endif VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -63693,33 +103296,67 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 @@ -63730,33 +103367,67 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 @@ -63767,33 +103438,67 @@ 
mulAvxTwo_10x5Xor_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y8 ADDQ $0x20, R13 @@ -63804,33 +103509,67 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -63841,33 +103580,67 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 2912(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y0 + +#else + VPXOR Y6, Y0, Y0 + VPXOR Y7, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y6 VMOVDQU 2976(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y1 + +#else + VPXOR Y6, Y1, Y1 + VPXOR Y7, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y6 VMOVDQU 3040(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y2 + +#else + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y6 VMOVDQU 3104(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y6, Y7, Y3 + +#else + VPXOR Y6, Y3, Y3 + VPXOR Y7, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y6 VMOVDQU 3168(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y6, Y7, Y4 + +#else + VPXOR Y6, Y4, Y4 + VPXOR Y7, Y4, Y4 + +#endif // Store 5 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -63890,7 +103663,7 @@ mulAvxTwo_10x5Xor_end: RET // func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -63977,39 +103750,80 @@ mulAvxTwo_10x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -64020,39 +103834,80 @@ mulAvxTwo_10x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -64063,39 +103918,80 @@ mulAvxTwo_10x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -64106,39 +104002,80 @@ mulAvxTwo_10x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -64149,39 +104086,80 @@ mulAvxTwo_10x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 
2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -64192,39 +104170,80 @@ mulAvxTwo_10x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 @@ -64235,39 +104254,80 @@ mulAvxTwo_10x6_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, 
Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y9 ADDQ $0x20, R13 @@ -64278,39 +104338,80 @@ mulAvxTwo_10x6_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -64321,39 +104422,80 @@ mulAvxTwo_10x6_loop: VMOVDQU 3488(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y7 VMOVDQU 3552(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y7 VMOVDQU 3616(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y7 VMOVDQU 3680(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y7 VMOVDQU 3744(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y7 VMOVDQU 3808(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -64378,7 +104520,7 @@ mulAvxTwo_10x6_end: RET // func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -64430,49 +104572,90 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -64483,39 +104666,80 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -64526,39 +104750,80 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, 
Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -64569,39 +104834,80 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -64612,39 +104918,80 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR 
Y8, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -64655,39 +105002,80 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -64698,39 +105086,80 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 @@ -64741,39 +105170,80 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 
+ +#endif VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y9 ADDQ $0x20, R13 @@ -64784,39 +105254,80 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -64827,39 +105338,80 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 3488(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y0 + +#else + VPXOR Y7, Y0, Y0 + VPXOR Y8, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y7 VMOVDQU 3552(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y1 + +#else + VPXOR Y7, Y1, Y1 + VPXOR Y8, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y7 VMOVDQU 3616(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y2 + +#else + VPXOR Y7, Y2, Y2 + VPXOR Y8, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y7 VMOVDQU 3680(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y3 + +#else + VPXOR Y7, Y3, Y3 + VPXOR Y8, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y7 VMOVDQU 3744(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y4 + +#else + VPXOR Y7, Y4, Y4 + VPXOR Y8, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y7 VMOVDQU 3808(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y7, Y8, Y5 + +#else + VPXOR Y7, Y5, Y5 + VPXOR Y8, Y5, Y5 + +#endif // Store 6 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -64884,7 +105436,7 @@ mulAvxTwo_10x6Xor_end: RET // func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -64976,45 +105528,93 @@ mulAvxTwo_10x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -65025,45 +105625,93 @@ mulAvxTwo_10x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + 
VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -65074,45 +105722,93 @@ mulAvxTwo_10x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -65123,45 +105819,93 @@ mulAvxTwo_10x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, 
Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -65172,45 +105916,93 @@ mulAvxTwo_10x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -65221,45 +106013,93 @@ mulAvxTwo_10x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, 
Y4, Y4 + +#endif VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 @@ -65270,45 +106110,93 @@ mulAvxTwo_10x7_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y10 ADDQ $0x20, R13 @@ -65319,45 +106207,93 @@ mulAvxTwo_10x7_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 
- VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -65368,45 +106304,93 @@ mulAvxTwo_10x7_loop: VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 4096(CX), Y8 VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 4160(CX), Y8 VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 4224(CX), Y8 VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 4288(CX), Y8 VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 4352(CX), Y8 VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 4416(CX), Y8 VMOVDQU 4448(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -65433,7 +106417,7 @@ mulAvxTwo_10x7_end: RET // func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -65485,57 +106469,105 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 
VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -65546,45 +106578,93 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -65595,45 +106675,93 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, 
Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -65644,45 +106772,93 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -65693,45 +106869,93 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 
+ +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -65742,45 +106966,93 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -65791,45 +107063,93 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, 
Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 @@ -65840,45 +107160,93 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y10 ADDQ $0x20, R13 @@ -65889,45 +107257,93 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, 
Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -65938,45 +107354,93 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y0 + +#else + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y0 + +#endif VMOVDQU 4096(CX), Y8 VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y1 + +#else + VPXOR Y8, Y1, Y1 + VPXOR Y9, Y1, Y1 + +#endif VMOVDQU 4160(CX), Y8 VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y2 + +#else + VPXOR Y8, Y2, Y2 + VPXOR Y9, Y2, Y2 + +#endif VMOVDQU 4224(CX), Y8 VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y3 + +#else + VPXOR Y8, Y3, Y3 + VPXOR Y9, Y3, Y3 + +#endif VMOVDQU 4288(CX), Y8 VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y4 + +#else + VPXOR Y8, Y4, Y4 + VPXOR Y9, Y4, Y4 + +#endif VMOVDQU 4352(CX), Y8 VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y5 + +#else + VPXOR Y8, Y5, Y5 + VPXOR Y9, Y5, Y5 + +#endif VMOVDQU 4416(CX), Y8 VMOVDQU 4448(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y8, Y9, Y6 + +#else + VPXOR Y8, Y6, Y6 + VPXOR Y9, Y6, Y6 + +#endif // Store 7 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -66003,7 +107467,7 @@ mulAvxTwo_10x7Xor_end: RET // func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -66100,51 +107564,106 @@ mulAvxTwo_10x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + 
VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -66155,51 +107674,106 @@ mulAvxTwo_10x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -66210,51 +107784,106 @@ mulAvxTwo_10x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 
1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -66265,51 +107894,106 @@ mulAvxTwo_10x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -66320,51 +108004,106 @@ mulAvxTwo_10x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 
VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -66375,51 +108114,106 @@ mulAvxTwo_10x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR 
Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 @@ -66430,51 +108224,106 @@ mulAvxTwo_10x8_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y11 ADDQ $0x20, R13 @@ -66485,51 +108334,106 @@ mulAvxTwo_10x8_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 
+ +#endif VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -66540,51 +108444,106 @@ mulAvxTwo_10x8_loop: VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 4672(CX), Y9 VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 4736(CX), Y9 VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 4800(CX), Y9 VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 4864(CX), Y9 VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 4928(CX), Y9 VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 4992(CX), Y9 VMOVDQU 5024(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 5056(CX), Y9 VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -66613,7 +108572,7 @@ mulAvxTwo_10x8_end: RET // func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -66665,65 +108624,120 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + 
+#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -66734,51 +108748,106 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ 
-66789,51 +108858,106 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -66844,51 +108968,106 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, 
Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -66899,51 +109078,106 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -66954,51 +109188,106 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 
+ VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -67009,51 +109298,106 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 @@ -67064,51 +109408,106 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, 
Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y11 ADDQ $0x20, R13 @@ -67119,51 +109518,106 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -67174,51 +109628,106 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y0 + +#else + VPXOR Y9, 
Y0, Y0 + VPXOR Y10, Y0, Y0 + +#endif VMOVDQU 4672(CX), Y9 VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y1 + +#else + VPXOR Y9, Y1, Y1 + VPXOR Y10, Y1, Y1 + +#endif VMOVDQU 4736(CX), Y9 VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +#else + VPXOR Y9, Y2, Y2 + VPXOR Y10, Y2, Y2 + +#endif VMOVDQU 4800(CX), Y9 VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y3 + +#else + VPXOR Y9, Y3, Y3 + VPXOR Y10, Y3, Y3 + +#endif VMOVDQU 4864(CX), Y9 VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +#else + VPXOR Y9, Y4, Y4 + VPXOR Y10, Y4, Y4 + +#endif VMOVDQU 4928(CX), Y9 VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y5 + +#else + VPXOR Y9, Y5, Y5 + VPXOR Y10, Y5, Y5 + +#endif VMOVDQU 4992(CX), Y9 VMOVDQU 5024(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +#else + VPXOR Y9, Y6, Y6 + VPXOR Y10, Y6, Y6 + +#endif VMOVDQU 5056(CX), Y9 VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y9, Y10, Y7 + +#else + VPXOR Y9, Y7, Y7 + VPXOR Y10, Y7, Y7 + +#endif // Store 8 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -67247,7 +109756,7 @@ mulAvxTwo_10x8Xor_end: RET // func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -67349,57 +109858,119 @@ mulAvxTwo_10x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 
VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -67410,57 +109981,119 @@ mulAvxTwo_10x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -67471,57 +110104,119 @@ mulAvxTwo_10x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else 
+ VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -67532,57 +110227,119 @@ mulAvxTwo_10x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 
+ +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -67593,57 +110350,119 @@ mulAvxTwo_10x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -67654,57 +110473,119 @@ mulAvxTwo_10x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 @@ -67715,57 +110596,119 @@ mulAvxTwo_10x9_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + 
+#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y12 ADDQ $0x20, R13 @@ -67776,57 +110719,119 @@ mulAvxTwo_10x9_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -67837,57 +110842,119 @@ mulAvxTwo_10x9_loop: VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 5248(CX), Y10 VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 5312(CX), Y10 VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 5376(CX), Y10 VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 
- VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 5440(CX), Y10 VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 5504(CX), Y10 VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 5568(CX), Y10 VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 5632(CX), Y10 VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 5696(CX), Y10 VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -67918,7 +110985,7 @@ mulAvxTwo_10x9_end: RET // func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -67970,73 +111037,135 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 
- VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -68047,57 +111176,119 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -68108,57 +111299,119 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, 
Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -68169,57 +111422,119 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif 
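	// Note on the repeated pattern above and below: VPTERNLOGD with immediate
	// $0x96 evaluates the three-input boolean function A XOR B XOR C, so the
	// GOAMD64_v4 branch folds the two VPXOR instructions of the #else branch
	// into a single op (per 32-bit lane the destination becomes
	// dst ^ Y10 ^ Y11, i.e. dst xored with both shuffled table lookups).
	// The 256-bit register form requires AVX512F plus AVX512VL, and the
	// GOAMD64_v4 macro is assumed to be defined by the Go assembler when
	// building with GOAMD64=v4 (Go 1.18 or later); otherwise the plain AVX2
	// fallback assembles.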
VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -68230,57 +111545,119 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -68291,57 +111668,119 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + 
VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -68352,57 +111791,119 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + 
+#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 @@ -68413,57 +111914,119 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y12 ADDQ $0x20, R13 @@ -68474,57 +112037,119 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, 
Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -68535,57 +112160,119 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y0 + +#else + VPXOR Y10, Y0, Y0 + VPXOR Y11, Y0, Y0 + +#endif VMOVDQU 5248(CX), Y10 VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y1 + +#else + VPXOR Y10, Y1, Y1 + VPXOR Y11, Y1, Y1 + +#endif VMOVDQU 5312(CX), Y10 VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y2 + +#else + VPXOR Y10, Y2, Y2 + VPXOR Y11, Y2, Y2 + +#endif VMOVDQU 5376(CX), Y10 VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y3 + +#else + VPXOR Y10, Y3, Y3 + VPXOR Y11, Y3, Y3 + +#endif VMOVDQU 5440(CX), Y10 VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y4 + +#else + VPXOR Y10, Y4, Y4 + VPXOR Y11, Y4, Y4 + +#endif VMOVDQU 5504(CX), Y10 VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y5 + +#else + VPXOR Y10, Y5, Y5 + VPXOR Y11, Y5, Y5 + +#endif VMOVDQU 5568(CX), Y10 VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y6 + +#else + VPXOR Y10, Y6, Y6 + VPXOR Y11, Y6, Y6 + +#endif VMOVDQU 5632(CX), Y10 VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y7 + +#else + VPXOR Y10, Y7, Y7 + VPXOR Y11, Y7, Y7 + +#endif VMOVDQU 5696(CX), Y10 VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 +#ifdef 
GOAMD64_v4 + VPTERNLOGD $0x96, Y10, Y11, Y8 + +#else + VPXOR Y10, Y8, Y8 + VPXOR Y11, Y8, Y8 + +#endif // Store 9 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -68616,7 +112303,7 @@ mulAvxTwo_10x9Xor_end: RET // func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -68723,63 +112410,132 @@ mulAvxTwo_10x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -68790,63 +112546,132 @@ mulAvxTwo_10x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 
VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -68857,63 +112682,132 @@ mulAvxTwo_10x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, 
Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -68924,63 +112818,132 @@ mulAvxTwo_10x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ 
-68991,63 +112954,132 @@ mulAvxTwo_10x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -69058,63 +113090,132 @@ mulAvxTwo_10x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 
4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 @@ -69125,63 +113226,132 @@ mulAvxTwo_10x10_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 
VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y13 ADDQ $0x20, R13 @@ -69192,63 +113362,132 @@ mulAvxTwo_10x10_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -69259,63 +113498,132 @@ mulAvxTwo_10x10_loop: VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 5824(CX), Y11 VMOVDQU 5856(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, 
Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 5888(CX), Y11 VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 5952(CX), Y11 VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 6016(CX), Y11 VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 6080(CX), Y11 VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 6144(CX), Y11 VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 6208(CX), Y11 VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 6272(CX), Y11 VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 6336(CX), Y11 VMOVDQU 6368(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -69348,7 +113656,7 @@ mulAvxTwo_10x10_end: RET // func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -69400,81 +113708,150 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 
288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif MOVQ 216(R14), BP VMOVDQU (BP)(R15*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -69485,63 +113862,132 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR 
Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -69552,63 +113998,132 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -69619,63 +114134,132 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 1984(CX), Y11 VMOVDQU 
2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -69686,63 +114270,132 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, 
Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -69753,63 +114406,132 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - 
VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -69820,63 +114542,132 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 @@ -69887,63 +114678,132 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif 
VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y13 ADDQ $0x20, R13 @@ -69954,63 +114814,132 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 5568(CX), Y11 VMOVDQU 
5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -70021,63 +114950,132 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y0 + +#else + VPXOR Y11, Y0, Y0 + VPXOR Y12, Y0, Y0 + +#endif VMOVDQU 5824(CX), Y11 VMOVDQU 5856(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y1 + +#else + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y1, Y1 + +#endif VMOVDQU 5888(CX), Y11 VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y2 + +#else + VPXOR Y11, Y2, Y2 + VPXOR Y12, Y2, Y2 + +#endif VMOVDQU 5952(CX), Y11 VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y3 + +#else + VPXOR Y11, Y3, Y3 + VPXOR Y12, Y3, Y3 + +#endif VMOVDQU 6016(CX), Y11 VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y4 + +#else + VPXOR Y11, Y4, Y4 + VPXOR Y12, Y4, Y4 + +#endif VMOVDQU 6080(CX), Y11 VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y5 + +#else + VPXOR Y11, Y5, Y5 + VPXOR Y12, Y5, Y5 + +#endif VMOVDQU 6144(CX), Y11 VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y6 + +#else + VPXOR Y11, Y6, Y6 + VPXOR Y12, Y6, Y6 + +#endif VMOVDQU 6208(CX), Y11 VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y7 + +#else + VPXOR Y11, Y7, Y7 + VPXOR Y12, Y7, Y7 + +#endif VMOVDQU 6272(CX), Y11 VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y8 + +#else + VPXOR Y11, Y8, Y8 + VPXOR Y12, Y8, Y8 + +#endif VMOVDQU 6336(CX), Y11 VMOVDQU 6368(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 +#ifdef GOAMD64_v4 + VPTERNLOGD $0x96, Y11, Y12, Y9 + +#else + VPXOR Y11, Y9, Y9 + VPXOR Y12, Y9, Y9 + +#endif // Store 10 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) diff --git a/go.mod b/go.mod index 4920a67..6a4ad8e 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,5 @@ module github.com/klauspost/reedsolomon -go 1.14 +go 1.15 -require github.com/klauspost/cpuid/v2 v2.0.6 +require github.com/klauspost/cpuid/v2 
v2.0.11
diff --git a/go.sum b/go.sum
index 5b8b0f4..d323bdf 100644
--- a/go.sum
+++ b/go.sum
@@ -1,2 +1,2 @@
-github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI=
-github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
+github.com/klauspost/cpuid/v2 v2.0.11 h1:i2lw1Pm7Yi/4O6XCSyJWqEHI2MDw2FzUK6o/D21xn2A=
+github.com/klauspost/cpuid/v2 v2.0.11/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c=
diff --git a/testlevel.go b/testlevel.go
new file mode 100644
index 0000000..a905748
--- /dev/null
+++ b/testlevel.go
@@ -0,0 +1,31 @@
+//go:build ignore
+// +build ignore
+
+package main
+
+import (
+	"flag"
+	"log"
+	"strconv"
+
+	"github.com/klauspost/cpuid/v2"
+)
+
+func main() {
+	flag.Parse()
+	args := flag.Args()
+	if len(args) != 1 {
+		log.Fatalln("Supply CPU level 1-4 to test as argument")
+	}
+	l, err := strconv.Atoi(args[0])
+	if err != nil {
+		log.Fatalln("Unable to parse level:", err)
+	}
+	if l < 1 || l > 4 {
+		log.Fatalln("Supply CPU level 1-4 to test as argument")
+	}
+	if cpuid.CPU.X64Level() < l {
+		// Does os.Exit(1)
+		log.Fatalln("CPU level not supported")
+	}
+}
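Aside (illustrative, not part of the diff above): throughout the regenerated assembly, each two-instruction VPXOR fallback is replaced on GOAMD64_v4 by a single VPTERNLOGD with immediate $0x96, which is the 8-bit truth table of a three-way XOR. The small stand-alone Go program below (a hypothetical file, e.g. ternlog_sketch.go; names chosen here for illustration) derives that immediate for f(a, b, c) = a ^ b ^ c and spot-checks bit by bit that applying the 0x96 table matches chaining two XORs, relying only on the documented per-bit semantics of VPTERNLOGD.

// ternlog_sketch.go (illustrative sketch; not part of the generated code)
package main

import "fmt"

// ternlog applies an 8-bit truth table imm to three 64-bit operands bit by
// bit, mirroring what VPTERNLOGD does in each bit lane.
func ternlog(imm uint8, a, b, c uint64) uint64 {
	var out uint64
	for i := 0; i < 64; i++ {
		idx := (a>>i&1)<<2 | (b>>i&1)<<1 | (c >> i & 1)
		out |= uint64(imm>>idx&1) << i
	}
	return out
}

func main() {
	// Build the immediate for f(a, b, c) = a ^ b ^ c: bit idx of the
	// immediate holds f evaluated on the bits of idx.
	var imm uint8
	for idx := 0; idx < 8; idx++ {
		a, b, c := idx>>2&1, idx>>1&1, idx&1
		imm |= uint8(a^b^c) << idx
	}
	fmt.Printf("three-way XOR immediate: %#x\n", imm) // prints 0x96

	// Spot-check: the truth-table form matches the two-instruction fallback
	// (dst = in1 ^ dst; dst = in2 ^ dst) applied to scalar words.
	in1, in2, dst := uint64(0x0123456789abcdef), uint64(0xfedcba9876543210), uint64(0xdeadbeefcafef00d)
	if ternlog(imm, in1, in2, dst) != in1^in2^dst {
		panic("0x96 truth table does not compute a^b^c")
	}
	fmt.Println("VPTERNLOGD $0x96 matches two chained VPXORs")
}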