//go:build generate
// +build generate

package main

import (
	"fmt"

	"github.com/mmcloughlin/avo/attr"
	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
	"github.com/mmcloughlin/avo/reg"
)
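
// table256 represents a pair of 256-bit lookup tables. They are either
// already materialized in Lo/Hi, or prepare() will load them from a
// 128-bit memory location (broadcast to both lanes), a 256-bit memory
// location, or a spare ZMM register (AVX-512 only).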
type table256 struct {
	Lo, Hi               Op
	loadLo128, loadHi128 *Mem
	loadLo256, loadHi256 *Mem
	useZmmLo, useZmmHi   *reg.VecPhysical
}

func (t *table256) prepare() {
	t.prepareLo()
	t.prepareHi()
}
func (t *table256) prepareHi() {
	if t.loadHi128 != nil {
		t.Hi = YMM()
		// Load and expand tables
		VBROADCASTI128(*t.loadHi128, t.Hi)
	}
	if t.loadHi256 != nil {
		t.Hi = YMM()
		// Load and expand tables
		VMOVDQU(*t.loadHi256, t.Hi)
	}
	if t.useZmmHi != nil {
		r := *t.useZmmHi
		t.Hi = r.AsY()
	}
}

func (t *table256) prepareLo() {
	if t.loadLo128 != nil {
		t.Lo = YMM()
		// Load and expand tables
		VBROADCASTI128(*t.loadLo128, t.Lo)
	}
	if t.loadLo256 != nil {
		t.Lo = YMM()
		// Load and expand tables
		VMOVDQU(*t.loadLo256, t.Lo)
	}
	if t.useZmmLo != nil {
		r := *t.useZmmLo
		t.Lo = r.AsY()
	}
}
// table128 contains memory pointers to tables
type table128 struct {
	Lo, Hi Op
}

type gf16ctx struct {
	clrMask    reg.VecVirtual // 0x0f in every byte (YMM)
	clrMask128 reg.VecVirtual // 0x0f in every byte (XMM)
	avx512     bool
}
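
// genGF16 emits the GF(2^16) kernels for the Leopard-style FFT codec:
// 2-point and 4-point DIT butterflies plus a plain multiply, in SSSE3 and
// AVX2 variants, with AVX-512 variants of the 4-point kernels.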
func genGF16() {
	var ctx gf16ctx
	// Ported from static void IFFT_DIT2
	// https://github.com/catid/leopard/blob/master/LeopardFF16.cpp#L629
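	// A scalar sketch of the inverse butterfly this block emits (hedged;
	// mul is the table-driven multiply implemented by leoMulAdd256):
	//
	//	y[i] ^= x[i]
	//	x[i] ^= mul(y[i])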
	{
		TEXT("ifftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
		Pragma("noescape")
		tablePtr := Load(Param("table"), GP64())
		tables := [4]table256{}
		for i, t := range tables {
			t.Lo, t.Hi = YMM(), YMM()
			// Load and expand tables
			VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
			VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
			tables[i] = t
		}
		bytes := Load(Param("x").Len(), GP64())
		x := Load(Param("x").Base(), GP64())
		y := Load(Param("y").Base(), GP64())
		// Generate mask: 0x0f in every byte.
		ctx.clrMask = YMM()
		tmpMask := GP64()
		MOVQ(U32(15), tmpMask)
		MOVQ(tmpMask, ctx.clrMask.AsX())
		VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)

		xLo, xHi, yLo, yHi := YMM(), YMM(), YMM(), YMM()
		Label("loop")
		VMOVDQU(Mem{Base: x, Disp: 0}, xLo)
		VMOVDQU(Mem{Base: x, Disp: 32}, xHi)
		VMOVDQU(Mem{Base: y, Disp: 0}, yLo)
		VMOVDQU(Mem{Base: y, Disp: 32}, yHi)
		VPXOR(yLo, xLo, yLo)
		VPXOR(yHi, xHi, yHi)
		VMOVDQU(yLo, Mem{Base: y, Disp: 0})
		VMOVDQU(yHi, Mem{Base: y, Disp: 32})
		leoMulAdd256(ctx, xLo, xHi, yLo, yHi, tables)
		VMOVDQU(xLo, Mem{Base: x, Disp: 0})
		VMOVDQU(xHi, Mem{Base: x, Disp: 32})
		ADDQ(U8(64), x)
		ADDQ(U8(64), y)
		SUBQ(U8(64), bytes)
		JNZ(LabelRef("loop"))

		VZEROUPPER()
		RET()
	}
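	// fftDIT2 is the forward butterfly; it applies the same steps in
	// reverse order (sketch): x[i] ^= mul(y[i]), then y[i] ^= x[i].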
	{
		TEXT("fftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
		Pragma("noescape")
		tablePtr := Load(Param("table"), GP64())
		tables := [4]table256{}
		for i, t := range tables {
			t.Lo, t.Hi = YMM(), YMM()
			// Load and expand tables
			VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
			VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
			tables[i] = t
		}
		bytes := Load(Param("x").Len(), GP64())
		x := Load(Param("x").Base(), GP64())
		y := Load(Param("y").Base(), GP64())
		// Generate mask: 0x0f in every byte.
		ctx.clrMask = YMM()
		tmpMask := GP64()
		MOVQ(U32(15), tmpMask)
		MOVQ(tmpMask, ctx.clrMask.AsX())
		VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)

		xLo, xHi, yLo, yHi := YMM(), YMM(), YMM(), YMM()
		Label("loop")
		VMOVDQU(Mem{Base: x, Disp: 0}, xLo)
		VMOVDQU(Mem{Base: x, Disp: 32}, xHi)
		VMOVDQU(Mem{Base: y, Disp: 0}, yLo)
		VMOVDQU(Mem{Base: y, Disp: 32}, yHi)

		leoMulAdd256(ctx, xLo, xHi, yLo, yHi, tables)
		VMOVDQU(xLo, Mem{Base: x, Disp: 0})
		VMOVDQU(xHi, Mem{Base: x, Disp: 32})

		// Reload y, or we would need more than the 16 available registers.
		if true {
			yLo, yHi = YMM(), YMM()
			VMOVDQU(Mem{Base: y, Disp: 0}, yLo)
			VMOVDQU(Mem{Base: y, Disp: 32}, yHi)
		}

		VPXOR(yLo, xLo, yLo)
		VPXOR(yHi, xHi, yHi)
		VMOVDQU(yLo, Mem{Base: y, Disp: 0})
		VMOVDQU(yHi, Mem{Base: y, Disp: 32})
		ADDQ(U8(64), x)
		ADDQ(U8(64), y)
		SUBQ(U8(64), bytes)
		JNZ(LabelRef("loop"))

		VZEROUPPER()
		RET()
	}
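	// mulgf16 is the standalone multiply (sketch): x[i] = mul(y[i]).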
	{
		TEXT("mulgf16_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
		Pragma("noescape")
		tablePtr := Load(Param("table"), GP64())
		tables := [4]table256{}
		for i, t := range tables {
			t.Lo, t.Hi = YMM(), YMM()
			// Load and expand tables
			VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
			VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
			tables[i] = t
		}
		bytes := Load(Param("x").Len(), GP64())
		x := Load(Param("x").Base(), GP64())
		y := Load(Param("y").Base(), GP64())
		// Generate mask: 0x0f in every byte.
		ctx.clrMask = YMM()
		tmpMask := GP64()
		MOVQ(U32(15), tmpMask)
		MOVQ(tmpMask, ctx.clrMask.AsX())
		VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)

		dataLo, dataHi := YMM(), YMM()
		Label("loop")
		VMOVDQU(Mem{Base: y, Disp: 0}, dataLo)
		VMOVDQU(Mem{Base: y, Disp: 32}, dataHi)

		prodLo, prodHi := leoMul256(ctx, dataLo, dataHi, tables)
		VMOVDQU(prodLo, Mem{Base: x, Disp: 0})
		VMOVDQU(prodHi, Mem{Base: x, Disp: 32})

		ADDQ(U8(64), x)
		ADDQ(U8(64), y)
		SUBQ(U8(64), bytes)
		JNZ(LabelRef("loop"))

		VZEROUPPER()
		RET()
	}
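	// The DIT4 kernels below fuse two butterfly layers over four work
	// buffers. Sketch of ifftDIT4 (hedged; each multiply is skipped when
	// the corresponding logMask bit is set, i.e. that log equals kModulus):
	//
	//	w1 ^= w0; if !m01 { w0 ^= mul01(w1) } // layer 1, distance 1
	//	w3 ^= w2; if !m23 { w2 ^= mul23(w3) } // layer 1, distance 1
	//	w2 ^= w0; w3 ^= w1                    // layer 2, distance 2
	//	if !m02 { w0 ^= mul02(w2); w1 ^= mul02(w3) }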
	for _, avx512 := range []bool{true, false} {
		// AVX-512 only uses more registers for tables.
		var suffix = "avx2"
		if avx512 {
			suffix = "avx512"
		}
		ctx.avx512 = avx512
		extZMMs := []reg.VecPhysical{reg.Z16, reg.Z17, reg.Z18, reg.Z19, reg.Z20, reg.Z21, reg.Z22, reg.Z23, reg.Z24, reg.Z25, reg.Z26, reg.Z27, reg.Z28, reg.Z29, reg.Z30, reg.Z31}
		{
			TEXT("ifftDIT4_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, table01 *[8*16]uint8, table23 *[8*16]uint8, table02 *[8*16]uint8, logMask uint8)"))
			Pragma("noescape")
			Comment("dist must be multiplied by 24 (size of slice header)")
			Comment("logMask holds log_m01==kModulus, log_m23==kModulus, log_m02==kModulus in bits 0, 1 and 2")

			// Unpack tables to stack. Slower.
			const unpackTables = false
			table01Ptr := Load(Param("table01"), GP64())
			table23Ptr := Load(Param("table23"), GP64())
			table02Ptr := Load(Param("table02"), GP64())

			// Prepare table pointers.
			table01 := [4]table256{}
			table23 := [4]table256{}
			table02 := [4]table256{}
			if avx512 {
				usedZmm := 0
				fill := func(t *[4]table256, ptr reg.Register) {
					for i := range table01 {
						t := &t[i]
						if len(extZMMs)-usedZmm >= 2 {
							tmpLo, tmpHi := YMM(), YMM()
							t.useZmmLo, t.useZmmHi = &extZMMs[usedZmm], &extZMMs[usedZmm+1]
							usedZmm += 2
							// Load and expand tables
							VBROADCASTI128(Mem{Base: ptr, Disp: i * 16}, tmpLo)
							VBROADCASTI128(Mem{Base: ptr, Disp: i*16 + 16*4}, tmpHi)
							VMOVAPS(tmpLo.AsZ(), *t.useZmmLo)
							VMOVAPS(tmpHi.AsZ(), *t.useZmmHi)
						} else {
							t.loadLo128 = &Mem{Base: ptr, Disp: i * 16}
							t.loadHi128 = &Mem{Base: ptr, Disp: i*16 + 16*4}
						}
					}
				}
				fill(&table02, table02Ptr)
				fill(&table01, table01Ptr)
				fill(&table23, table23Ptr)
			}
			for i := range table01 {
				if avx512 {
					continue
				}

				if unpackTables {
					toStack := func(m Mem) *Mem {
						stack := AllocLocal(32)
						y := YMM()
						VBROADCASTI128(m, y)
						VMOVDQU(y, stack)
						return &stack
					}

					table01[i].loadLo256 = toStack(Mem{Base: table01Ptr, Disp: i * 16})
					table23[i].loadLo256 = toStack(Mem{Base: table23Ptr, Disp: i * 16})
					table02[i].loadLo256 = toStack(Mem{Base: table02Ptr, Disp: i * 16})

					table01[i].loadHi256 = toStack(Mem{Base: table01Ptr, Disp: i*16 + 16*4})
					table23[i].loadHi256 = toStack(Mem{Base: table23Ptr, Disp: i*16 + 16*4})
					table02[i].loadHi256 = toStack(Mem{Base: table02Ptr, Disp: i*16 + 16*4})
				} else {
					table01[i].loadLo128 = &Mem{Base: table01Ptr, Disp: i * 16}
					table23[i].loadLo128 = &Mem{Base: table23Ptr, Disp: i * 16}
					table02[i].loadLo128 = &Mem{Base: table02Ptr, Disp: i * 16}

					table01[i].loadHi128 = &Mem{Base: table01Ptr, Disp: i*16 + 16*4}
					table23[i].loadHi128 = &Mem{Base: table23Ptr, Disp: i*16 + 16*4}
					table02[i].loadHi128 = &Mem{Base: table02Ptr, Disp: i*16 + 16*4}
				}
			}
			// Generate mask: 0x0f in every byte.
			ctx.clrMask = YMM()
			tmpMask := GP64()
			MOVQ(U32(15), tmpMask)
			MOVQ(tmpMask, ctx.clrMask.AsX())
			VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)

			dist := Load(Param("dist"), GP64())

			// Pointers to each "work" slice.
			var work [4]reg.GPVirtual
			workTable := Load(Param("work").Base(), GP64()) // &work[0]
			bytes := GP64()

			// Load length of work[0].
			MOVQ(Mem{Base: workTable, Disp: 8}, bytes)

			offset := GP64()
			XORQ(offset, offset)
			for i := range work {
				work[i] = GP64()
				// work[i] = &work[i*dist][0]
				MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i])
				if i < len(work)-1 {
					ADDQ(dist, offset)
				}
			}
			var workRegLo [4]reg.VecVirtual
			var workRegHi [4]reg.VecVirtual

			workRegLo[0], workRegHi[0] = YMM(), YMM()
			workRegLo[1], workRegHi[1] = YMM(), YMM()

			mask := Load(Param("logMask"), GP64())
			Label("loop")
			VMOVDQU(Mem{Base: work[0], Disp: 0}, workRegLo[0])
			VMOVDQU(Mem{Base: work[0], Disp: 32}, workRegHi[0])
			VMOVDQU(Mem{Base: work[1], Disp: 0}, workRegLo[1])
			VMOVDQU(Mem{Base: work[1], Disp: 32}, workRegHi[1])

			// First layer:
			VPXOR(workRegLo[0], workRegLo[1], workRegLo[1])
			VPXOR(workRegHi[0], workRegHi[1], workRegHi[1])

			// Test bit 0; skip the multiply when log_m01==kModulus.
			BTQ(U8(0), mask)
			JC(LabelRef("skip_m01"))
			leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[1], workRegHi[1], table01)

			Label("skip_m01")
			workRegLo[2], workRegHi[2] = YMM(), YMM()
			workRegLo[3], workRegHi[3] = YMM(), YMM()
			VMOVDQU(Mem{Base: work[2], Disp: 0}, workRegLo[2])
			VMOVDQU(Mem{Base: work[2], Disp: 32}, workRegHi[2])
			VMOVDQU(Mem{Base: work[3], Disp: 0}, workRegLo[3])
			VMOVDQU(Mem{Base: work[3], Disp: 32}, workRegHi[3])

			VPXOR(workRegLo[2], workRegLo[3], workRegLo[3])
			VPXOR(workRegHi[2], workRegHi[3], workRegHi[3])

			// Test bit 1; skip the multiply when log_m23==kModulus.
			BTQ(U8(1), mask)
			JC(LabelRef("skip_m23"))
			leoMulAdd256(ctx, workRegLo[2], workRegHi[2], workRegLo[3], workRegHi[3], table23)
			Label("skip_m23")

			// Second layer:
			VPXOR(workRegLo[0], workRegLo[2], workRegLo[2])
			VPXOR(workRegHi[0], workRegHi[2], workRegHi[2])
			VPXOR(workRegLo[1], workRegLo[3], workRegLo[3])
			VPXOR(workRegHi[1], workRegHi[3], workRegHi[3])

			// Test bit 2; skip the multiplies when log_m02==kModulus.
			BTQ(U8(2), mask)
			JC(LabelRef("skip_m02"))
			leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[2], workRegHi[2], table02)
			leoMulAdd256(ctx, workRegLo[1], workRegHi[1], workRegLo[3], workRegHi[3], table02)
			Label("skip_m02")

			// Store + Next loop:
			for i := range work {
				VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0})
				VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32})
				ADDQ(U8(64), work[i])
			}

			SUBQ(U8(64), bytes)
			JNZ(LabelRef("loop"))

			VZEROUPPER()
			RET()
		}
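		// fftDIT4 runs the same two layers in the opposite order: the
		// distance-2 (02) butterflies first, then the 01 and 23 pairs.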
		{
			TEXT("fftDIT4_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, table01 *[8*16]uint8, table23 *[8*16]uint8, table02 *[8*16]uint8, logMask uint8)"))
			Pragma("noescape")
			Comment("dist must be multiplied by 24 (size of slice header)")
			Comment("logMask holds log_m02==kModulus, log_m01==kModulus, log_m23==kModulus in bits 0, 1 and 2")

			// Unpack tables to stack. Slower.
			const unpackTables = false
			table01Ptr := Load(Param("table01"), GP64())
			table23Ptr := Load(Param("table23"), GP64())
			table02Ptr := Load(Param("table02"), GP64())

			// Prepare table pointers.
			table01 := [4]table256{}
			table23 := [4]table256{}
			table02 := [4]table256{}
			if avx512 {
				usedZmm := 0
				fill := func(t *[4]table256, ptr reg.Register) {
					for i := range table01 {
						t := &t[i]
						if len(extZMMs)-usedZmm >= 2 {
							tmpLo, tmpHi := YMM(), YMM()
							t.useZmmLo, t.useZmmHi = &extZMMs[usedZmm], &extZMMs[usedZmm+1]
							usedZmm += 2
							// Load and expand tables
							VBROADCASTI128(Mem{Base: ptr, Disp: i * 16}, tmpLo)
							VBROADCASTI128(Mem{Base: ptr, Disp: i*16 + 16*4}, tmpHi)
							VMOVAPS(tmpLo.AsZ(), *t.useZmmLo)
							VMOVAPS(tmpHi.AsZ(), *t.useZmmHi)
						} else {
							t.loadLo128 = &Mem{Base: ptr, Disp: i * 16}
							t.loadHi128 = &Mem{Base: ptr, Disp: i*16 + 16*4}
						}
					}
				}
				fill(&table02, table02Ptr)
				fill(&table01, table01Ptr)
				fill(&table23, table23Ptr)
			}
			for i := range table01 {
				if avx512 {
					continue
				}
				if unpackTables {
					toStack := func(m Mem) *Mem {
						stack := AllocLocal(32)
						y := YMM()
						VBROADCASTI128(m, y)
						VMOVDQU(y, stack)
						return &stack
					}

					table01[i].loadLo256 = toStack(Mem{Base: table01Ptr, Disp: i * 16})
					table23[i].loadLo256 = toStack(Mem{Base: table23Ptr, Disp: i * 16})
					table02[i].loadLo256 = toStack(Mem{Base: table02Ptr, Disp: i * 16})

					table01[i].loadHi256 = toStack(Mem{Base: table01Ptr, Disp: i*16 + 16*4})
					table23[i].loadHi256 = toStack(Mem{Base: table23Ptr, Disp: i*16 + 16*4})
					table02[i].loadHi256 = toStack(Mem{Base: table02Ptr, Disp: i*16 + 16*4})
				} else {
					table01[i].loadLo128 = &Mem{Base: table01Ptr, Disp: i * 16}
					table23[i].loadLo128 = &Mem{Base: table23Ptr, Disp: i * 16}
					table02[i].loadLo128 = &Mem{Base: table02Ptr, Disp: i * 16}

					table01[i].loadHi128 = &Mem{Base: table01Ptr, Disp: i*16 + 16*4}
					table23[i].loadHi128 = &Mem{Base: table23Ptr, Disp: i*16 + 16*4}
					table02[i].loadHi128 = &Mem{Base: table02Ptr, Disp: i*16 + 16*4}
				}
			}
			// Generate mask: 0x0f in every byte.
			ctx.clrMask = YMM()
			tmpMask := GP64()
			MOVQ(U32(15), tmpMask)
			MOVQ(tmpMask, ctx.clrMask.AsX())
			VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)

			dist := Load(Param("dist"), GP64())

			// Pointers to each "work" slice.
			var work [4]reg.GPVirtual
			workTable := Load(Param("work").Base(), GP64()) // &work[0]
			bytes := GP64()

			// Load length of work[0].
			MOVQ(Mem{Base: workTable, Disp: 8}, bytes)

			offset := GP64()
			XORQ(offset, offset)
			for i := range work {
				work[i] = GP64()
				// work[i] = &work[i*dist][0]
				MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i])
				if i < len(work)-1 {
					ADDQ(dist, offset)
				}
			}
			var workRegLo [4]reg.VecVirtual
			var workRegHi [4]reg.VecVirtual

			workRegLo[0], workRegHi[0] = YMM(), YMM()
			workRegLo[1], workRegHi[1] = YMM(), YMM()
			workRegLo[2], workRegHi[2] = YMM(), YMM()
			workRegLo[3], workRegHi[3] = YMM(), YMM()

			mask := Load(Param("logMask"), GP64())
			Label("loop")
			VMOVDQU(Mem{Base: work[0], Disp: 0}, workRegLo[0])
			VMOVDQU(Mem{Base: work[0], Disp: 32}, workRegHi[0])
			VMOVDQU(Mem{Base: work[2], Disp: 0}, workRegLo[2])
			VMOVDQU(Mem{Base: work[2], Disp: 32}, workRegHi[2])

			VMOVDQU(Mem{Base: work[1], Disp: 0}, workRegLo[1])
			VMOVDQU(Mem{Base: work[1], Disp: 32}, workRegHi[1])
			VMOVDQU(Mem{Base: work[3], Disp: 0}, workRegLo[3])
			VMOVDQU(Mem{Base: work[3], Disp: 32}, workRegHi[3])

			// First layer:

			// Test bit 0; skip the multiplies when log_m02==kModulus.
			BTQ(U8(0), mask)
			JC(LabelRef("skip_m02"))
			leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[2], workRegHi[2], table02)
			leoMulAdd256(ctx, workRegLo[1], workRegHi[1], workRegLo[3], workRegHi[3], table02)

			Label("skip_m02")

			VPXOR(workRegLo[0], workRegLo[2], workRegLo[2])
			VPXOR(workRegHi[0], workRegHi[2], workRegHi[2])
			VPXOR(workRegLo[1], workRegLo[3], workRegLo[3])
			VPXOR(workRegHi[1], workRegHi[3], workRegHi[3])

			// Second layer:
			// Test bit 1; skip the multiply when log_m01==kModulus.
			BTQ(U8(1), mask)
			JC(LabelRef("skip_m01"))
			leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[1], workRegHi[1], table01)
			Label("skip_m01")
			VPXOR(workRegLo[0], workRegLo[1], workRegLo[1])
			VPXOR(workRegHi[0], workRegHi[1], workRegHi[1])

			// Store work[0] and work[1], and advance their pointers.
			for i := range work[:2] {
				VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0})
				VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32})
				ADDQ(U8(64), work[i])
			}

			// Test bit 2; skip the multiply when log_m23==kModulus.
			BTQ(U8(2), mask)
			JC(LabelRef("skip_m23"))
			leoMulAdd256(ctx, workRegLo[2], workRegHi[2], workRegLo[3], workRegHi[3], table23)
			Label("skip_m23")
			VPXOR(workRegLo[2], workRegLo[3], workRegLo[3])
			VPXOR(workRegHi[2], workRegHi[3], workRegHi[3])

			// Store work[2] and work[3] + Next loop:
			for i := range work[2:] {
				i := i + 2
				VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0})
				VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32})
				ADDQ(U8(64), work[i])
			}

			SUBQ(U8(64), bytes)
			JNZ(LabelRef("loop"))

			VZEROUPPER()
			RET()
		}
	}

	// SSSE3:
	ctx.avx512 = false
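	// The SSSE3 kernels below mirror the AVX2 versions using 128-bit
	// registers and destructive two-operand instructions (PSHUFB, PXOR),
	// processing the same 64 bytes per iteration as two lo/hi XMM pairs.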
	{
		TEXT("ifftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
		Pragma("noescape")
		tablePtr := Load(Param("table"), GP64())
		tables := [4]table128{}
		for i, t := range tables {
			// We almost have enough space for all tables.
			if i > 2 {
				t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4}
			} else {
				t.Lo, t.Hi = XMM(), XMM()
				MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
				MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
			}
			tables[i] = t
		}
		// Generate mask: 0x0f in every byte.
		zero := XMM()
		XORPS(zero, zero) // A zero shuffle mask broadcasts byte 0.
		fifteen, mask := GP64(), XMM()
		MOVQ(U32(0xf), fifteen)
		MOVQ(fifteen, mask)
		PSHUFB(zero, mask)
		ctx.clrMask128 = mask

		bytes := Load(Param("x").Len(), GP64())
		x := Load(Param("x").Base(), GP64())
		y := Load(Param("y").Base(), GP64())

		Label("loop")
		for i := 0; i < 2; i++ {
			xLo, xHi, yLo, yHi := XMM(), XMM(), XMM(), XMM()
			MOVUPS(Mem{Base: x, Disp: i*16 + 0}, xLo)
			MOVUPS(Mem{Base: x, Disp: i*16 + 32}, xHi)
			MOVUPS(Mem{Base: y, Disp: i*16 + 0}, yLo)
			MOVUPS(Mem{Base: y, Disp: i*16 + 32}, yHi)
			PXOR(xLo, yLo)
			PXOR(xHi, yHi)
			MOVUPS(yLo, Mem{Base: y, Disp: i*16 + 0})
			MOVUPS(yHi, Mem{Base: y, Disp: i*16 + 32})
			leoMulAdd128(ctx, xLo, xHi, yLo, yHi, tables)
			MOVUPS(xLo, Mem{Base: x, Disp: i*16 + 0})
			MOVUPS(xHi, Mem{Base: x, Disp: i*16 + 32})
		}
		ADDQ(U8(64), x)
		ADDQ(U8(64), y)
		SUBQ(U8(64), bytes)
		JNZ(LabelRef("loop"))

		RET()
	}
	{
		TEXT("fftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
		Pragma("noescape")
		tablePtr := Load(Param("table"), GP64())
		tables := [4]table128{}
		for i, t := range tables {
			// We almost have enough space for all tables.
			if i > 2 {
				t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4}
			} else {
				t.Lo, t.Hi = XMM(), XMM()
				MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
				MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
			}
			tables[i] = t
		}
		// Generate mask: 0x0f in every byte.
		zero := XMM()
		XORPS(zero, zero) // A zero shuffle mask broadcasts byte 0.
		fifteen, mask := GP64(), XMM()
		MOVQ(U32(0xf), fifteen)
		MOVQ(fifteen, mask)
		PSHUFB(zero, mask)
		ctx.clrMask128 = mask

		bytes := Load(Param("x").Len(), GP64())
		x := Load(Param("x").Base(), GP64())
		y := Load(Param("y").Base(), GP64())

		Label("loop")
		for i := 0; i < 2; i++ {
			xLo, xHi, yLo, yHi := XMM(), XMM(), XMM(), XMM()
			MOVUPS(Mem{Base: y, Disp: i*16 + 0}, yLo)
			MOVUPS(Mem{Base: y, Disp: i*16 + 32}, yHi)

			prodLo, prodHi := leoMul128(ctx, yLo, yHi, tables)

			MOVUPS(Mem{Base: x, Disp: i*16 + 0}, xLo)
			MOVUPS(Mem{Base: x, Disp: i*16 + 32}, xHi)
			PXOR(prodLo, xLo)
			PXOR(prodHi, xHi)
			MOVUPS(xLo, Mem{Base: x, Disp: i*16 + 0})
			MOVUPS(xHi, Mem{Base: x, Disp: i*16 + 32})

			PXOR(xLo, yLo)
			PXOR(xHi, yHi)
			MOVUPS(yLo, Mem{Base: y, Disp: i*16 + 0})
			MOVUPS(yHi, Mem{Base: y, Disp: i*16 + 32})
		}

		ADDQ(U8(64), x)
		ADDQ(U8(64), y)
		SUBQ(U8(64), bytes)
		JNZ(LabelRef("loop"))

		RET()
	}
	{
		TEXT("mulgf16_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
		Pragma("noescape")
		tablePtr := Load(Param("table"), GP64())
		tables := [4]table128{}
		for i, t := range tables {
			// We have enough space for all tables.
			if i > 3 {
				t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4}
			} else {
				t.Lo, t.Hi = XMM(), XMM()
				MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
				MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
			}
			tables[i] = t
		}
		bytes := Load(Param("x").Len(), GP64())
		x := Load(Param("x").Base(), GP64())
		y := Load(Param("y").Base(), GP64())
		// Generate mask: 0x0f in every byte.
		zero := XMM()
		XORPS(zero, zero) // A zero shuffle mask broadcasts byte 0.
		fifteen, mask := GP64(), XMM()
		MOVQ(U32(0xf), fifteen)
		MOVQ(fifteen, mask)
		PSHUFB(zero, mask)
		ctx.clrMask128 = mask

		Label("loop")
		for i := 0; i < 2; i++ {
			dataLo, dataHi := XMM(), XMM()
			MOVUPS(Mem{Base: y, Disp: i*16 + 0}, dataLo)
			MOVUPS(Mem{Base: y, Disp: i*16 + 32}, dataHi)

			prodLo, prodHi := leoMul128(ctx, dataLo, dataHi, tables)
			MOVUPS(prodLo, Mem{Base: x, Disp: i*16 + 0})
			MOVUPS(prodHi, Mem{Base: x, Disp: i*16 + 32})
		}

		ADDQ(U8(64), x)
		ADDQ(U8(64), y)
		SUBQ(U8(64), bytes)
		JNZ(LabelRef("loop"))

		RET()
	}
}
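
// The multiply kernels below implement multiplication by a fixed GF(2^16)
// factor via PSHUFB nibble lookups: the 64-byte block is kept as separate
// low/high byte planes, each byte is split into two 4-bit nibbles, and the
// product is the XOR of four table lookups. A scalar sketch per output
// plane (hedged; tab0..tab3 stand for the caller-supplied table halves):
//
//	prod := tab0[lo&15] ^ tab1[lo>>4] ^ tab2[hi&15] ^ tab3[hi>>4]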
// leoMulAdd256: xLo, xHi updated; yLo, yHi preserved.
func leoMulAdd256(ctx gf16ctx, xLo, xHi, yLo, yHi reg.VecVirtual, table [4]table256) {
	// Inlined:
	// prodLo, prodHi := leoMul256(ctx, yLo, yHi, table)
	lo := yLo
	hi := yHi
	data0, data1 := YMM(), YMM()
	VPSRLQ(U8(4), lo, data1)         // data1 = lo >> 4
	VPAND(ctx.clrMask, lo, data0)    // data0 = lo & 0xf
	VPAND(ctx.clrMask, data1, data1) // data1 = data1 & 0xf
	prodLo, prodHi := YMM(), YMM()
	table[0].prepare()
	VPSHUFB(data0, table[0].Lo, prodLo)
	VPSHUFB(data0, table[0].Hi, prodHi)
	tmpLo, tmpHi := YMM(), YMM()
	table[1].prepare()
	VPSHUFB(data1, table[1].Lo, tmpLo)
	VPSHUFB(data1, table[1].Hi, tmpHi)
	VPXOR(prodLo, tmpLo, prodLo)
	VPXOR(prodHi, tmpHi, prodHi)

	// Now process high
	data0, data1 = YMM(), YMM() // Realloc to break dep
	VPAND(hi, ctx.clrMask, data0)
	VPSRLQ(U8(4), hi, data1)
	VPAND(ctx.clrMask, data1, data1)

	tmpLo, tmpHi = YMM(), YMM() // Realloc to break dep
	table[2].prepare()
	VPSHUFB(data0, table[2].Lo, tmpLo)
	VPSHUFB(data0, table[2].Hi, tmpHi)
	VPXOR(prodLo, tmpLo, prodLo)
	VPXOR(prodHi, tmpHi, prodHi)
	table[3].prepare()
	VPSHUFB(data1, table[3].Lo, tmpLo)
	VPSHUFB(data1, table[3].Hi, tmpHi)
	if ctx.avx512 {
		// x ^= prod ^ tmp in one instruction (imm 0x96 = three-way XOR).
		VPTERNLOGD(U8(0x96), prodLo, tmpLo, xLo)
		VPTERNLOGD(U8(0x96), prodHi, tmpHi, xHi)
	} else {
		VPXOR3way(prodLo, tmpLo, xLo)
		VPXOR3way(prodHi, tmpHi, xHi)
	}
}
// leoMul256 returns the product in new registers; lo, hi are preserved.
func leoMul256(ctx gf16ctx, lo, hi reg.VecVirtual, table [4]table256) (prodLo, prodHi reg.VecVirtual) {
	data0, data1 := YMM(), YMM()
	VPSRLQ(U8(4), lo, data1)         // data1 = lo >> 4
	VPAND(ctx.clrMask, lo, data0)    // data0 = lo & 0xf
	VPAND(ctx.clrMask, data1, data1) // data1 = data1 & 0xf
	prodLo, prodHi = YMM(), YMM()
	table[0].prepare()
	VPSHUFB(data0, table[0].Lo, prodLo)
	VPSHUFB(data0, table[0].Hi, prodHi)
	tmpLo, tmpHi := YMM(), YMM()
	table[1].prepare()
	VPSHUFB(data1, table[1].Lo, tmpLo)
	VPSHUFB(data1, table[1].Hi, tmpHi)
	VPXOR(prodLo, tmpLo, prodLo)
	VPXOR(prodHi, tmpHi, prodHi)

	// Now process high
	data0, data1 = YMM(), YMM() // Realloc to break dep
	VPAND(hi, ctx.clrMask, data0)
	VPSRLQ(U8(4), hi, data1)
	VPAND(ctx.clrMask, data1, data1)

	tmpLo, tmpHi = YMM(), YMM() // Realloc to break dep
	table[2].prepare()
	VPSHUFB(data0, table[2].Lo, tmpLo)
	VPSHUFB(data0, table[2].Hi, tmpHi)
	VPXOR(prodLo, tmpLo, prodLo)
	VPXOR(prodHi, tmpHi, prodHi)
	table[3].prepare()
	VPSHUFB(data1, table[3].Lo, tmpLo)
	VPSHUFB(data1, table[3].Hi, tmpHi)
	VPXOR(prodLo, tmpLo, prodLo)
	VPXOR(prodHi, tmpHi, prodHi)
	return
}
// leoMulAdd128: xLo, xHi updated; yLo, yHi preserved.
func leoMulAdd128(ctx gf16ctx, xLo, xHi, yLo, yHi reg.VecVirtual, table [4]table128) {
	prodLo, prodHi := leoMul128(ctx, yLo, yHi, table)
	PXOR(prodLo, xLo)
	PXOR(prodHi, xHi)
}

// leoMul128 returns the product; lo, hi are preserved
// (but reusing them will likely take extra registers).
func leoMul128(ctx gf16ctx, lo, hi reg.VecVirtual, table [4]table128) (prodLo, prodHi reg.VecVirtual) {
	data0, data1 := XMM(), XMM()
	MOVAPS(lo, data1)
	PSRLQ(U8(4), data1) // data1 = lo >> 4
	MOVAPS(lo, data0)
	PAND(ctx.clrMask128, data0) // data0 = lo & 0xf
	PAND(ctx.clrMask128, data1) // data1 = data1 & 0xf
	prodLo, prodHi = XMM(), XMM()
	MOVUPS(table[0].Lo, prodLo)
	MOVUPS(table[0].Hi, prodHi)
	PSHUFB(data0, prodLo)
	PSHUFB(data0, prodHi)
	tmpLo, tmpHi := XMM(), XMM()
	MOVUPS(table[1].Lo, tmpLo)
	MOVUPS(table[1].Hi, tmpHi)
	PSHUFB(data1, tmpLo)
	PSHUFB(data1, tmpHi)
	PXOR(tmpLo, prodLo)
	PXOR(tmpHi, prodHi)

	// Now process high
	data0, data1 = XMM(), XMM() // Realloc to break dep
	MOVAPS(hi, data0)
	MOVAPS(hi, data1)
	PAND(ctx.clrMask128, data0)
	PSRLQ(U8(4), data1)
	PAND(ctx.clrMask128, data1)

	tmpLo, tmpHi = XMM(), XMM() // Realloc to break dep
	MOVUPS(table[2].Lo, tmpLo)
	MOVUPS(table[2].Hi, tmpHi)
	PSHUFB(data0, tmpLo)
	PSHUFB(data0, tmpHi)
	PXOR(tmpLo, prodLo)
	PXOR(tmpHi, prodHi)
	MOVUPS(table[3].Lo, tmpLo)
	MOVUPS(table[3].Hi, tmpHi)
	PSHUFB(data1, tmpLo)
	PSHUFB(data1, tmpHi)
	PXOR(tmpLo, prodLo)
	PXOR(tmpHi, prodHi)
	return
}
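
// For reference, an avo driver is expected to call this generator along
// these lines (a hedged sketch; the actual main is outside this section):
//
//	func main() {
//		genGF16()
//		Generate()
//	}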