// reedsolomon-go/_gen/gf16.go

//go:build generate
// +build generate

package main

import (
"fmt"
"github.com/mmcloughlin/avo/attr"
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
"github.com/mmcloughlin/avo/reg"
)
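// table256 describes one 16-entry lookup table to be used as a 256-bit PSHUFB
// operand. Lo and Hi hold the expanded YMM registers once prepare() has run.
// The optional fields defer the load: loadLo128/loadHi128 broadcast a 16-byte
// table, loadLo256/loadHi256 load an already expanded 32-byte copy, and
// useZmmLo/useZmmHi reuse a table cached in an upper ZMM register (AVX-512 only).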
type table256 struct {
Lo, Hi Op
loadLo128, loadHi128 *Mem
loadLo256, loadHi256 *Mem
useZmmLo, useZmmHi *reg.VecPhysical
}
func (t *table256) prepare() {
t.prepareLo()
t.prepareHi()
}
func (t *table256) prepareHi() {
if t.loadHi128 != nil {
t.Hi = YMM()
// Load and expand tables
VBROADCASTI128(*t.loadHi128, t.Hi)
}
if t.loadHi256 != nil {
t.Hi = YMM()
// Load pre-expanded table
VMOVDQU(*t.loadHi256, t.Hi)
}
if t.useZmmHi != nil {
r := *t.useZmmHi
t.Hi = r.AsY()
}
}
func (t *table256) prepareLo() {
if t.loadLo128 != nil {
t.Lo = YMM()
// Load and expand tables
VBROADCASTI128(*t.loadLo128, t.Lo)
}
if t.loadLo256 != nil {
t.Lo = YMM()
// Load pre-expanded table
VMOVDQU(*t.loadLo256, t.Lo)
}
if t.useZmmLo != nil {
r := *t.useZmmLo
t.Lo = r.AsY()
}
}
// table128 holds a Lo/Hi pair of 16-byte lookup tables, each either preloaded into an XMM register or left as a memory operand.
type table128 struct {
Lo, Hi Op
}
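// gf16ctx carries state shared by the generated kernels: clrMask and
// clrMask128 hold the 0x0f nibble mask broadcast to every byte (YMM and XMM
// variants), and avx512 selects VPTERNLOGD over the plain-AVX2 XOR fallback in
// leoMulAdd256.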
type gf16ctx struct {
clrMask reg.VecVirtual
clrMask128 reg.VecVirtual
avx512 bool
}
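// genGF16 emits the GF(2^16) kernels ported from catid/leopard: 2-point and
// 4-point DIT (I)FFT butterflies and a plain multiply, in AVX2, AVX-512 and
// SSSE3 variants. Every kernel consumes 64 bytes per iteration, so callers are
// expected to pass buffer lengths that are non-zero multiples of 64.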
func genGF16() {
var ctx gf16ctx
// Ported from static void IFFT_DIT2
// https://github.com/catid/leopard/blob/master/LeopardFF16.cpp#L629
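// Per 64-byte block the loop below computes the 2-point inverse DIT butterfly:
//
//	y ^= x
//	x ^= mul(y, table)
//
// where mul is the nibble-wise PSHUFB multiply implemented by leoMulAdd256.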
{
TEXT("ifftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table256{}
for i, t := range tables {
t.Lo, t.Hi = YMM(), YMM()
// Load and expand tables
VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
tables[i] = t
}
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
xLo, xHi, yLo, yHi := YMM(), YMM(), YMM(), YMM()
Label("loop")
VMOVDQU(Mem{Base: x, Disp: 0}, xLo)
VMOVDQU(Mem{Base: x, Disp: 32}, xHi)
VMOVDQU(Mem{Base: y, Disp: 0}, yLo)
VMOVDQU(Mem{Base: y, Disp: 32}, yHi)
VPXOR(yLo, xLo, yLo)
VPXOR(yHi, xHi, yHi)
VMOVDQU(yLo, Mem{Base: y, Disp: 0})
VMOVDQU(yHi, Mem{Base: y, Disp: 32})
leoMulAdd256(ctx, xLo, xHi, yLo, yHi, tables)
VMOVDQU(xLo, Mem{Base: x, Disp: 0})
VMOVDQU(xHi, Mem{Base: x, Disp: 32})
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
{
TEXT("fftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table256{}
for i, t := range tables {
t.Lo, t.Hi = YMM(), YMM()
// Load and expand tables
VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
tables[i] = t
}
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
xLo, xHi, yLo, yHi := YMM(), YMM(), YMM(), YMM()
Label("loop")
VMOVDQU(Mem{Base: x, Disp: 0}, xLo)
VMOVDQU(Mem{Base: x, Disp: 32}, xHi)
VMOVDQU(Mem{Base: y, Disp: 0}, yLo)
VMOVDQU(Mem{Base: y, Disp: 32}, yHi)
leoMulAdd256(ctx, xLo, xHi, yLo, yHi, tables)
VMOVDQU(xLo, Mem{Base: x, Disp: 0})
VMOVDQU(xHi, Mem{Base: x, Disp: 32})
// Reload y, or we would need more than the 16 available YMM registers.
if true {
yLo, yHi = YMM(), YMM()
VMOVDQU(Mem{Base: y, Disp: 0}, yLo)
VMOVDQU(Mem{Base: y, Disp: 32}, yHi)
}
VPXOR(yLo, xLo, yLo)
VPXOR(yHi, xHi, yHi)
VMOVDQU(yLo, Mem{Base: y, Disp: 0})
VMOVDQU(yHi, Mem{Base: y, Disp: 32})
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
{
TEXT("mulgf16_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table256{}
for i, t := range tables {
t.Lo, t.Hi = YMM(), YMM()
// Load and expand tables
VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
tables[i] = t
}
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
dataLo, dataHi := YMM(), YMM()
Label("loop")
VMOVDQU(Mem{Base: y, Disp: 0}, dataLo)
VMOVDQU(Mem{Base: y, Disp: 32}, dataHi)
prodLo, prodHi := leoMul256(ctx, dataLo, dataHi, tables)
VMOVDQU(prodLo, Mem{Base: x, Disp: 0})
VMOVDQU(prodHi, Mem{Base: x, Disp: 32})
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
for _, avx512 := range []bool{true, false} {
// The AVX-512 variants differ only by using the upper ZMM registers to cache tables.
var suffix = "avx2"
if avx512 {
suffix = "avx512"
}
ctx.avx512 = avx512
extZMMs := []reg.VecPhysical{reg.Z16, reg.Z17, reg.Z18, reg.Z19, reg.Z20, reg.Z21, reg.Z22, reg.Z23, reg.Z24, reg.Z25, reg.Z26, reg.Z27, reg.Z28, reg.Z29, reg.Z30, reg.Z31}
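// extZMMs are the upper AVX-512 registers, which the AVX2 instructions below
// cannot address. The avx512 variants use them to keep expanded lookup tables
// resident for the whole loop instead of re-broadcasting them every iteration.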
{
TEXT("ifftDIT4_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, table01 *[8*16]uint8, table23 *[8*16]uint8, table02 *[8*16]uint8, logMask uint8)"))
Pragma("noescape")
Comment("dist must be multiplied by 24 (size of slice header)")
Comment("logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3")
// Unpack tables to stack. Slower.
const unpackTables = false
table01Ptr := Load(Param("table01"), GP64())
table23Ptr := Load(Param("table23"), GP64())
table02Ptr := Load(Param("table02"), GP64())
// Prepare table pointers.
table01 := [4]table256{}
table23 := [4]table256{}
table02 := [4]table256{}
if avx512 {
usedZmm := 0
fill := func(t *[4]table256, ptr reg.Register) {
for i := range table01 {
t := &t[i]
if len(extZMMs)-usedZmm >= 2 {
tmpLo, tmpHi := YMM(), YMM()
t.useZmmLo, t.useZmmHi = &extZMMs[usedZmm], &extZMMs[usedZmm+1]
usedZmm += 2
// Load and expand tables
VBROADCASTI128(Mem{Base: ptr, Disp: i * 16}, tmpLo)
VBROADCASTI128(Mem{Base: ptr, Disp: i*16 + 16*4}, tmpHi)
VMOVAPS(tmpLo.AsZ(), *t.useZmmLo)
VMOVAPS(tmpHi.AsZ(), *t.useZmmHi)
} else {
t.loadLo128 = &Mem{Base: ptr, Disp: i * 16}
t.loadHi128 = &Mem{Base: ptr, Disp: i*16 + 16*4}
}
}
}
fill(&table02, table02Ptr)
fill(&table01, table01Ptr)
fill(&table23, table23Ptr)
}
for i := range table01 {
if avx512 {
continue
}
if unpackTables {
toStack := func(m Mem) *Mem {
stack := AllocLocal(32)
y := YMM()
VBROADCASTI128(m, y)
VMOVDQU(y, stack)
return &stack
}
table01[i].loadLo256 = toStack(Mem{Base: table01Ptr, Disp: i * 16})
table23[i].loadLo256 = toStack(Mem{Base: table23Ptr, Disp: i * 16})
table02[i].loadLo256 = toStack(Mem{Base: table02Ptr, Disp: i * 16})
table01[i].loadHi256 = toStack(Mem{Base: table01Ptr, Disp: i*16 + 16*4})
table23[i].loadHi256 = toStack(Mem{Base: table23Ptr, Disp: i*16 + 16*4})
table02[i].loadHi256 = toStack(Mem{Base: table02Ptr, Disp: i*16 + 16*4})
} else {
table01[i].loadLo128 = &Mem{Base: table01Ptr, Disp: i * 16}
table23[i].loadLo128 = &Mem{Base: table23Ptr, Disp: i * 16}
table02[i].loadLo128 = &Mem{Base: table02Ptr, Disp: i * 16}
table01[i].loadHi128 = &Mem{Base: table01Ptr, Disp: i*16 + 16*4}
table23[i].loadHi128 = &Mem{Base: table23Ptr, Disp: i*16 + 16*4}
table02[i].loadHi128 = &Mem{Base: table02Ptr, Disp: i*16 + 16*4}
}
}
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
dist := Load(Param("dist"), GP64())
// Pointers to each "work"
var work [4]reg.GPVirtual
workTable := Load(Param("work").Base(), GP64()) // &work[0]
bytes := GP64()
// Load length of work[0]
MOVQ(Mem{Base: workTable, Disp: 8}, bytes)
offset := GP64()
XORQ(offset, offset)
for i := range work {
work[i] = GP64()
// work[i] = &workTable[dist*i]
MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i])
if i < len(work)-1 {
ADDQ(dist, offset)
}
}
var workRegLo [4]reg.VecVirtual
var workRegHi [4]reg.VecVirtual
workRegLo[0], workRegHi[0] = YMM(), YMM()
workRegLo[1], workRegHi[1] = YMM(), YMM()
mask := Load(Param("logMask"), GP64())
Label("loop")
VMOVDQU(Mem{Base: work[0], Disp: 0}, workRegLo[0])
VMOVDQU(Mem{Base: work[0], Disp: 32}, workRegHi[0])
VMOVDQU(Mem{Base: work[1], Disp: 0}, workRegLo[1])
VMOVDQU(Mem{Base: work[1], Disp: 32}, workRegHi[1])
// First layer:
VPXOR(workRegLo[0], workRegLo[1], workRegLo[1])
VPXOR(workRegHi[0], workRegHi[1], workRegHi[1])
// Test bit 0
BTQ(U8(0), mask)
JC(LabelRef("skip_m01"))
leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[1], workRegHi[1], table01)
Label("skip_m01")
workRegLo[2], workRegHi[2] = YMM(), YMM()
workRegLo[3], workRegHi[3] = YMM(), YMM()
VMOVDQU(Mem{Base: work[2], Disp: 0}, workRegLo[2])
VMOVDQU(Mem{Base: work[2], Disp: 32}, workRegHi[2])
VMOVDQU(Mem{Base: work[3], Disp: 0}, workRegLo[3])
VMOVDQU(Mem{Base: work[3], Disp: 32}, workRegHi[3])
VPXOR(workRegLo[2], workRegLo[3], workRegLo[3])
VPXOR(workRegHi[2], workRegHi[3], workRegHi[3])
// Test bit 1
BTQ(U8(1), mask)
JC(LabelRef("skip_m23"))
leoMulAdd256(ctx, workRegLo[2], workRegHi[2], workRegLo[3], workRegHi[3], table23)
Label("skip_m23")
// Second layer:
VPXOR(workRegLo[0], workRegLo[2], workRegLo[2])
VPXOR(workRegHi[0], workRegHi[2], workRegHi[2])
VPXOR(workRegLo[1], workRegLo[3], workRegLo[3])
VPXOR(workRegHi[1], workRegHi[3], workRegHi[3])
// Test bit 2
BTQ(U8(2), mask)
JC(LabelRef("skip_m02"))
leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[2], workRegHi[2], table02)
leoMulAdd256(ctx, workRegLo[1], workRegHi[1], workRegLo[3], workRegHi[3], table02)
Label("skip_m02")
// Store + Next loop:
for i := range work {
VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0})
VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32})
ADDQ(U8(64), work[i])
}
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
{
TEXT("fftDIT4_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, table01 *[8*16]uint8, table23 *[8*16]uint8, table02 *[8*16]uint8, logMask uint8)"))
Pragma("noescape")
Comment("dist must be multiplied by 24 (size of slice header)")
Comment("logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3")
// Unpack tables to stack. Slower.
const unpackTables = false
table01Ptr := Load(Param("table01"), GP64())
table23Ptr := Load(Param("table23"), GP64())
table02Ptr := Load(Param("table02"), GP64())
// Prepare table pointers.
table01 := [4]table256{}
table23 := [4]table256{}
table02 := [4]table256{}
if avx512 {
usedZmm := 0
fill := func(t *[4]table256, ptr reg.Register) {
for i := range table01 {
t := &t[i]
if len(extZMMs)-usedZmm >= 2 {
tmpLo, tmpHi := YMM(), YMM()
t.useZmmLo, t.useZmmHi = &extZMMs[usedZmm], &extZMMs[usedZmm+1]
usedZmm += 2
// Load and expand tables
VBROADCASTI128(Mem{Base: ptr, Disp: i * 16}, tmpLo)
VBROADCASTI128(Mem{Base: ptr, Disp: i*16 + 16*4}, tmpHi)
VMOVAPS(tmpLo.AsZ(), *t.useZmmLo)
VMOVAPS(tmpHi.AsZ(), *t.useZmmHi)
} else {
t.loadLo128 = &Mem{Base: ptr, Disp: i * 16}
t.loadHi128 = &Mem{Base: ptr, Disp: i*16 + 16*4}
}
}
}
fill(&table02, table02Ptr)
fill(&table01, table01Ptr)
fill(&table23, table23Ptr)
}
for i := range table01 {
if avx512 {
continue
}
if unpackTables {
toStack := func(m Mem) *Mem {
stack := AllocLocal(32)
y := YMM()
VBROADCASTI128(m, y)
VMOVDQU(y, stack)
return &stack
}
table01[i].loadLo256 = toStack(Mem{Base: table01Ptr, Disp: i * 16})
table23[i].loadLo256 = toStack(Mem{Base: table23Ptr, Disp: i * 16})
table02[i].loadLo256 = toStack(Mem{Base: table02Ptr, Disp: i * 16})
table01[i].loadHi256 = toStack(Mem{Base: table01Ptr, Disp: i*16 + 16*4})
table23[i].loadHi256 = toStack(Mem{Base: table23Ptr, Disp: i*16 + 16*4})
table02[i].loadHi256 = toStack(Mem{Base: table02Ptr, Disp: i*16 + 16*4})
} else {
table01[i].loadLo128 = &Mem{Base: table01Ptr, Disp: i * 16}
table23[i].loadLo128 = &Mem{Base: table23Ptr, Disp: i * 16}
table02[i].loadLo128 = &Mem{Base: table02Ptr, Disp: i * 16}
table01[i].loadHi128 = &Mem{Base: table01Ptr, Disp: i*16 + 16*4}
table23[i].loadHi128 = &Mem{Base: table23Ptr, Disp: i*16 + 16*4}
table02[i].loadHi128 = &Mem{Base: table02Ptr, Disp: i*16 + 16*4}
}
}
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
dist := Load(Param("dist"), GP64())
// Pointers to each "work"
var work [4]reg.GPVirtual
workTable := Load(Param("work").Base(), GP64()) // &work[0]
bytes := GP64()
// Load length of work[0]
MOVQ(Mem{Base: workTable, Disp: 8}, bytes)
offset := GP64()
XORQ(offset, offset)
for i := range work {
work[i] = GP64()
// work[i] = &workTable[dist*i]
MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i])
if i < len(work)-1 {
ADDQ(dist, offset)
}
}
var workRegLo [4]reg.VecVirtual
var workRegHi [4]reg.VecVirtual
workRegLo[0], workRegHi[0] = YMM(), YMM()
workRegLo[1], workRegHi[1] = YMM(), YMM()
workRegLo[2], workRegHi[2] = YMM(), YMM()
workRegLo[3], workRegHi[3] = YMM(), YMM()
mask := Load(Param("logMask"), GP64())
Label("loop")
VMOVDQU(Mem{Base: work[0], Disp: 0}, workRegLo[0])
VMOVDQU(Mem{Base: work[0], Disp: 32}, workRegHi[0])
VMOVDQU(Mem{Base: work[2], Disp: 0}, workRegLo[2])
VMOVDQU(Mem{Base: work[2], Disp: 32}, workRegHi[2])
VMOVDQU(Mem{Base: work[1], Disp: 0}, workRegLo[1])
VMOVDQU(Mem{Base: work[1], Disp: 32}, workRegHi[1])
VMOVDQU(Mem{Base: work[3], Disp: 0}, workRegLo[3])
VMOVDQU(Mem{Base: work[3], Disp: 32}, workRegHi[3])
// First layer:
// Test bit 0
BTQ(U8(0), mask)
JC(LabelRef("skip_m02"))
leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[2], workRegHi[2], table02)
leoMulAdd256(ctx, workRegLo[1], workRegHi[1], workRegLo[3], workRegHi[3], table02)
Label("skip_m02")
VPXOR(workRegLo[0], workRegLo[2], workRegLo[2])
VPXOR(workRegHi[0], workRegHi[2], workRegHi[2])
VPXOR(workRegLo[1], workRegLo[3], workRegLo[3])
VPXOR(workRegHi[1], workRegHi[3], workRegHi[3])
// Second layer:
// Test bit 1
BTQ(U8(1), mask)
JC(LabelRef("skip_m01"))
leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[1], workRegHi[1], table01)
Label("skip_m01")
VPXOR(workRegLo[0], workRegLo[1], workRegLo[1])
VPXOR(workRegHi[0], workRegHi[1], workRegHi[1])
// Store...
for i := range work[:2] {
VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0})
VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32})
ADDQ(U8(64), work[i])
}
// Test bit 2
BTQ(U8(2), mask)
JC(LabelRef("skip_m23"))
leoMulAdd256(ctx, workRegLo[2], workRegHi[2], workRegLo[3], workRegHi[3], table23)
Label("skip_m23")
VPXOR(workRegLo[2], workRegLo[3], workRegLo[3])
VPXOR(workRegHi[2], workRegHi[3], workRegHi[3])
// Store + Next loop:
for i := range work[2:] {
i := i + 2
VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0})
VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32})
ADDQ(U8(64), work[i])
}
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
}
// SSSE3:
ctx.avx512 = false
{
TEXT("ifftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table128{}
for i, t := range tables {
// Not quite enough XMM registers for all tables; keep the last pair in memory.
if i > 2 {
t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4}
} else {
t.Lo, t.Hi = XMM(), XMM()
MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
}
tables[i] = t
}
// Generate mask
zero := XMM()
XORPS(zero, zero) // Zero shuffle indexes: PSHUFB below broadcasts the low mask byte (0x0f).
fifteen, mask := GP64(), XMM()
MOVQ(U32(0xf), fifteen)
MOVQ(fifteen, mask)
PSHUFB(zero, mask)
ctx.clrMask128 = mask
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
Label("loop")
for i := 0; i < 2; i++ {
xLo, xHi, yLo, yHi := XMM(), XMM(), XMM(), XMM()
MOVUPS(Mem{Base: x, Disp: i*16 + 0}, xLo)
MOVUPS(Mem{Base: x, Disp: i*16 + 32}, xHi)
MOVUPS(Mem{Base: y, Disp: i*16 + 0}, yLo)
MOVUPS(Mem{Base: y, Disp: i*16 + 32}, yHi)
PXOR(xLo, yLo)
PXOR(xHi, yHi)
MOVUPS(yLo, Mem{Base: y, Disp: i*16 + 0})
MOVUPS(yHi, Mem{Base: y, Disp: i*16 + 32})
leoMulAdd128(ctx, xLo, xHi, yLo, yHi, tables)
MOVUPS(xLo, Mem{Base: x, Disp: i*16 + 0})
MOVUPS(xHi, Mem{Base: x, Disp: i*16 + 32})
}
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
RET()
}
{
TEXT("fftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table128{}
for i, t := range tables {
// Not quite enough XMM registers for all tables; keep the last pair in memory.
if i > 2 {
t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4}
} else {
t.Lo, t.Hi = XMM(), XMM()
MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
}
tables[i] = t
}
// Generate mask
zero := XMM()
XORPS(zero, zero) // Zero shuffle indexes: PSHUFB below broadcasts the low mask byte (0x0f).
fifteen, mask := GP64(), XMM()
MOVQ(U32(0xf), fifteen)
MOVQ(fifteen, mask)
PSHUFB(zero, mask)
ctx.clrMask128 = mask
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
Label("loop")
for i := 0; i < 2; i++ {
xLo, xHi, yLo, yHi := XMM(), XMM(), XMM(), XMM()
MOVUPS(Mem{Base: y, Disp: i*16 + 0}, yLo)
MOVUPS(Mem{Base: y, Disp: i*16 + 32}, yHi)
prodLo, prodHi := leoMul128(ctx, yLo, yHi, tables)
MOVUPS(Mem{Base: x, Disp: i*16 + 0}, xLo)
MOVUPS(Mem{Base: x, Disp: i*16 + 32}, xHi)
PXOR(prodLo, xLo)
PXOR(prodHi, xHi)
MOVUPS(xLo, Mem{Base: x, Disp: i*16 + 0})
MOVUPS(xHi, Mem{Base: x, Disp: i*16 + 32})
PXOR(xLo, yLo)
PXOR(xHi, yHi)
MOVUPS(yLo, Mem{Base: y, Disp: i*16 + 0})
MOVUPS(yHi, Mem{Base: y, Disp: i*16 + 32})
}
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
RET()
}
{
TEXT("mulgf16_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table128{}
for i, t := range tables {
// We have enough space for all tables.
if i > 3 {
t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4}
} else {
t.Lo, t.Hi = XMM(), XMM()
MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
}
tables[i] = t
}
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
// Generate mask
zero := XMM()
XORPS(zero, zero) // Zero shuffle indexes: PSHUFB below broadcasts the low mask byte (0x0f).
fifteen, mask := GP64(), XMM()
MOVQ(U32(0xf), fifteen)
MOVQ(fifteen, mask)
PSHUFB(zero, mask)
ctx.clrMask128 = mask
Label("loop")
for i := 0; i < 2; i++ {
dataLo, dataHi := XMM(), XMM()
MOVUPS(Mem{Base: y, Disp: i*16 + 0}, dataLo)
MOVUPS(Mem{Base: y, Disp: i*16 + 32}, dataHi)
prodLo, prodHi := leoMul128(ctx, dataLo, dataHi, tables)
MOVUPS(prodLo, Mem{Base: x, Disp: i*16 + 0})
MOVUPS(prodHi, Mem{Base: x, Disp: i*16 + 32})
}
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
RET()
}
}
// leoMulAdd256 computes xLo/xHi ^= mul(yLo/yHi, table). xLo and xHi are updated; yLo and yHi are preserved.
func leoMulAdd256(ctx gf16ctx, xLo, xHi, yLo, yHi reg.VecVirtual, table [4]table256) {
// inlined:
// prodLo, prodHi := leoMul256(ctx, yLo, yHi, table)
lo := yLo
hi := yHi
data0, data1 := YMM(), YMM()
VPSRLQ(U8(4), lo, data1) // data1 = lo >> 4
VPAND(ctx.clrMask, lo, data0) // data0 = lo & 0xf
VPAND(ctx.clrMask, data1, data1) // data1 = data1 & 0xf
prodLo, prodHi := YMM(), YMM()
table[0].prepare()
VPSHUFB(data0, table[0].Lo, prodLo)
VPSHUFB(data0, table[0].Hi, prodHi)
tmpLo, tmpHi := YMM(), YMM()
table[1].prepare()
VPSHUFB(data1, table[1].Lo, tmpLo)
VPSHUFB(data1, table[1].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
// Now process high
data0, data1 = YMM(), YMM() // Realloc to break dep
VPAND(hi, ctx.clrMask, data0)
VPSRLQ(U8(4), hi, data1)
VPAND(ctx.clrMask, data1, data1)
tmpLo, tmpHi = YMM(), YMM() // Realloc to break dep
table[2].prepare()
VPSHUFB(data0, table[2].Lo, tmpLo)
VPSHUFB(data0, table[2].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
table[3].prepare()
VPSHUFB(data1, table[3].Lo, tmpLo)
VPSHUFB(data1, table[3].Hi, tmpHi)
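// Fold prod, tmp and x into x with a single 3-input XOR where possible:
// VPTERNLOGD immediate 0x96 is the truth table of A^B^C, so one instruction
// does the 3-way XOR that the VPXOR3way helper performs on plain AVX2.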
if ctx.avx512 {
VPTERNLOGD(U8(0x96), prodLo, tmpLo, xLo)
VPTERNLOGD(U8(0x96), prodHi, tmpHi, xHi)
} else {
VPXOR3way(prodLo, tmpLo, xLo)
VPXOR3way(prodHi, tmpHi, xHi)
}
}
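// refMulAddGF16 is an illustrative scalar sketch, not used by the generator, of
// what the emitted mulAdd kernels compute per 64-byte block. It assumes the
// layout implied by the loads above (Disp 0 and 32): bytes 0..31 of each block
// hold the low bytes and bytes 32..63 the high bytes of 32 GF(2^16) elements,
// and that table holds eight 16-byte PSHUFB tables, the first four producing
// the low product byte and the last four the high product byte, indexed by the
// nibbles lo&15, lo>>4, hi&15 and hi>>4. len(x) == len(y), a multiple of 64.
func refMulAddGF16(x, y []byte, table *[8 * 16]uint8) {
	for off := 0; off < len(x); off += 64 {
		for i := 0; i < 32; i++ {
			lo, hi := y[off+i], y[off+32+i]
			n0, n1, n2, n3 := lo&15, lo>>4, hi&15, hi>>4
			// x ^= mul(y): XOR the four nibble lookups into both product bytes.
			x[off+i] ^= table[0*16+n0] ^ table[1*16+n1] ^ table[2*16+n2] ^ table[3*16+n3]
			x[off+32+i] ^= table[4*16+n0] ^ table[5*16+n1] ^ table[6*16+n2] ^ table[7*16+n3]
		}
	}
}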
// leoMul256 returns mul((lo, hi), table) as (prodLo, prodHi); lo and hi are preserved.
func leoMul256(ctx gf16ctx, lo, hi reg.VecVirtual, table [4]table256) (prodLo, prodHi reg.VecVirtual) {
data0, data1 := YMM(), YMM()
VPSRLQ(U8(4), lo, data1) // data1 = lo >> 4
VPAND(ctx.clrMask, lo, data0) // data0 = lo & 0xf
VPAND(ctx.clrMask, data1, data1) // data1 = data1 & 0xf
prodLo, prodHi = YMM(), YMM()
table[0].prepare()
VPSHUFB(data0, table[0].Lo, prodLo)
VPSHUFB(data0, table[0].Hi, prodHi)
tmpLo, tmpHi := YMM(), YMM()
table[1].prepare()
VPSHUFB(data1, table[1].Lo, tmpLo)
VPSHUFB(data1, table[1].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
// Now process high
data0, data1 = YMM(), YMM() // Realloc to break dep
VPAND(hi, ctx.clrMask, data0)
VPSRLQ(U8(4), hi, data1)
VPAND(ctx.clrMask, data1, data1)
tmpLo, tmpHi = YMM(), YMM() // Realloc to break dep
table[2].prepare()
VPSHUFB(data0, table[2].Lo, tmpLo)
VPSHUFB(data0, table[2].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
table[3].prepare()
VPSHUFB(data1, table[3].Lo, tmpLo)
VPSHUFB(data1, table[3].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
return
}
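// leoMulAdd128 computes xLo/xHi ^= mul(yLo/yHi, table) on 128-bit lanes.
// The x registers are updated; the y registers are preserved.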
func leoMulAdd128(ctx gf16ctx, xLo, xHi, yLo, yHi reg.VecVirtual, table [4]table128) {
prodLo, prodHi := leoMul128(ctx, yLo, yHi, table)
PXOR(prodLo, xLo)
PXOR(prodHi, xHi)
}
// leoMul128 returns mul((lo, hi), table) as (prodLo, prodHi); lo and hi are preserved (at the cost of extra registers for the copies).
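// Unlike the VEX-encoded AVX code above, the SSE instructions are destructive
// two-operand forms, so the inputs are copied with MOVAPS and each table is
// copied with MOVUPS before PSHUFB overwrites it.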
func leoMul128(ctx gf16ctx, lo, hi reg.VecVirtual, table [4]table128) (prodLo, prodHi reg.VecVirtual) {
data0, data1 := XMM(), XMM()
MOVAPS(lo, data1)
PSRLQ(U8(4), data1) // data1 = lo >> 4
MOVAPS(lo, data0)
PAND(ctx.clrMask128, data0) // data0 = lo & 0xf
PAND(ctx.clrMask128, data1) // data1 = data1 & 0xf
prodLo, prodHi = XMM(), XMM()
MOVUPS(table[0].Lo, prodLo)
MOVUPS(table[0].Hi, prodHi)
PSHUFB(data0, prodLo)
PSHUFB(data0, prodHi)
tmpLo, tmpHi := XMM(), XMM()
MOVUPS(table[1].Lo, tmpLo)
MOVUPS(table[1].Hi, tmpHi)
PSHUFB(data1, tmpLo)
PSHUFB(data1, tmpHi)
PXOR(tmpLo, prodLo)
PXOR(tmpHi, prodHi)
// Now process high
data0, data1 = XMM(), XMM() // Realloc to break dep
MOVAPS(hi, data0)
MOVAPS(hi, data1)
PAND(ctx.clrMask128, data0)
PSRLQ(U8(4), data1)
PAND(ctx.clrMask128, data1)
tmpLo, tmpHi = XMM(), XMM() // Realloc to break dep
MOVUPS(table[2].Lo, tmpLo)
MOVUPS(table[2].Hi, tmpHi)
PSHUFB(data0, tmpLo)
PSHUFB(data0, tmpHi)
PXOR(tmpLo, prodLo)
PXOR(tmpHi, prodHi)
MOVUPS(table[3].Lo, tmpLo)
MOVUPS(table[3].Hi, tmpHi)
PSHUFB(data1, tmpLo)
PSHUFB(data1, tmpHi)
PXOR(tmpLo, prodLo)
PXOR(tmpHi, prodHi)
return
}