master
Klaus Post 2020-05-06 12:36:43 +02:00
parent 1b9e129671
commit 3067f8aed5
2 changed files with 162 additions and 165 deletions


@@ -3,33 +3,32 @@
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2019, Minio, Inc.
#define LOAD(OFFSET) \
MOVQ OFFSET(SI), BX \
VMOVDQU64 (BX)(R11*1), Z0 \
VPSRLQ $4, Z0, Z1 \ // high input
VPANDQ Z2, Z0, Z0 \ // low input
VPANDQ Z2, Z1, Z1 // high input
#define GALOIS(C1, C2, IN, LO, HI, OUT) \
VSHUFI64X2 $C1, IN, IN, LO \
VSHUFI64X2 $C2, IN, IN, HI \
VPSHUFB Z0, LO, LO \ // mul low part
VPSHUFB Z1, HI, HI \ // mul high part
VPTERNLOGD $0x96, LO, HI, OUT
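The two macros implement the standard nibble-table multiply in GF(2^8): LOAD splits each of 64 input bytes into its low and high 4-bit halves, GALOIS looks each half up in a 16-entry table for the selected coefficient (VSHUFI64X2 broadcasts the table lane, VPSHUFB does the 64 parallel lookups), and VPTERNLOGD with immediate 0x96 is a three-input XOR that folds both partial products into the accumulator. A minimal scalar sketch of the same step in Go (the function name is illustrative, not the library's):

// mulNibbleTables multiplies every byte of in by one GF(2^8) coefficient and
// XORs the product into out; low and high are that coefficient's 16-entry
// lookup tables for the low and high input nibbles.
func mulNibbleTables(low, high *[16]byte, in, out []byte) {
	for i, b := range in {
		out[i] ^= low[b&0xf] ^ high[b>>4] // out ^= lo ^ hi, cf. VPTERNLOGD $0x96
	}
}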
//
// Process single output row from a total of 8 input rows
//
// func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
TEXT ·_galMulAVX512Parallel81(SB), 7, $0
MOVQ in+0(FP), SI
MOVQ 8(SI), R9 // R9: len(in)
SHRQ $6, R9 // len(in) / 64
TESTQ R9, R9
JZ done_avx512_parallel81
MOVQ matrix+48(FP), SI
VMOVDQU64 0x000(SI), Z16
VMOVDQU64 0x040(SI), Z17
VMOVDQU64 0x080(SI), Z18
@@ -38,61 +37,61 @@ TEXT ·_galMulAVX512Parallel81(SB), 7, $0
MOVQ $15, BX
VPBROADCASTB BX, Z2
MOVB addTo+56(FP), AX
IMULQ $-0x1, AX
KMOVQ AX, K1
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), AX // number of inputs
XORQ R11, R11
MOVQ out+24(FP), DX
MOVQ (DX), DX // DX: &out[0][0]
loopback_avx512_parallel81:
VMOVDQU64.Z (DX), K1, Z4
LOAD(0x00) // &in[0][0]
GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4)
CMPQ AX, $1
JE skip_avx512_parallel81
LOAD(0x18) // &in[1][0]
GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4)
CMPQ AX, $2
JE skip_avx512_parallel81
LOAD(0x30) // &in[2][0]
GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4)
CMPQ AX, $3
JE skip_avx512_parallel81
LOAD(0x48) // &in[3][0]
GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4)
CMPQ AX, $4
JE skip_avx512_parallel81
LOAD(0x60) // &in[4][0]
GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4)
CMPQ AX, $5
JE skip_avx512_parallel81
LOAD(0x78) // &in[5][0]
GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4)
CMPQ AX, $6
JE skip_avx512_parallel81
LOAD(0x90) // &in[6][0]
GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4)
CMPQ AX, $7
JE skip_avx512_parallel81
LOAD(0xa8) // &in[7][0]
GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4)
skip_avx512_parallel81:
@@ -100,7 +99,7 @@ skip_avx512_parallel81:
ADDQ $64, R11 // in4+=64
ADDQ $64, DX // out+=64
SUBQ $1, R9
JNZ loopback_avx512_parallel81
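One pass of loopback_avx512_parallel81 therefore yields 64 bytes of a single output row as the GF(2^8) dot product of up to eight input rows with one row of coefficients; the K1-masked load implements addTo by zeroing Z4 when the existing output should be discarded. A scalar model of the kernel, ignoring the 64-byte blocking (gfMul stands in for the nibble-table multiply sketched above):

// galMulParallel81Ref is a scalar model of _galMulAVX512Parallel81.
func galMulParallel81Ref(in [][]byte, out, coeffs []byte, addTo bool, gfMul func(c, x byte) byte) {
	for i := range out {
		var acc byte
		if addTo {
			acc = out[i] // keep the old output, as the masked load of Z4 does
		}
		for j := range in {
			acc ^= gfMul(coeffs[j], in[j][i]) // GF(2^8) multiply-accumulate
		}
		out[i] = acc
	}
}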
@@ -114,13 +113,13 @@ done_avx512_parallel81:
//
// func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
TEXT ·_galMulAVX512Parallel82(SB), 7, $0
MOVQ in+0(FP), SI
MOVQ 8(SI), R9 // R9: len(in)
SHRQ $6, R9 // len(in) / 64
TESTQ R9, R9
JZ done_avx512_parallel82
MOVQ matrix+48(FP), SI
VMOVDQU64 0x000(SI), Z16
VMOVDQU64 0x040(SI), Z17
VMOVDQU64 0x080(SI), Z18
@@ -133,70 +132,70 @@ TEXT ·_galMulAVX512Parallel82(SB), 7, $0
MOVQ $15, BX
VPBROADCASTB BX, Z2
MOVB addTo+56(FP), AX
IMULQ $-0x1, AX
KMOVQ AX, K1
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), AX // number of inputs
XORQ R11, R11
MOVQ out+24(FP), DX
MOVQ 24(DX), CX // CX: &out[1][0]
MOVQ (DX), DX // DX: &out[0][0]
loopback_avx512_parallel82:
VMOVDQU64.Z (DX), K1, Z4
VMOVDQU64.Z (CX), K1, Z5
LOAD(0x00) // &in[0][0]
GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4)
GALOIS(0x00, 0x55, Z20, Z12, Z13, Z5)
CMPQ AX, $1
JE skip_avx512_parallel82
LOAD(0x18) // &in[1][0]
GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4)
GALOIS(0xaa, 0xff, Z20, Z12, Z13, Z5)
CMPQ AX, $2
JE skip_avx512_parallel82
LOAD(0x30) // &in[2][0]
GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4)
GALOIS(0x00, 0x55, Z21, Z12, Z13, Z5)
CMPQ AX, $3
JE skip_avx512_parallel82
LOAD(0x48) // &in[3][0]
GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4)
GALOIS(0xaa, 0xff, Z21, Z12, Z13, Z5)
CMPQ AX, $4
JE skip_avx512_parallel82
LOAD(0x60) // &in[4][0]
GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4)
GALOIS(0x00, 0x55, Z22, Z12, Z13, Z5)
CMPQ AX, $5
JE skip_avx512_parallel82
LOAD(0x78) // &in[5][0]
GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4)
GALOIS(0xaa, 0xff, Z22, Z12, Z13, Z5)
CMPQ AX, $6
JE skip_avx512_parallel82
LOAD(0x90) // &in[6][0]
GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4)
GALOIS(0x00, 0x55, Z23, Z12, Z13, Z5)
CMPQ AX, $7
JE skip_avx512_parallel82
LOAD(0xa8) // &in[7][0]
GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4)
GALOIS(0xaa, 0xff, Z23, Z12, Z13, Z5)
@@ -206,8 +205,8 @@ skip_avx512_parallel82:
ADDQ $64, R11 // in4+=64
ADDQ $64, DX // out+=64
ADDQ $64, CX // out2+=64
SUBQ $1, R9
JNZ loopback_avx512_parallel82
@@ -221,13 +220,13 @@ done_avx512_parallel82:
//
// func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
TEXT ·_galMulAVX512Parallel84(SB), 7, $0
MOVQ in+0(FP), SI
MOVQ 8(SI), R9 // R9: len(in)
SHRQ $6, R9 // len(in) / 64
TESTQ R9, R9
JZ done_avx512_parallel84
MOVQ matrix+48(FP), SI
VMOVDQU64 0x000(SI), Z16
VMOVDQU64 0x040(SI), Z17
VMOVDQU64 0x080(SI), Z18
@@ -248,17 +247,17 @@ TEXT ·_galMulAVX512Parallel84(SB), 7, $0
MOVQ $15, BX
VPBROADCASTB BX, Z2
MOVB addTo+56(FP), AX
IMULQ $-0x1, AX
KMOVQ AX, K1
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), AX // number of inputs
XORQ R11, R11
MOVQ out+24(FP), DX
MOVQ 24(DX), CX // CX: &out[1][0]
MOVQ 48(DX), R10 // R10: &out[2][0]
MOVQ 72(DX), R12 // R12: &out[3][0]
MOVQ (DX), DX // DX: &out[0][0]
loopback_avx512_parallel84:
VMOVDQU64.Z (DX), K1, Z4
@@ -266,70 +265,70 @@ loopback_avx512_parallel84:
VMOVDQU64.Z (R10), K1, Z6
VMOVDQU64.Z (R12), K1, Z7
LOAD(0x00) // &in[0][0]
GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4)
GALOIS(0x00, 0x55, Z20, Z12, Z13, Z5)
GALOIS(0x00, 0x55, Z24, Z10, Z11, Z6)
GALOIS(0x00, 0x55, Z28, Z8, Z9, Z7)
CMPQ AX, $1
JE skip_avx512_parallel84
LOAD(0x18) // &in[1][0]
GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4)
GALOIS(0xaa, 0xff, Z20, Z12, Z13, Z5)
GALOIS(0xaa, 0xff, Z24, Z10, Z11, Z6)
GALOIS(0xaa, 0xff, Z28, Z8, Z9, Z7)
CMPQ AX, $2
JE skip_avx512_parallel84
LOAD(0x30) // &in[2][0]
GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4)
GALOIS(0x00, 0x55, Z21, Z12, Z13, Z5)
GALOIS(0x00, 0x55, Z25, Z10, Z11, Z6)
GALOIS(0x00, 0x55, Z29, Z8, Z9, Z7)
CMPQ AX, $3
JE skip_avx512_parallel84
LOAD(0x48) // &in[3][0]
GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4)
GALOIS(0xaa, 0xff, Z21, Z12, Z13, Z5)
GALOIS(0xaa, 0xff, Z25, Z10, Z11, Z6)
GALOIS(0xaa, 0xff, Z29, Z8, Z9, Z7)
CMPQ AX, $4
JE skip_avx512_parallel84
LOAD(0x60) // &in[4][0]
GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4)
GALOIS(0x00, 0x55, Z22, Z12, Z13, Z5)
GALOIS(0x00, 0x55, Z26, Z10, Z11, Z6)
GALOIS(0x00, 0x55, Z30, Z8, Z9, Z7)
CMPQ AX, $5
JE skip_avx512_parallel84
LOAD(0x78) // &in[5][0]
GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4)
GALOIS(0xaa, 0xff, Z22, Z12, Z13, Z5)
GALOIS(0xaa, 0xff, Z26, Z10, Z11, Z6)
GALOIS(0xaa, 0xff, Z30, Z8, Z9, Z7)
CMPQ AX, $6
JE skip_avx512_parallel84
LOAD(0x90) // &in[6][0]
GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4)
GALOIS(0x00, 0x55, Z23, Z12, Z13, Z5)
GALOIS(0x00, 0x55, Z27, Z10, Z11, Z6)
GALOIS(0x00, 0x55, Z31, Z8, Z9, Z7)
CMPQ AX, $7
JE skip_avx512_parallel84
LOAD(0xa8) // &in[7][0]
GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4)
GALOIS(0xaa, 0xff, Z23, Z12, Z13, Z5)
GALOIS(0xaa, 0xff, Z27, Z10, Z11, Z6)
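The 8x2 and 8x4 kernels follow the same pattern with two (Z4/Z5) or four (Z4..Z7) output accumulators and a larger coefficient matrix spread over Z16..Z31, so each 64-byte input block is loaded once and reused for every output row. A scalar model of the 8x4 case under the same assumptions (gfMul again stands in for the table multiply):

// galMulParallel84Ref is a scalar model of _galMulAVX512Parallel84: four
// output rows are accumulated while each input byte is read only once.
func galMulParallel84Ref(in [][]byte, out, coeffs [4][]byte, addTo bool, gfMul func(c, x byte) byte) {
	for i := range out[0] {
		var acc [4]byte
		if addTo {
			for r := range acc {
				acc[r] = out[r][i]
			}
		}
		for j := range in {
			x := in[j][i] // one load, reused for all four output rows
			for r := range acc {
				acc[r] ^= gfMul(coeffs[r][j], x)
			}
		}
		for r := range acc {
			out[r][i] = acc[r]
		}
	}
}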


@@ -32,89 +32,87 @@
#define FLIP VS41
#define FLIP_ V9
// func galMulPpc(low, high, in, out []byte)
TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96
MOVD low+0(FP), LOW
MOVD high+24(FP), HIGH
MOVD in+48(FP), IN
MOVD in_len+56(FP), LEN
MOVD out+72(FP), OUT
MOVD $16, OFFSET1
MOVD $32, OFFSET2
MOVD $·constants(SB), CONSTANTS
LXVD2X (CONSTANTS)(R0), ROTATE
LXVD2X (CONSTANTS)(OFFSET1), MASK
LXVD2X (CONSTANTS)(OFFSET2), FLIP
LXVD2X (LOW)(R0), X6
LXVD2X (HIGH)(R0), X7
VPERM X6_, V31, FLIP_, X6_
VPERM X7_, V31, FLIP_, X7_
MOVD $0, OFFSET
loop:
LXVD2X (IN)(OFFSET), MSG
VSRB MSG_, ROTATE_, MSG_HI_
VAND MSG_, MASK_, MSG_
VPERM X6_, V31, MSG_, MSG_
VPERM X7_, V31, MSG_HI_, MSG_HI_
VXOR MSG_, MSG_HI_, MSG_
STXVD2X MSG, (OUT)(OFFSET)
ADD $16, OFFSET, OFFSET
CMP LEN, OFFSET
BGT loop
RET
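Here low and high hold the 16-byte nibble tables for a single coefficient; VSRB extracts the high nibbles, VAND masks out the low ones, the two VPERMs perform the table lookups, and VXOR combines the partial products, 16 bytes per VSX iteration. A scalar model of galMulPpc, ignoring the 16-byte blocking (the name is illustrative):

// galMulPpcRef is a scalar model of galMulPpc: low and high are the 16-byte
// nibble lookup tables of one GF(2^8) coefficient.
func galMulPpcRef(low, high, in, out []byte) {
	for i, b := range in {
		out[i] = low[b&0xf] ^ high[b>>4]
	}
}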
// func galMulPpcXor(low, high, in, out []byte)
TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96
MOVD low+0(FP), LOW
MOVD high+24(FP), HIGH
MOVD in+48(FP), IN
MOVD in_len+56(FP), LEN
MOVD out+72(FP), OUT
MOVD $16, OFFSET1
MOVD $32, OFFSET2
MOVD $·constants(SB), CONSTANTS
LXVD2X (CONSTANTS)(R0), ROTATE
LXVD2X (CONSTANTS)(OFFSET1), MASK
LXVD2X (CONSTANTS)(OFFSET2), FLIP
LXVD2X (LOW)(R0), X6
LXVD2X (HIGH)(R0), X7
VPERM X6_, V31, FLIP_, X6_
VPERM X7_, V31, FLIP_, X7_
MOVD $0, OFFSET
loopXor:
LXVD2X (IN)(OFFSET), MSG
LXVD2X (OUT)(OFFSET), RESULT
VSRB MSG_, ROTATE_, MSG_HI_
VAND MSG_, MASK_, MSG_
VPERM X6_, V31, MSG_, MSG_
VPERM X7_, V31, MSG_HI_, MSG_HI_
VXOR MSG_, MSG_HI_, MSG_
VXOR MSG_, RESULT_, RESULT_
STXVD2X RESULT, (OUT)(OFFSET)
ADD $16, OFFSET, OFFSET
CMP LEN, OFFSET
BGT loopXor
RET
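galMulPpcXor differs only in loading the destination into RESULT and XORing the product into it before the store, so the scalar model accumulates instead of overwriting:

// galMulPpcXorRef is a scalar model of galMulPpcXor.
func galMulPpcXorRef(low, high, in, out []byte) {
	for i, b := range in {
		out[i] ^= low[b&0xf] ^ high[b>>4]
	}
}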
DATA ·constants+0x0(SB)/8, $0x0404040404040404
DATA ·constants+0x8(SB)/8, $0x0404040404040404