asmfmt assembler.
parent
75cae61a5b
commit
a3ee8967cb
267
galois_amd64.s
267
galois_amd64.s
|
@@ -7,172 +7,177 @@
|
||||||
|
|
||||||
// func galMulSSSE3Xor(low, high, in, out []byte)
|
// func galMulSSSE3Xor(low, high, in, out []byte)
|
||||||
TEXT ·galMulSSSE3Xor(SB), 7, $0 // out[i] ^= low[in[i]&15] ^ high[in[i]>>4], 16 bytes per pass; the len(in)%16 tail is not touched here
|
TEXT ·galMulSSSE3Xor(SB), 7, $0 // out[i] ^= low[in[i]&15] ^ high[in[i]>>4], 16 bytes per pass; the len(in)%16 tail is not touched here
|
||||||
MOVQ low+0(FP),SI // SI: &low
|
MOVQ low+0(FP), SI // SI: &low
|
||||||
MOVQ high+24(FP),DX // DX: &high
|
MOVQ high+24(FP), DX // DX: &high
|
||||||
MOVOU (SI), X6 // X6: low (16-byte lookup table)
|
MOVOU (SI), X6 // X6: low (16-byte lookup table)
|
||||||
MOVOU (DX), X7 // X7: high (16-byte lookup table)
|
MOVOU (DX), X7 // X7: high (16-byte lookup table)
|
||||||
MOVQ $15, BX // BX: low mask
|
MOVQ $15, BX // BX: low mask
|
||||||
MOVQ BX, X8 // X8: 0x0f in byte 0 only
|
MOVQ BX, X8 // X8: 0x0f in byte 0 only
|
||||||
PXOR X5, X5 // X5: all zero, used as the shuffle index below
|
PXOR X5, X5 // X5: all zero, used as the shuffle index below
|
||||||
MOVQ in+48(FP),SI // SI: &in (SI is reused; low is already in X6)
|
MOVQ in+48(FP), SI // SI: &in (SI is reused; low is already in X6)
|
||||||
MOVQ in_len+56(FP),R9 // R9: len(in)
|
MOVQ in_len+56(FP), R9 // R9: len(in)
|
||||||
MOVQ out+72(FP), DX // DX: &out (DX is reused; high is already in X7)
|
MOVQ out+72(FP), DX // DX: &out (DX is reused; high is already in X7)
|
||||||
PSHUFB X5, X8 // X8: lomask (unpacked): byte 0 copied to all 16 bytes
|
PSHUFB X5, X8 // X8: lomask (unpacked): byte 0 copied to all 16 bytes
|
||||||
SHRQ $4, R9 // R9: len(in) / 16 = number of full 16-byte blocks
|
SHRQ $4, R9 // R9: len(in) / 16 = number of full 16-byte blocks
|
||||||
CMPQ R9 ,$0
|
CMPQ R9, $0
|
||||||
JEQ done_xor // no full block: nothing to do
|
JEQ done_xor // no full block: nothing to do
|
||||||
|
|
||||||
loopback_xor:
|
loopback_xor:
|
||||||
MOVOU (SI),X0 // in[x]
|
MOVOU (SI), X0 // in[x]
|
||||||
MOVOU (DX),X4 // out[x]
|
MOVOU (DX), X4 // out[x]
|
||||||
MOVOU X0, X1 // in[x]
|
MOVOU X0, X1 // in[x]
|
||||||
MOVOU X6, X2 // low copy
|
MOVOU X6, X2 // low copy
|
||||||
MOVOU X7, X3 // high copy
|
MOVOU X7, X3 // high copy
|
||||||
PSRLQ $4, X1 // X1: in[x] >> 4 (bits crossing byte boundaries are masked off below)
|
PSRLQ $4, X1 // X1: in[x] >> 4 (bits crossing byte boundaries are masked off below)
|
||||||
PAND X8, X0 // X0: low input (low nibble of each byte)
|
PAND X8, X0 // X0: low input (low nibble of each byte)
|
||||||
PAND X8, X1 // X1: high input (high nibble of each byte)
|
PAND X8, X1 // X1: high input (high nibble of each byte)
|
||||||
PSHUFB X0, X2 // X2: mul low part (low table indexed by low nibbles)
|
PSHUFB X0, X2 // X2: mul low part (low table indexed by low nibbles)
|
||||||
PSHUFB X1, X3 // X3: mul high part (high table indexed by high nibbles)
|
PSHUFB X1, X3 // X3: mul high part (high table indexed by high nibbles)
|
||||||
PXOR X2, X3 // X3: Result
|
PXOR X2, X3 // X3: Result
|
||||||
PXOR X4, X3 // X3: Result xor existing out
|
PXOR X4, X3 // X3: Result xor existing out
|
||||||
MOVOU X3, (DX) // Store
|
MOVOU X3, (DX) // Store
|
||||||
ADDQ $16, SI // in+=16
|
ADDQ $16, SI // in+=16
|
||||||
ADDQ $16, DX // out+=16
|
ADDQ $16, DX // out+=16
|
||||||
SUBQ $1, R9 // one block done
|
SUBQ $1, R9 // one block done
|
||||||
JNZ loopback_xor // repeat while blocks remain
|
JNZ loopback_xor // repeat while blocks remain
|
||||||
|
|
||||||
done_xor:
|
done_xor:
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// func galMulSSSE3(low, high, in, out []byte)
|
// func galMulSSSE3(low, high, in, out []byte)
|
||||||
TEXT ·galMulSSSE3(SB), 7, $0 // out[i] = low[in[i]&15] ^ high[in[i]>>4], 16 bytes per pass; the len(in)%16 tail is not touched here
|
TEXT ·galMulSSSE3(SB), 7, $0 // out[i] = low[in[i]&15] ^ high[in[i]>>4], 16 bytes per pass; the len(in)%16 tail is not touched here
|
||||||
MOVQ low+0(FP),SI // SI: &low
|
MOVQ low+0(FP), SI // SI: &low
|
||||||
MOVQ high+24(FP),DX // DX: &high
|
MOVQ high+24(FP), DX // DX: &high
|
||||||
MOVOU (SI), X6 // X6: low (16-byte lookup table)
|
MOVOU (SI), X6 // X6: low (16-byte lookup table)
|
||||||
MOVOU (DX), X7 // X7: high (16-byte lookup table)
|
MOVOU (DX), X7 // X7: high (16-byte lookup table)
|
||||||
MOVQ $15, BX // BX: low mask
|
MOVQ $15, BX // BX: low mask
|
||||||
MOVQ BX, X8 // X8: 0x0f in byte 0 only
|
MOVQ BX, X8 // X8: 0x0f in byte 0 only
|
||||||
PXOR X5, X5 // X5: all zero, used as the shuffle index below
|
PXOR X5, X5 // X5: all zero, used as the shuffle index below
|
||||||
MOVQ in+48(FP),SI // SI: &in (SI is reused; low is already in X6)
|
MOVQ in+48(FP), SI // SI: &in (SI is reused; low is already in X6)
|
||||||
MOVQ in_len+56(FP),R9 // R9: len(in)
|
MOVQ in_len+56(FP), R9 // R9: len(in)
|
||||||
MOVQ out+72(FP), DX // DX: &out (DX is reused; high is already in X7)
|
MOVQ out+72(FP), DX // DX: &out (DX is reused; high is already in X7)
|
||||||
PSHUFB X5, X8 // X8: lomask (unpacked): byte 0 copied to all 16 bytes
|
PSHUFB X5, X8 // X8: lomask (unpacked): byte 0 copied to all 16 bytes
|
||||||
SHRQ $4, R9 // R9: len(in) / 16 = number of full 16-byte blocks
|
SHRQ $4, R9 // R9: len(in) / 16 = number of full 16-byte blocks
|
||||||
CMPQ R9 ,$0
|
CMPQ R9, $0
|
||||||
JEQ done // no full block: nothing to do
|
JEQ done // no full block: nothing to do
|
||||||
|
|
||||||
loopback:
|
loopback:
|
||||||
MOVOU (SI),X0 // in[x]
|
MOVOU (SI), X0 // in[x]
|
||||||
MOVOU X0, X1 // in[x]
|
MOVOU X0, X1 // in[x]
|
||||||
MOVOU X6, X2 // low copy
|
MOVOU X6, X2 // low copy
|
||||||
MOVOU X7, X3 // high copy
|
MOVOU X7, X3 // high copy
|
||||||
PSRLQ $4, X1 // X1: in[x] >> 4 (bits crossing byte boundaries are masked off below)
|
PSRLQ $4, X1 // X1: in[x] >> 4 (bits crossing byte boundaries are masked off below)
|
||||||
PAND X8, X0 // X0: low input (low nibble of each byte)
|
PAND X8, X0 // X0: low input (low nibble of each byte)
|
||||||
PAND X8, X1 // X1: high input (high nibble of each byte)
|
PAND X8, X1 // X1: high input (high nibble of each byte)
|
||||||
PSHUFB X0, X2 // X2: mul low part (low table indexed by low nibbles)
|
PSHUFB X0, X2 // X2: mul low part (low table indexed by low nibbles)
|
||||||
PSHUFB X1, X3 // X3: mul high part (high table indexed by high nibbles)
|
PSHUFB X1, X3 // X3: mul high part (high table indexed by high nibbles)
|
||||||
PXOR X2, X3 // X3: Result
|
PXOR X2, X3 // X3: Result
|
||||||
MOVOU X3, (DX) // Store (overwrites out[x]; no xor with previous contents)
|
MOVOU X3, (DX) // Store (overwrites out[x]; no xor with previous contents)
|
||||||
ADDQ $16, SI // in+=16
|
ADDQ $16, SI // in+=16
|
||||||
ADDQ $16, DX // out+=16
|
ADDQ $16, DX // out+=16
|
||||||
SUBQ $1, R9 // one block done
|
SUBQ $1, R9 // one block done
|
||||||
JNZ loopback // repeat while blocks remain
|
JNZ loopback // repeat while blocks remain
|
||||||
|
|
||||||
done:
|
done:
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// func galMulAVX2Xor(low, high, in, out []byte)
|
// func galMulAVX2Xor(low, high, in, out []byte)
|
||||||
TEXT ·galMulAVX2Xor(SB), 7, $0 // out[i] ^= low[in[i]&15] ^ high[in[i]>>4], 32 bytes per pass; the len(in)%32 tail is not touched here
|
TEXT ·galMulAVX2Xor(SB), 7, $0 // out[i] ^= low[in[i]&15] ^ high[in[i]>>4], 32 bytes per pass; the len(in)%32 tail is not touched here
|
||||||
MOVQ low+0(FP),SI // SI: &low
|
MOVQ low+0(FP), SI // SI: &low
|
||||||
MOVQ high+24(FP),DX // DX: &high
|
MOVQ high+24(FP), DX // DX: &high
|
||||||
MOVQ $15, BX // BX: low mask
|
MOVQ $15, BX // BX: low mask
|
||||||
MOVQ BX, X5 // X5: 0x0f in byte 0 (broadcast source below)
|
MOVQ BX, X5 // X5: 0x0f in byte 0 (broadcast source below)
|
||||||
MOVOU (SI), X6 // X6: low (16-byte lookup table)
|
MOVOU (SI), X6 // X6: low (16-byte lookup table)
|
||||||
MOVOU (DX), X7 // X7: high (16-byte lookup table)
|
MOVOU (DX), X7 // X7: high (16-byte lookup table)
|
||||||
MOVQ in_len+56(FP),R9 // R9: len(in)
|
MOVQ in_len+56(FP), R9 // R9: len(in)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
YASM:
|
YASM:
|
||||||
|
|
||||||
VINSERTI128 YMM6, YMM6, XMM6, 1 ; low (table copied into the upper 128-bit lane)
|
VINSERTI128 YMM6, YMM6, XMM6, 1 ; low (table copied into the upper 128-bit lane)
|
||||||
VINSERTI128 YMM7, YMM7, XMM7, 1 ; high (table copied into the upper 128-bit lane)
|
VINSERTI128 YMM7, YMM7, XMM7, 1 ; high (table copied into the upper 128-bit lane)
|
||||||
VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
||||||
*/
|
*/
|
||||||
|
BYTE $0xc4;BYTE $0xe3;BYTE $0x4d;BYTE $0x38;BYTE $0xf6;BYTE $0x01;BYTE $0xc4;BYTE $0xe3;BYTE $0x45;BYTE $0x38;BYTE $0xff;BYTE $0x01;BYTE $0xc4;BYTE $0x62;BYTE $0x7d;BYTE $0x78;BYTE $0xc5
|
||||||
|
|
||||||
BYTE $0xc4;BYTE $0xe3;BYTE $0x4d;BYTE $0x38;BYTE $0xf6;BYTE $0x01;BYTE $0xc4;BYTE $0xe3;BYTE $0x45;BYTE $0x38;BYTE $0xff;BYTE $0x01;BYTE $0xc4;BYTE $0x62;BYTE $0x7d;BYTE $0x78;BYTE $0xc5;
|
SHRQ $5, R9 // R9: len(in) / 32 = number of full 32-byte blocks
|
||||||
|
MOVQ out+72(FP), DX // DX: &out (DX is reused; high is already in X7)
|
||||||
|
MOVQ in+48(FP), SI // SI: &in (SI is reused; low is already in X6)
|
||||||
|
TESTQ R9, R9
|
||||||
|
JZ done_xor_avx2 // no full block: skip the loop
|
||||||
|
|
||||||
SHRQ $5, R9 // R9: len(in) / 32 = number of full 32-byte blocks
|
|
||||||
MOVQ out+72(FP), DX // DX: &out (DX is reused; high is already in X7)
|
|
||||||
MOVQ in+48(FP), SI // SI: &in (SI is reused; low is already in X6)
|
|
||||||
TESTQ R9, R9
|
|
||||||
JZ done_xor_avx2 // no full block: skip the loop
|
|
||||||
loopback_xor_avx2:
|
loopback_xor_avx2:
|
||||||
/* Yasm:
|
/* Yasm:
|
||||||
|
|
||||||
VMOVDQU YMM0, [rsi]
|
VMOVDQU YMM0, [rsi]
|
||||||
VMOVDQU YMM4, [rdx]
|
VMOVDQU YMM4, [rdx]
|
||||||
VPSRLQ YMM1, YMM0, 4 ; X1: high input
|
VPSRLQ YMM1, YMM0, 4 ; X1: high input
|
||||||
VPAND YMM0, YMM0, YMM8 ; X0: low input
|
VPAND YMM0, YMM0, YMM8 ; X0: low input
|
||||||
VPAND YMM1, YMM1, YMM8 ; X1: high input
|
VPAND YMM1, YMM1, YMM8 ; X1: high input
|
||||||
VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
|
VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
|
||||||
VPSHUFB YMM3, YMM7, YMM1 ; X3: mul high part
|
VPSHUFB YMM3, YMM7, YMM1 ; X3: mul high part
|
||||||
VPXOR YMM3, YMM2, YMM3 ; X3: Result
|
VPXOR YMM3, YMM2, YMM3 ; X3: Result
|
||||||
VPXOR YMM4, YMM3, YMM4 ; X4: Result xor existing out
|
VPXOR YMM4, YMM3, YMM4 ; X4: Result xor existing out
|
||||||
VMOVDQU [rdx], YMM4
|
VMOVDQU [rdx], YMM4
|
||||||
*/
|
*/
|
||||||
BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x06;BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x22;BYTE $0xc5;BYTE $0xf5;BYTE $0x73;BYTE $0xd0;BYTE $0x04;BYTE $0xc4;BYTE $0xc1;BYTE $0x7d;BYTE $0xdb;BYTE $0xc0;BYTE $0xc4;BYTE $0xc1;BYTE $0x75;BYTE $0xdb;BYTE $0xc8;BYTE $0xc4;BYTE $0xe2;BYTE $0x4d;BYTE $0x00;BYTE $0xd0;BYTE $0xc4;BYTE $0xe2;BYTE $0x45;BYTE $0x00;BYTE $0xd9;BYTE $0xc5;BYTE $0xed;BYTE $0xef;BYTE $0xdb;BYTE $0xc5;BYTE $0xe5;BYTE $0xef;BYTE $0xe4;BYTE $0xc5;BYTE $0xfe;BYTE $0x7f;BYTE $0x22;
|
BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x06;BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x22;BYTE $0xc5;BYTE $0xf5;BYTE $0x73;BYTE $0xd0;BYTE $0x04;BYTE $0xc4;BYTE $0xc1;BYTE $0x7d;BYTE $0xdb;BYTE $0xc0;BYTE $0xc4;BYTE $0xc1;BYTE $0x75;BYTE $0xdb;BYTE $0xc8;BYTE $0xc4;BYTE $0xe2;BYTE $0x4d;BYTE $0x00;BYTE $0xd0;BYTE $0xc4;BYTE $0xe2;BYTE $0x45;BYTE $0x00;BYTE $0xd9;BYTE $0xc5;BYTE $0xed;BYTE $0xef;BYTE $0xdb;BYTE $0xc5;BYTE $0xe5;BYTE $0xef;BYTE $0xe4;BYTE $0xc5;BYTE $0xfe;BYTE $0x7f;BYTE $0x22
|
||||||
|
|
||||||
|
ADDQ $32, SI // in+=32
|
||||||
|
ADDQ $32, DX // out+=32
|
||||||
|
SUBQ $1, R9 // one block done
|
||||||
|
JNZ loopback_xor_avx2 // repeat while blocks remain
|
||||||
|
|
||||||
ADDQ $32, SI // in+=32
|
|
||||||
ADDQ $32, DX // out+=32
|
|
||||||
SUBQ $1, R9 // one block done
|
|
||||||
JNZ loopback_xor_avx2 // repeat while blocks remain
|
|
||||||
done_xor_avx2:
|
done_xor_avx2:
|
||||||
// VZEROUPPER (zeroes the upper halves of the YMM registers before returning)
|
// VZEROUPPER (zeroes the upper halves of the YMM registers before returning)
|
||||||
BYTE $0xc5;BYTE $0xf8;BYTE $0x77;
|
BYTE $0xc5;BYTE $0xf8;BYTE $0x77
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// func galMulAVX2(low, high, in, out []byte)
|
// func galMulAVX2(low, high, in, out []byte)
|
||||||
TEXT ·galMulAVX2(SB), 7, $0 // out[i] = low[in[i]&15] ^ high[in[i]>>4], 32 bytes per pass; the len(in)%32 tail is not touched here
|
TEXT ·galMulAVX2(SB), 7, $0 // out[i] = low[in[i]&15] ^ high[in[i]>>4], 32 bytes per pass; the len(in)%32 tail is not touched here
|
||||||
MOVQ low+0(FP),SI // SI: &low
|
MOVQ low+0(FP), SI // SI: &low
|
||||||
MOVQ high+24(FP),DX // DX: &high
|
MOVQ high+24(FP), DX // DX: &high
|
||||||
MOVQ $15, BX // BX: low mask
|
MOVQ $15, BX // BX: low mask
|
||||||
MOVQ BX, X5 // X5: 0x0f in byte 0 (broadcast source below)
|
MOVQ BX, X5 // X5: 0x0f in byte 0 (broadcast source below)
|
||||||
MOVOU (SI), X6 // X6: low (16-byte lookup table)
|
MOVOU (SI), X6 // X6: low (16-byte lookup table)
|
||||||
MOVOU (DX), X7 // X7: high (16-byte lookup table)
|
MOVOU (DX), X7 // X7: high (16-byte lookup table)
|
||||||
MOVQ in_len+56(FP),R9 // R9: len(in)
|
MOVQ in_len+56(FP), R9 // R9: len(in)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
YASM:
|
YASM:
|
||||||
|
|
||||||
VINSERTI128 YMM6, YMM6, XMM6, 1 ; low (table copied into the upper 128-bit lane)
|
VINSERTI128 YMM6, YMM6, XMM6, 1 ; low (table copied into the upper 128-bit lane)
|
||||||
VINSERTI128 YMM7, YMM7, XMM7, 1 ; high (table copied into the upper 128-bit lane)
|
VINSERTI128 YMM7, YMM7, XMM7, 1 ; high (table copied into the upper 128-bit lane)
|
||||||
VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
||||||
*/
|
*/
|
||||||
|
BYTE $0xc4;BYTE $0xe3;BYTE $0x4d;BYTE $0x38;BYTE $0xf6;BYTE $0x01;BYTE $0xc4;BYTE $0xe3;BYTE $0x45;BYTE $0x38;BYTE $0xff;BYTE $0x01;BYTE $0xc4;BYTE $0x62;BYTE $0x7d;BYTE $0x78;BYTE $0xc5
|
||||||
|
|
||||||
BYTE $0xc4;BYTE $0xe3;BYTE $0x4d;BYTE $0x38;BYTE $0xf6;BYTE $0x01;BYTE $0xc4;BYTE $0xe3;BYTE $0x45;BYTE $0x38;BYTE $0xff;BYTE $0x01;BYTE $0xc4;BYTE $0x62;BYTE $0x7d;BYTE $0x78;BYTE $0xc5;
|
SHRQ $5, R9 // R9: len(in) / 32 = number of full 32-byte blocks
|
||||||
|
MOVQ out+72(FP), DX // DX: &out (DX is reused; high is already in X7)
|
||||||
|
MOVQ in+48(FP), SI // SI: &in (SI is reused; low is already in X6)
|
||||||
|
TESTQ R9, R9
|
||||||
|
JZ done_avx2 // no full block: skip the loop
|
||||||
|
|
||||||
SHRQ $5, R9 // R9: len(in) / 32 = number of full 32-byte blocks
|
|
||||||
MOVQ out+72(FP), DX // DX: &out (DX is reused; high is already in X7)
|
|
||||||
MOVQ in+48(FP), SI // SI: &in (SI is reused; low is already in X6)
|
|
||||||
TESTQ R9, R9
|
|
||||||
JZ done_avx2 // no full block: skip the loop
|
|
||||||
loopback_avx2:
|
loopback_avx2:
|
||||||
/* Yasm:
|
/* Yasm:
|
||||||
|
|
||||||
VMOVDQU YMM0, [rsi]
|
VMOVDQU YMM0, [rsi]
|
||||||
VPSRLQ YMM1, YMM0, 4 ; X1: high input
|
VPSRLQ YMM1, YMM0, 4 ; X1: high input
|
||||||
VPAND YMM0, YMM0, YMM8 ; X0: low input
|
VPAND YMM0, YMM0, YMM8 ; X0: low input
|
||||||
VPAND YMM1, YMM1, YMM8 ; X1: high input
|
VPAND YMM1, YMM1, YMM8 ; X1: high input
|
||||||
VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
|
VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
|
||||||
VPSHUFB YMM3, YMM7, YMM1 ; X3: mul high part
|
VPSHUFB YMM3, YMM7, YMM1 ; X3: mul high part
|
||||||
VPXOR YMM4, YMM2, YMM3 ; X4: Result
|
VPXOR YMM4, YMM2, YMM3 ; X4: Result
|
||||||
VMOVDQU [rdx], YMM4
|
VMOVDQU [rdx], YMM4
|
||||||
*/
|
*/
|
||||||
BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x06;BYTE $0xc5;BYTE $0xf5;BYTE $0x73;BYTE $0xd0;BYTE $0x04;BYTE $0xc4;BYTE $0xc1;BYTE $0x7d;BYTE $0xdb;BYTE $0xc0;BYTE $0xc4;BYTE $0xc1;BYTE $0x75;BYTE $0xdb;BYTE $0xc8;BYTE $0xc4;BYTE $0xe2;BYTE $0x4d;BYTE $0x00;BYTE $0xd0;BYTE $0xc4;BYTE $0xe2;BYTE $0x45;BYTE $0x00;BYTE $0xd9;BYTE $0xc5;BYTE $0xed;BYTE $0xef;BYTE $0xe3;BYTE $0xc5;BYTE $0xfe;BYTE $0x7f;BYTE $0x22;
|
BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x06;BYTE $0xc5;BYTE $0xf5;BYTE $0x73;BYTE $0xd0;BYTE $0x04;BYTE $0xc4;BYTE $0xc1;BYTE $0x7d;BYTE $0xdb;BYTE $0xc0;BYTE $0xc4;BYTE $0xc1;BYTE $0x75;BYTE $0xdb;BYTE $0xc8;BYTE $0xc4;BYTE $0xe2;BYTE $0x4d;BYTE $0x00;BYTE $0xd0;BYTE $0xc4;BYTE $0xe2;BYTE $0x45;BYTE $0x00;BYTE $0xd9;BYTE $0xc5;BYTE $0xed;BYTE $0xef;BYTE $0xe3;BYTE $0xc5;BYTE $0xfe;BYTE $0x7f;BYTE $0x22
|
||||||
|
|
||||||
ADDQ $32, SI // in+=32
|
ADDQ $32, SI // in+=32
|
||||||
ADDQ $32, DX // out+=32
|
ADDQ $32, DX // out+=32
|
||||||
SUBQ $1, R9 // one block done
|
SUBQ $1, R9 // one block done
|
||||||
JNZ loopback_avx2 // repeat while blocks remain
|
JNZ loopback_avx2 // repeat while blocks remain
|
||||||
JMP done_avx2 // NOTE(review): redundant, the label follows immediately
|
JMP done_avx2 // NOTE(review): redundant, the label follows immediately
|
||||||
|
|
||||||
done_avx2:
|
done_avx2:
|
||||||
// VZEROUPPER (zeroes the upper halves of the YMM registers before returning)
|
// VZEROUPPER (zeroes the upper halves of the YMM registers before returning)
|
||||||
BYTE $0xc5;BYTE $0xf8;BYTE $0x77;
|
BYTE $0xc5;BYTE $0xf8;BYTE $0x77
|
||||||
RET
|
RET
|
||||||
|
|
Loading…
Reference in New Issue