Add generated byte assembler using asm2plan9s
Add recompilable assembler using asm2plan9s
master
parent
cebd11778b
commit
467733eb9c
|
@ -186,6 +186,10 @@ Example of performance scaling on Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz - 4 ph
|
|||
| 4 | 3179,33 | 235% |
|
||||
| 8 | 4346,18 | 321% |
|
||||
|
||||
# asm2plan9s
|
||||
|
||||
[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used to assemble the AVX2 instructions into the equivalent BYTE/WORD/LONG directives for the Go (Plan 9) assembler.
|
||||
|
||||
# Links
|
||||
* [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
|
||||
* [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible java library by Backblaze.
|
||||
|
|
|
@ -91,14 +91,9 @@ TEXT ·galMulAVX2Xor(SB), 7, $0
|
|||
MOVOU (DX), X7 // X7: high
|
||||
MOVQ in_len+56(FP), R9 // R9: len(in)
|
||||
|
||||
/*
|
||||
YASM:
|
||||
|
||||
VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
|
||||
VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
|
||||
VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
||||
*/
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0x4d; BYTE $0x38; BYTE $0xf6; BYTE $0x01; BYTE $0xc4; BYTE $0xe3; BYTE $0x45; BYTE $0x38; BYTE $0xff; BYTE $0x01; BYTE $0xc4; BYTE $0x62; BYTE $0x7d; BYTE $0x78; BYTE $0xc5
|
||||
LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
|
||||
LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
|
||||
LONG $0x787d62c4; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
||||
|
||||
SHRQ $5, R9 // len(in) /32
|
||||
MOVQ out+72(FP), DX // DX: &out
|
||||
|
@ -107,20 +102,16 @@ VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
|||
JZ done_xor_avx2
|
||||
|
||||
loopback_xor_avx2:
|
||||
/* Yasm:
|
||||
|
||||
VMOVDQU YMM0, [rsi]
|
||||
VMOVDQU YMM4, [rdx]
|
||||
VPSRLQ YMM1, YMM0, 4 ; X1: high input
|
||||
VPAND YMM0, YMM0, YMM8 ; X0: low input
|
||||
VPAND YMM1, YMM1, YMM8 ; X1: high input
|
||||
VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
|
||||
VPSHUFB YMM3, YMM7, YMM1 ; X3: mul high part
|
||||
VPXOR YMM3, YMM2, YMM3 ; X3: Result
|
||||
VPXOR YMM4, YMM3, YMM4 ; X4: Result
|
||||
VMOVDQU [rdx], YMM4
|
||||
*/
|
||||
BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06; BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x22; BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd0; BYTE $0x04; BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xdb; BYTE $0xc0; BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xdb; BYTE $0xc8; BYTE $0xc4; BYTE $0xe2; BYTE $0x4d; BYTE $0x00; BYTE $0xd0; BYTE $0xc4; BYTE $0xe2; BYTE $0x45; BYTE $0x00; BYTE $0xd9; BYTE $0xc5; BYTE $0xed; BYTE $0xef; BYTE $0xdb; BYTE $0xc5; BYTE $0xe5; BYTE $0xef; BYTE $0xe4; BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x22
|
||||
LONG $0x066ffec5 // VMOVDQU YMM0, [rsi]
|
||||
LONG $0x226ffec5 // VMOVDQU YMM4, [rdx]
|
||||
LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4 ; X1: high input
|
||||
LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8 ; X0: low input
|
||||
LONG $0xdb75c1c4; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8 ; X1: high input
|
||||
LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
|
||||
LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1 ; X3: mul high part
|
||||
LONG $0xdbefedc5 // VPXOR YMM3, YMM2, YMM3 ; X3: Result
|
||||
LONG $0xe4efe5c5 // VPXOR YMM4, YMM3, YMM4 ; X4: Result
|
||||
LONG $0x227ffec5 // VMOVDQU [rdx], YMM4
|
||||
|
||||
ADDQ $32, SI // in+=32
|
||||
ADDQ $32, DX // out+=32
|
||||
|
@ -142,14 +133,9 @@ TEXT ·galMulAVX2(SB), 7, $0
|
|||
MOVOU (DX), X7 // X7: high
|
||||
MOVQ in_len+56(FP), R9 // R9: len(in)
|
||||
|
||||
/*
|
||||
YASM:
|
||||
|
||||
VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
|
||||
VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
|
||||
VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
||||
*/
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0x4d; BYTE $0x38; BYTE $0xf6; BYTE $0x01; BYTE $0xc4; BYTE $0xe3; BYTE $0x45; BYTE $0x38; BYTE $0xff; BYTE $0x01; BYTE $0xc4; BYTE $0x62; BYTE $0x7d; BYTE $0x78; BYTE $0xc5
|
||||
LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
|
||||
LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
|
||||
LONG $0x787d62c4; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
||||
|
||||
SHRQ $5, R9 // len(in) /32
|
||||
MOVQ out+72(FP), DX // DX: &out
|
||||
|
@ -158,18 +144,14 @@ VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
|
|||
JZ done_avx2
|
||||
|
||||
loopback_avx2:
|
||||
/* Yasm:
|
||||
|
||||
VMOVDQU YMM0, [rsi]
|
||||
VPSRLQ YMM1, YMM0, 4 ; X1: high input
|
||||
VPAND YMM0, YMM0, YMM8 ; X0: low input
|
||||
VPAND YMM1, YMM1, YMM8 ; X1: high input
|
||||
VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
|
||||
VPSHUFB YMM3, YMM7, YMM1 ; X3: mul high part
|
||||
VPXOR YMM4, YMM2, YMM3 ; X4: Result
|
||||
VMOVDQU [rdx], YMM4
|
||||
*/
|
||||
BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06; BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd0; BYTE $0x04; BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xdb; BYTE $0xc0; BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xdb; BYTE $0xc8; BYTE $0xc4; BYTE $0xe2; BYTE $0x4d; BYTE $0x00; BYTE $0xd0; BYTE $0xc4; BYTE $0xe2; BYTE $0x45; BYTE $0x00; BYTE $0xd9; BYTE $0xc5; BYTE $0xed; BYTE $0xef; BYTE $0xe3; BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x22
|
||||
LONG $0x066ffec5 // VMOVDQU YMM0, [rsi]
|
||||
LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4 ; X1: high input
|
||||
LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8 ; X0: low input
|
||||
LONG $0xdb75c1c4; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8 ; X1: high input
|
||||
LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
|
||||
LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1 ; X3: mul high part
|
||||
LONG $0xe3efedc5 // VPXOR YMM4, YMM2, YMM3 ; X4: Result
|
||||
LONG $0x227ffec5 // VMOVDQU [rdx], YMM4
|
||||
|
||||
ADDQ $32, SI // in+=32
|
||||
ADDQ $32, DX // out+=32
|
||||
|
@ -177,6 +159,6 @@ VMOVDQU [rdx], YMM4
|
|||
JNZ loopback_avx2
|
||||
|
||||
done_avx2:
|
||||
// VZEROUPPER
|
||||
BYTE $0xc5; BYTE $0xf8; BYTE $0x77
|
||||
|
||||
BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
|
||||
RET
|
||||
|
|
Loading…
Reference in New Issue