diff --git a/galois_amd64.s b/galois_amd64.s
index e4b8815..8a34bcd 100644
--- a/galois_amd64.s
+++ b/galois_amd64.s
@@ -98,7 +98,7 @@ VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
 VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
 VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
 */
-	BYTE $0xc4;BYTE $0xe3;BYTE $0x4d;BYTE $0x38;BYTE $0xf6;BYTE $0x01;BYTE $0xc4;BYTE $0xe3;BYTE $0x45;BYTE $0x38;BYTE $0xff;BYTE $0x01;BYTE $0xc4;BYTE $0x62;BYTE $0x7d;BYTE $0x78;BYTE $0xc5
+	BYTE $0xc4; BYTE $0xe3; BYTE $0x4d; BYTE $0x38; BYTE $0xf6; BYTE $0x01; BYTE $0xc4; BYTE $0xe3; BYTE $0x45; BYTE $0x38; BYTE $0xff; BYTE $0x01; BYTE $0xc4; BYTE $0x62; BYTE $0x7d; BYTE $0x78; BYTE $0xc5
 
 	SHRQ $5, R9 // len(in) /32
 	MOVQ out+72(FP), DX // DX: &out
@@ -120,7 +120,7 @@ VPXOR YMM3, YMM2, YMM3 ; X3: Result
 VPXOR YMM4, YMM3, YMM4 ; X4: Result
 VMOVDQU [rdx], YMM4
 */
-	BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x06;BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x22;BYTE $0xc5;BYTE $0xf5;BYTE $0x73;BYTE $0xd0;BYTE $0x04;BYTE $0xc4;BYTE $0xc1;BYTE $0x7d;BYTE $0xdb;BYTE $0xc0;BYTE $0xc4;BYTE $0xc1;BYTE $0x75;BYTE $0xdb;BYTE $0xc8;BYTE $0xc4;BYTE $0xe2;BYTE $0x4d;BYTE $0x00;BYTE $0xd0;BYTE $0xc4;BYTE $0xe2;BYTE $0x45;BYTE $0x00;BYTE $0xd9;BYTE $0xc5;BYTE $0xed;BYTE $0xef;BYTE $0xdb;BYTE $0xc5;BYTE $0xe5;BYTE $0xef;BYTE $0xe4;BYTE $0xc5;BYTE $0xfe;BYTE $0x7f;BYTE $0x22
+	BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06; BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x22; BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd0; BYTE $0x04; BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xdb; BYTE $0xc0; BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xdb; BYTE $0xc8; BYTE $0xc4; BYTE $0xe2; BYTE $0x4d; BYTE $0x00; BYTE $0xd0; BYTE $0xc4; BYTE $0xe2; BYTE $0x45; BYTE $0x00; BYTE $0xd9; BYTE $0xc5; BYTE $0xed; BYTE $0xef; BYTE $0xdb; BYTE $0xc5; BYTE $0xe5; BYTE $0xef; BYTE $0xe4; BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x22
 
 	ADDQ $32, SI // in+=32
 	ADDQ $32, DX // out+=32
@@ -129,7 +129,7 @@ VMOVDQU [rdx], YMM4
 
 done_xor_avx2:
 	// VZEROUPPER
-	BYTE $0xc5;BYTE $0xf8;BYTE $0x77
+	BYTE $0xc5; BYTE $0xf8; BYTE $0x77
 	RET
 
 // func galMulAVX2(low, high, in, out []byte)
@@ -149,7 +149,7 @@ VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
 VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
 VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
 */
-	BYTE $0xc4;BYTE $0xe3;BYTE $0x4d;BYTE $0x38;BYTE $0xf6;BYTE $0x01;BYTE $0xc4;BYTE $0xe3;BYTE $0x45;BYTE $0x38;BYTE $0xff;BYTE $0x01;BYTE $0xc4;BYTE $0x62;BYTE $0x7d;BYTE $0x78;BYTE $0xc5
+	BYTE $0xc4; BYTE $0xe3; BYTE $0x4d; BYTE $0x38; BYTE $0xf6; BYTE $0x01; BYTE $0xc4; BYTE $0xe3; BYTE $0x45; BYTE $0x38; BYTE $0xff; BYTE $0x01; BYTE $0xc4; BYTE $0x62; BYTE $0x7d; BYTE $0x78; BYTE $0xc5
 
 	SHRQ $5, R9 // len(in) /32
 	MOVQ out+72(FP), DX // DX: &out
@@ -169,7 +169,7 @@ VPSHUFB YMM3, YMM7, YMM1 ; X2: mul high part
 VPXOR YMM4, YMM2, YMM3 ; X4: Result
 VMOVDQU [rdx], YMM4
 */
-	BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x06;BYTE $0xc5;BYTE $0xf5;BYTE $0x73;BYTE $0xd0;BYTE $0x04;BYTE $0xc4;BYTE $0xc1;BYTE $0x7d;BYTE $0xdb;BYTE $0xc0;BYTE $0xc4;BYTE $0xc1;BYTE $0x75;BYTE $0xdb;BYTE $0xc8;BYTE $0xc4;BYTE $0xe2;BYTE $0x4d;BYTE $0x00;BYTE $0xd0;BYTE $0xc4;BYTE $0xe2;BYTE $0x45;BYTE $0x00;BYTE $0xd9;BYTE $0xc5;BYTE $0xed;BYTE $0xef;BYTE $0xe3;BYTE $0xc5;BYTE $0xfe;BYTE $0x7f;BYTE $0x22
+	BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06; BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd0; BYTE $0x04; BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xdb; BYTE $0xc0; BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xdb; BYTE $0xc8; BYTE $0xc4; BYTE $0xe2; BYTE $0x4d; BYTE $0x00; BYTE $0xd0; BYTE $0xc4; BYTE $0xe2; BYTE $0x45; BYTE $0x00; BYTE $0xd9; BYTE $0xc5; BYTE $0xed; BYTE $0xef; BYTE $0xe3; BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x22
 
 	ADDQ $32, SI // in+=32
 	ADDQ $32, DX // out+=32
@@ -179,5 +179,5 @@ VMOVDQU [rdx], YMM4
 
 done_avx2:
 	// VZEROUPPER
-	BYTE $0xc5;BYTE $0xf8;BYTE $0x77
+	BYTE $0xc5; BYTE $0xf8; BYTE $0x77
 	RET
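Aside on what these bytes are: the BYTE runs are hand-assembled AVX2 instructions, kept as raw bytes because the Go assembler in use did not accept these mnemonics; the /* ... */ comments above each run show the intended Intel-syntax instructions, and the diff above changes only whitespace (a space after each semicolon). Purely as a hedged sketch, on a newer toolchain that does accept AVX2 mnemonics, the first encoded sequence (the VINSERTI128/VPBROADCASTB prelude) would correspond roughly to the following Plan 9 form, with operand order reversed relative to the Intel comments; this is an illustration, not code from the repository:

	VINSERTI128  $1, X6, Y6, Y6  // duplicate the low-nibble table into the upper 128-bit lane of Y6
	VINSERTI128  $1, X7, Y7, Y7  // duplicate the high-nibble table into the upper 128-bit lane of Y7
	VPBROADCASTB X5, Y8          // broadcast the low mask in X5 to every byte of Y8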