Basic 64-bit multiply using intrinsics working.

Branch: master
Author: Ethan L. Miller, 2013-02-15 15:41:46 -08:00
Parent: 6219bb9867
Commit: cf6a5dfa29
3 changed files with 17 additions and 14 deletions

@@ -12,8 +12,8 @@ HDRS = gf_complete.h gf_int.h
 EXECUTABLES = gf_mult gf_div gf_add gf_unit gf_time gf_methods gf_poly \
       gf_example_1 gf_example_2 gf_example_3 gf_example_4
-CFLAGS = -O3 -msse4 -maes -DINTEL_SSE4 -DINTEL_AES
-LDFLAGS = -O3 -msse4 -maes
+CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_PCLMUL
+LDFLAGS = -O3 -msse4 -maes -mpclmul
 RM = /bin/rm -f
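The new -mpclmul flag and INTEL_PCLMUL define in the Makefile hunk above are what let the compiler emit PCLMULQDQ for the _mm_clmulepi64_si128 calls further down. A minimal toolchain smoke test (my own sketch, not part of this commit; the file name pclmul_check.c is hypothetical):

    /* pclmul_check.c -- hypothetical smoke test, not part of this commit.
     * Build: gcc -O3 -msse4 -mpclmul pclmul_check.c -o pclmul_check
     */
    #include <smmintrin.h>   /* _mm_extract_epi64 (SSE4.1) */
    #include <wmmintrin.h>   /* _mm_clmulepi64_si128 (PCLMUL) */

    int main(void)
    {
      __m128i a = _mm_set_epi64x(0, 3);           /* x + 1 */
      __m128i b = _mm_set_epi64x(0, 5);           /* x^2 + 1 */
      __m128i c = _mm_clmulepi64_si128(a, b, 0);  /* carryless product */
      /* (x+1)(x^2+1) = x^3 + x^2 + x + 1 = 0xF, so exit 0 iff correct */
      return (_mm_extract_epi64(c, 0) == 0xF) ? 0 : 1;
    }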

@@ -10,7 +10,7 @@
 #include <smmintrin.h>
 #endif
-#ifdef INTEL_AES
+#ifdef INTEL_PCLMUL
 #include <wmmintrin.h>
 #endif

@@ -185,7 +185,7 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
   return pr;
 }
-#ifdef INTEL_AES
+#ifdef INTEL_PCLMUL
 /*
  * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
  */
@@ -197,22 +197,23 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
   __m128i a, b;
   __m128i result;
   __m128i prim_poly;
-  __m128i v;
+  __m128i v, w;
   gf_internal_t * h = gf->scratch;
-  a = _mm_set_epi32 (0, 0, (uint32_t)(a64 >> 32ULL), (uint32_t)(a64 & 0xffffffffULL));
-  b = _mm_set_epi32 (0, 0, (uint32_t)(b64 >> 32ULL), (uint32_t)(b64 & 0xffffffffULL));
+  a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
+  b = _mm_insert_epi64 (_mm_setzero_si128(), b64, 0);
   prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
   /* Do the initial multiply */
   result = _mm_clmulepi64_si128 (a, b, 0);
   /* Mask off the high order 32 bits using subtraction of the polynomial.
    * NOTE: this part requires that the polynomial have at least 32 leading 0 bits.
    */
-  v = _mm_srli_si128 (result, 12);
-  result = _mm_xor_si128 (result, _mm_clmulepi64_si128 (prim_poly, v, 0));
-  v = _mm_srli_si128 (result, 8);
-  result = _mm_xor_si128 (result, _mm_clmulepi64_si128 (prim_poly, v, 0));
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
+  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
+  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+  result = _mm_xor_si128 (result, w);
   return ((gf_val_64_t)_mm_extract_epi64(result, 0));
 }
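For reference, the fold steps above implement reduction modulo the primitive polynomial: writing the 128-bit carryless product as c = hi*x^64 + lo, we have hi*x^64 == hi*p (mod x^64 + p), and because p fits in the low 32 bits (the NOTE's precondition), one more fold absorbs the spill entirely within 64 bits. A portable scalar sketch of the same computation (my illustration, not code from this commit; clmul64 and gf64_reduce are hypothetical names, and it folds all 64 high bits at once rather than 32 at a time as the intrinsic version does; p is assumed to be the polynomial's low coefficients with the x^64 term implicit, as in h->prim_poly above):

    #include <stdint.h>

    /* Carryless 64x64 -> 128-bit multiply (bitwise reference). */
    static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
      uint64_t h = 0, l = 0;
      int i;
      for (i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          l ^= a << i;
          if (i != 0) h ^= a >> (64 - i);  /* avoid undefined shift by 64 */
        }
      }
      *hi = h;
      *lo = l;
    }

    /* Reduce hi*x^64 + lo modulo x^64 + p, assuming p < 2^32. */
    static uint64_t gf64_reduce(uint64_t hi, uint64_t lo, uint64_t p)
    {
      uint64_t h2, l2, h3, l3;
      clmul64(hi, p, &h2, &l2);  /* hi*x^64 == hi*p; spill h2 < 2^31 */
      clmul64(h2, p, &h3, &l3);  /* second fold fits in 64 bits, h3 == 0 */
      return lo ^ l2 ^ l3;
    }

Cross-checking gf_w64_clm_multiply against clmul64 followed by gf64_reduce (with p = h->prim_poly) over random inputs is a quick way to validate the intrinsic path.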
@@ -408,7 +409,8 @@ gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_
 static
 int gf_w64_shift_init(gf_t *gf)
 {
-  gf->multiply.w64 = gf_w64_shift_multiply;
+  /* gf->multiply.w64 = gf_w64_shift_multiply; */
+  gf->multiply.w64 = gf_w64_clm_multiply;
   gf->inverse.w64 = gf_w64_euclid;
   gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
   return 1;
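Note that gf_w64_shift_init now installs the CLM routine unconditionally, even though gf_w64_clm_multiply is only compiled under #ifdef INTEL_PCLMUL. One possible guard, sketched with the INTEL_PCLMUL macro this commit introduces (not something the commit itself does):

    #ifdef INTEL_PCLMUL
      gf->multiply.w64 = gf_w64_clm_multiply;
    #else
      gf->multiply.w64 = gf_w64_shift_multiply;
    #endif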
@@ -1415,7 +1417,8 @@ int gf_w64_split_init(gf_t *gf)
   /* Defaults */
   gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
-  gf->multiply.w64 = gf_w64_shift_multiply;
+  gf->multiply.w64 = gf_w64_clm_multiply;
+  /* gf->multiply.w64 = gf_w64_shift_multiply; */
   gf->inverse.w64 = gf_w64_euclid;
   if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
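With both init paths now defaulting to the CLM multiply, a quick end-to-end check can go through the public API. A sketch, assuming gf_init_easy(gf, w) and the divide.w64 hook as exported by gf_complete.h (the round-trip test and the file name clm_check.c are my own, not part of the commit):

    /* clm_check.c -- hypothetical round-trip test, not part of this commit. */
    #include <stdio.h>
    #include <stdint.h>
    #include "gf_complete.h"

    int main(void)
    {
      gf_t gf;
      uint64_t a = 0x123456789abcdef0ULL, b = 0x0fedcba987654321ULL;
      uint64_t prod, back;

      if (!gf_init_easy(&gf, 64)) {
        fprintf(stderr, "gf_init_easy(64) failed\n");
        return 1;
      }
      prod = gf.multiply.w64(&gf, a, b);   /* CLM path after this commit */
      back = gf.divide.w64(&gf, prod, b);  /* should recover a */
      printf("a*b = %llx, (a*b)/b = %llx\n",
             (unsigned long long) prod, (unsigned long long) back);
      return (back == a) ? 0 : 1;
    }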