Adding support for carry-less multiply.

master
Ethan L. Miller 2013-02-15 11:58:45 -08:00
parent e6fd0a544b
commit 6219bb9867
3 changed files with 41 additions and 3 deletions

View File

@ -12,8 +12,8 @@ HDRS = gf_complete.h gf_int.h
EXECUTABLES = gf_mult gf_div gf_add gf_unit gf_time gf_methods gf_poly \
gf_example_1 gf_example_2 gf_example_3 gf_example_4
CFLAGS = -O3 -msse4 -DINTEL_SSE4
LDFLAGS = -O3 -msse4
# -mpclmul enables the carry-less multiply intrinsic (_mm_clmulepi64_si128); -maes alone only covers the AES round instructions.
CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_AES
LDFLAGS = -O3 -msse4 -maes -mpclmul
RM = /bin/rm -f

View File

@ -7,6 +7,11 @@
#ifdef INTEL_SSE4
#include <nmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#endif
#ifdef INTEL_AES
#include <wmmintrin.h>
#endif
/* This does either memcpy or xor, depending on "xor" */

View File

@ -185,6 +185,39 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
return pr;
}
#ifdef INTEL_AES
/*
 * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
 *
 * Forms the 128-bit carry-less product of a64 and b64, then folds the upper
 * 64 bits of that product back into the lower 64 bits using the low 32 bits
 * of the primitive polynomial stored in gf->scratch.  Returns the low 64 bits
 * of the folded value.
 *
 * NOTE: only the low 32 bits of prim_poly take part in the reduction, so this
 * requires that the polynomial have at least 32 leading 0 bits.  The x^64
 * term is assumed to be implicit (TODO confirm against the code that fills
 * in prim_poly).
 */
static
inline
gf_val_64_t
gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
  __m128i a, b;
  __m128i result;
  __m128i prim_poly;
  __m128i v;
  gf_internal_t * h = gf->scratch;

  /* Place each operand in the low 64 bits of a vector; the high 64 bits are 0. */
  a = _mm_set_epi32 (0, 0, (uint32_t)(a64 >> 32ULL), (uint32_t)(a64 & 0xffffffffULL));
  b = _mm_set_epi32 (0, 0, (uint32_t)(b64 >> 32ULL), (uint32_t)(b64 & 0xffffffffULL));
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));

  /* Do the initial multiply: 64 x 64 -> 128 bit carry-less product. */
  result = _mm_clmulepi64_si128 (a, b, 0);

  /*
   * The carry-less multiply only takes 64-bit operands, so the upper 64 bits
   * of the product are reduced in two 32-bit pieces.
   *
   * Fold 1: bits 96..127.  x^(96+i) = x^(32+i) * x^64, so this word has to be
   * multiplied by the polynomial at an offset of 32 bits.  Moving the word to
   * bits 32..63 of v makes the multiply produce the product already shifted.
   * The product can reach into bits 64..95, which fold 2 then picks up.
   */
  v = _mm_slli_si128 (_mm_srli_si128 (result, 12), 4);
  result = _mm_xor_si128 (result, _mm_clmulepi64_si128 (prim_poly, v, 0));

  /*
   * Fold 2: bits 64..95 (now including the spill from fold 1).  Isolate that
   * single word at bits 0..31 so the stale bits 96..127 are not folded twice;
   * the product is below 64 bits, so nothing lands above bit 63.
   */
  v = _mm_srli_si128 (_mm_slli_si128 (result, 4), 12);
  result = _mm_xor_si128 (result, _mm_clmulepi64_si128 (prim_poly, v, 0));

  /* Bits 64..127 of result are leftovers at this point; only the low half is the answer. */
  return ((gf_val_64_t)_mm_extract_epi64(result, 0));
}
#endif
void
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
@ -1509,7 +1542,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
break;
case GF_MULT_COMPOSITE:
if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1;
if (arg1 == 2 && arg2 == 0 || arg1 == 2 && arg2 == 1) {
if ((arg1 == 2 && arg2 == 0) || (arg1 == 2 && arg2 == 1)) {
return sizeof(gf_internal_t) + sizeof(w64_composite_int_t) + 4;
} else {
return -1;