Adding support for carry-less multiply.

2013-02-15 11:58:45 -08:00 · 2013-02-15 11:58:45 -08:00 · 6219bb9867
parent e6fd0a544b
commit 6219bb9867
3 changed files with 41 additions and 3 deletions
--- a/4
+++ b/4
@ -12,8 +12,8 @@ HDRS = gf_complete.h gf_int.h
 EXECUTABLES = gf_mult gf_div gf_add gf_unit gf_time gf_methods gf_poly \
              gf_example_1 gf_example_2 gf_example_3 gf_example_4

-CFLAGS = -O3 -msse4 -DINTEL_SSE4
-LDFLAGS = -O3 -msse4
+# -maes only enables the AES-NI intrinsics; the carry-less multiply intrinsic
+# (_mm_clmulepi64_si128) used under INTEL_AES additionally needs -mpclmul.
+CFLAGS = -O3 -msse4 -maes -mpclmul -DINTEL_SSE4 -DINTEL_AES
+LDFLAGS = -O3 -msse4 -maes -mpclmul

 RM = /bin/rm -f

--- a/gf_complete.h
+++ b/gf_complete.h
@ -7,6 +7,11 @@
 #ifdef  INTEL_SSE4
 #include <nmmintrin.h>
 #include <emmintrin.h>
+#include <smmintrin.h>
+#endif
+
+#ifdef  INTEL_AES
+#include <wmmintrin.h>
 #endif

 /* This does either memcpy or xor, depending on "xor" */
--- a/gf_w64.c
+++ b/gf_w64.c
@ -185,6 +185,39 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
  return pr;
 }

+#ifdef  INTEL_AES
+/*
+ * ELM: Use the Intel carry-less multiply instruction (PCLMULQDQ) to do a very
+ * fast 64x64 multiply.
+ *
+ * gf       - field descriptor; gf->scratch is read as a gf_internal_t, and only
+ *            the low 32 bits of its prim_poly are used.
+ * a64, b64 - the two operands.
+ *
+ * Returns the low 64 bits of the carry-less product after it has been reduced
+ * with prim_poly.
+ *
+ * The reduction treats x^64 as congruent to prim_poly and folds the upper half
+ * of the 128-bit product down in two 32-bit steps.
+ * NOTE: this requires that the polynomial have at least 32 leading 0 bits,
+ * i.e. everything below the x^64 term fits in 32 bits.
+ */
+static
+inline
+gf_val_64_t
+gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
+{
+        __m128i         a, b;
+        __m128i         result;
+        __m128i         prim_poly;
+        __m128i         v, w;
+        gf_internal_t * h = gf->scratch;
+
+        /* Place each 64-bit operand in the low quadword; the high quadword is 0. */
+        a = _mm_set_epi32 (0, 0, (uint32_t)(a64 >> 32ULL), (uint32_t)(a64 & 0xffffffffULL));
+        b = _mm_set_epi32 (0, 0, (uint32_t)(b64 >> 32ULL), (uint32_t)(b64 & 0xffffffffULL));
+        prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
+
+        /* Do the initial multiply: low quadword of a times low quadword of b. */
+        result = _mm_clmulepi64_si128 (a, b, 0);
+
+        /* Fold 1: bits 96..127 of the product stand for top * x^96, which is
+         * top * prim_poly * x^32 once x^64 is replaced by prim_poly.  The
+         * partial product therefore has to be shifted left by 4 bytes before
+         * it is XORed in; it can spill into bits 64..93 but never reaches
+         * bit 96.
+         */
+        v = _mm_srli_si128 (result, 12);
+        w = _mm_slli_si128 (_mm_clmulepi64_si128 (prim_poly, v, 0), 4);
+        result = _mm_xor_si128 (result, w);
+
+        /* Fold 2: only bits 64..95 are left to reduce.  Bits 96..127 were
+         * consumed by fold 1, so shift them out before isolating the rest;
+         * this product fits entirely in the low 64 bits.
+         */
+        v = _mm_srli_si128 (_mm_slli_si128 (result, 4), 12);
+        w = _mm_clmulepi64_si128 (prim_poly, v, 0);
+        result = _mm_xor_si128 (result, w);
+
+        /* The upper quadword is stale at this point; only the low 64 bits matter. */
+        return ((gf_val_64_t)_mm_extract_epi64(result, 0));
+}
+#endif
+
 void
 gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
 {
@ -1509,7 +1542,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
      break;
    case GF_MULT_COMPOSITE:
      if (region_type & ~(GF_REGION_ALTMAP | GF_REGION_STDMAP)) return -1;
-      if (arg1 == 2 && arg2 == 0 || arg1 == 2 && arg2 == 1) {
+      if ((arg1 == 2 && arg2 == 0) || (arg1 == 2 && arg2 == 1)) {
        return sizeof(gf_internal_t) + sizeof(w64_composite_int_t) + 4;
      } else {
        return -1;