From 1311a44f7a27b38217a94e9d7a5dbe3ae3dde035 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 17 Sep 2014 15:12:05 +0200 Subject: [PATCH] arm: NEON optimisations for gf_w4 Optimisations for the single table region multiplication and carry less multiplication using NEON's polynomial multiplication of 8-bit values. The single polynomial multiplication is not that useful but vector version is for region multiplication. Selected time_tool.sh results for a 1.7GHz cortex-a9: Region Best (MB/s): 672.72 W-Method: 4 -m CARRY_FREE - Region Best (MB/s): 265.84 W-Method: 4 -m BYTWO_p - Region Best (MB/s): 329.41 W-Method: 4 -m TABLE -r DOUBLE - Region Best (MB/s): 278.63 W-Method: 4 -m TABLE -r QUAD - Region Best (MB/s): 329.81 W-Method: 4 -m TABLE -r QUAD -r LAZY - Region Best (MB/s): 1318.03 W-Method: 4 -m TABLE -r SIMD - Region Best (MB/s): 165.15 W-Method: 4 -m TABLE -r NOSIMD - Region Best (MB/s): 99.73 W-Method: 4 -m LOG - --- include/gf_w4.h | 63 +++++++++++ src/Makefile.am | 7 ++ src/gf_w4.c | 68 +++--------- src/neon/gf_w4_neon.c | 247 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 335 insertions(+), 50 deletions(-) create mode 100644 include/gf_w4.h create mode 100644 src/neon/gf_w4_neon.c diff --git a/include/gf_w4.h b/include/gf_w4.h new file mode 100644 index 0000000..8ee94a3 --- /dev/null +++ b/include/gf_w4.h @@ -0,0 +1,63 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * gf_w4.h + * + * Defines and data structures for 4-bit Galois fields + */ + +#ifndef GF_COMPLETE_GF_W4_H +#define GF_COMPLETE_GF_W4_H + +#include + +#define GF_FIELD_WIDTH 4 +#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2) +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) +#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1) + +/* ------------------------------------------------------------ + JSP: Each implementation has its own data, which is allocated + at one time as part of the handle. For that reason, it + shouldn't be hierarchical -- i.e. one should be able to + allocate it with one call to malloc. */ + +struct gf_logtable_data { + uint8_t log_tbl[GF_FIELD_SIZE]; + uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; + uint8_t *antilog_tbl_div; +}; + +struct gf_single_table_data { + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; +}; + +struct gf_double_table_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; +}; +struct gf_quad_table_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[GF_FIELD_SIZE][(1<<16)]; +}; + +struct gf_quad_table_lazy_data { + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; + uint16_t mult[(1 << 16)]; +}; + +struct gf_bytwo_data { + uint64_t prim_poly; + uint64_t mask1; + uint64_t mask2; +}; + +// ARM NEON init functions +int gf_w4_neon_cfm_init(gf_t *gf); +void gf_w4_neon_single_table_init(gf_t *gf); + +#endif /* GF_COMPLETE_GF_W4_H */ diff --git a/src/Makefile.am b/src/Makefile.am index 34633ea..5352d12 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,11 +1,18 @@ # GF-Complete 'core' AM file # Creates the library +AUTOMAKE_OPTIONS = subdir-objects + AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC lib_LTLIBRARIES = libgf_complete.la libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \ gf_w64.c gf_w128.c gf_rand.c gf_general.c + +if HAVE_NEON +libgf_complete_la_SOURCES += neon/gf_w4_neon.c +endif + libgf_complete_la_LDFLAGS = -version-info 1:0:0 diff --git a/src/gf_w4.c b/src/gf_w4.c index f098323..0e86aa8 100644 --- a/src/gf_w4.c +++ b/src/gf_w4.c @@ -11,49 +11,7 @@ #include "gf_int.h" #include #include - -#define GF_FIELD_WIDTH 4 -#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2) -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH) -#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1) - -/* ------------------------------------------------------------ - JSP: Each implementation has its own data, which is allocated - at one time as part of the handle. For that reason, it - shouldn't be hierarchical -- i.e. one should be able to - allocate it with one call to malloc. */ - -struct gf_logtable_data { - uint8_t log_tbl[GF_FIELD_SIZE]; - uint8_t antilog_tbl[GF_FIELD_SIZE * 2]; - uint8_t *antilog_tbl_div; -}; - -struct gf_single_table_data { - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; -}; - -struct gf_double_table_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE]; -}; -struct gf_quad_table_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[GF_FIELD_SIZE][(1<<16)]; -}; - -struct gf_quad_table_lazy_data { - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE]; - uint16_t mult[(1 << 16)]; -}; - -struct gf_bytwo_data { - uint64_t prim_poly; - uint64_t mask1; - uint64_t mask2; -}; +#include "gf_w4.h" #define AB2(ip, am1 ,am2, b, t1, t2) {\ t1 = (b << 1) & am1;\ @@ -489,11 +447,15 @@ int gf_w4_single_table_init(gf_t *gf) gf->inverse.w32 = NULL; gf->divide.w32 = gf_w4_single_table_divide; gf->multiply.w32 = gf_w4_single_table_multiply; - #ifdef INTEL_SSSE3 + #if defined(INTEL_SSSE3) || defined(ARM_NEON) if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY)) gf->multiply_region.w32 = gf_w4_single_table_multiply_region; else + #if defined(INTEL_SSSE3) gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; + #elif defined(ARM_NEON) + gf_w4_neon_single_table_init(gf); + #endif #else gf->multiply_region.w32 = gf_w4_single_table_multiply_region; if (h->region_type & GF_REGION_SIMD) return 0; @@ -774,16 +736,16 @@ int gf_w4_table_init(gf_t *gf) { int rt; gf_internal_t *h; - int issse3 = 0; + int simd = 0; -#ifdef INTEL_SSSE3 - issse3 = 1; +#if defined(INTEL_SSSE3) || defined(ARM_NEON) + simd = 1; #endif h = (gf_internal_t *) gf->scratch; rt = (h->region_type); - if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE; + if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE; if (rt & GF_REGION_DOUBLE_TABLE) { return gf_w4_double_table_init(gf); @@ -1937,6 +1899,8 @@ int gf_w4_cfm_init(gf_t *gf) #if defined(INTEL_SSE4_PCLMUL) gf->multiply.w32 = gf_w4_clm_multiply; return 1; +#elif defined(ARM_NEON) + return gf_w4_neon_cfm_init(gf); #endif return 0; } @@ -1953,11 +1917,14 @@ int gf_w4_shift_init(gf_t *gf) int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { - int issse3 = 0; + int issse3 = 0, isneon = 0; #ifdef INTEL_SSSE3 issse3 = 1; #endif +#ifdef ARM_NEON + isneon = 1; +#endif switch(mult_type) { @@ -1971,7 +1938,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1 return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; } - if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE; + if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon)) + region_type = GF_REGION_DOUBLE_TABLE; if (region_type & GF_REGION_DOUBLE_TABLE) { return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64; diff --git a/src/neon/gf_w4_neon.c b/src/neon/gf_w4_neon.c new file mode 100644 index 0000000..3a21432 --- /dev/null +++ b/src/neon/gf_w4_neon.c @@ -0,0 +1,247 @@ +/* + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. + * + * Copyright (c) 2014: Janne Grunau + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * - Neither the name of the University of Tennessee nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * gf_w4_neon.c + * + * Neon routines for 4-bit Galois fields + * + */ + +#include "gf_int.h" +#include +#include +#include "gf_w4.h" + +static +gf_val_32_t +gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4) +{ + gf_val_32_t rv = 0; + poly8x8_t result, prim_poly; + poly8x8_t a, b, w; + uint8x8_t v; + gf_internal_t * h = gf->scratch; + + a = vdup_n_p8 (a4); + b = vdup_n_p8 (b4); + + prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL)); + + /* Do the initial multiply */ + result = vmul_p8 (a, b); + v = vshr_n_u8 (vreinterpret_u8_p8(result), 4); + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v)); + result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w))); + + /* Extracts 32 bit value from result. */ + rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0); + + return rv; +} + +static inline void +neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8, + gf_val_32_t val, uint8_t *d_end, int xor) +{ + gf_internal_t * h = gf->scratch; + poly8x8_t prim_poly; + poly8x8_t a, w, even, odd; + uint8x8_t b, c, v, mask; + + a = vdup_n_p8 (val); + mask = vdup_n_u8 (0xf); + prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL)); + + while (d8 < d_end) { + b = vld1_u8 (s8); + + even = vreinterpret_p8_u8 (vand_u8 (b, mask)); + odd = vreinterpret_p8_u8 (vshr_n_u8 (b, 4)); + + if (xor) + c = vld1_u8 (d8); + + even = vmul_p8 (a, even); + odd = vmul_p8 (a, odd); + + v = vshr_n_u8 (vreinterpret_u8_p8(even), 4); + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v)); + even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w))); + + v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4); + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v)); + odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w))); + + v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4)); + + if (xor) + v = veor_u8 (c, v); + + vst1_u8 (d8, v); + + d8 += 8; + s8 += 8; + } +} + + +static void +gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint8_t *s8; + uint8_t *d8; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + s8 = (uint8_t *) rd.s_start; + d8 = (uint8_t *) rd.d_start; + + if (xor) + neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 1); + else + neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 0); + + gf_do_final_region_alignment(&rd); +} + +#ifndef ARCH_AARCH64 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ + vtbl2_u8(tbl, vget_high_u8(v))) +#endif + +static +inline +void +w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst, + uint8_t * d_end, gf_val_32_t val, int xor) +{ + struct gf_single_table_data *std; + uint8_t *base; + uint8x16_t r, va, vh, vl, loset; + +#ifdef ARCH_AARCH64 + uint8x16_t th, tl; +#else + uint8x8x2_t th, tl; +#endif + + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private; + base = (uint8_t *) std->mult; + base += (val << GF_FIELD_WIDTH); + +#ifdef ARCH_AARCH64 + tl = vld1q_u8 (base); + th = vshlq_n_u8 (tl, 4); +#else + tl.val[0] = vld1_u8 (base); + tl.val[1] = vld1_u8 (base + 8); + th.val[0] = vshl_n_u8 (tl.val[0], 4); + th.val[1] = vshl_n_u8 (tl.val[1], 4); +#endif + + loset = vdupq_n_u8(0xf); + + while (dst < d_end) { + va = vld1q_u8 (src); + + vh = vshrq_n_u8 (va, 4); + vl = vandq_u8 (va, loset); + + if (xor) + va = vld1q_u8 (dst); + + vh = vqtbl1q_u8 (th, vh); + vl = vqtbl1q_u8 (tl, vl); + + r = veorq_u8 (vh, vl); + + if (xor) + r = veorq_u8 (va, r); + + vst1q_u8 (dst, r); + + dst += 16; + src += 16; + } +} + +static +void +gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest, + gf_val_32_t val, int bytes, int xor) +{ + gf_region_data rd; + uint8_t *sptr, *dptr, *top; + + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } + + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); + gf_do_initial_region_alignment(&rd); + + sptr = rd.s_start; + dptr = rd.d_start; + top = rd.d_top; + + if (xor) + w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 1); + else + w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 0); + + gf_do_final_region_alignment(&rd); + +} + + +int gf_w4_neon_cfm_init(gf_t *gf) +{ + // single clm multiplication probably pointless + gf->multiply.w32 = gf_w4_neon_clm_multiply; + gf->multiply_region.w32 = gf_w4_neon_clm_multiply_region_from_single; + + return 1; +} + +void gf_w4_neon_single_table_init(gf_t *gf) +{ + gf->multiply_region.w32 = gf_w4_single_table_multiply_region_neon; +}