Merged in jannau/gf-complete/neon (pull request #25)

arm neon optimisations
master
Kevin Greenan 2014-10-24 14:19:31 -07:00
commit 70dd94ae38
30 changed files with 2253 additions and 405 deletions

View File

@ -3,9 +3,12 @@
# FIXME - add project url as the last argument
AC_INIT(gf-complete, 1.0)
# Override default CFLAGS
: ${CFLAGS="-Wall -Wpointer-arith -O3 -g"}
AC_PREREQ([2.61])
AM_INIT_AUTOMAKE([no-dependencies foreign])
AM_INIT_AUTOMAKE([no-dependencies foreign parallel-tests])
LT_INIT # libtool
AC_CONFIG_HEADER(include/config.h)
@ -16,14 +19,39 @@ AC_CONFIG_MACRO_DIR([m4])
# This prevents './configure; make' from trying to run autotools.
AM_MAINTAINER_MODE([disable])
# Override default CFLAGS
CFLAGS="-Wall -Wpointer-arith -O3 -g"
dnl Compiling with per-target flags requires AM_PROG_CC_C_O.
AC_PROG_CC
# Check for functions to provide aligned memory
#
AC_CHECK_FUNCS([posix_memalign],
[found_memalign=yes; break])
AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
AX_EXT()
AC_ARG_ENABLE([neon],
AS_HELP_STRING([--disable-neon], [Build without NEON optimizations]))
AS_IF([test "x$enable_neon" != "xno"],
[noneon_CPPFLAGS=$CPPFLAGS
CPPFLAGS="$CPPFLAGS $SIMD_FLAGS"
AC_CHECK_HEADER([arm_neon.h],
[have_neon=yes],
[have_neon=no
CPPFLAGS=$noneon_CPPFLAGS])],
[have_neon=no
AS_IF([test "x$ax_cv_have_neon_ext" = "xyes"],
[SIMD_FLAGS=""])
])
AS_IF([test "x$have_neon" = "xno"],
[AS_IF([test "x$enable_neon" = "xyes"],
[AC_MSG_ERROR([neon requested but arm_neon.h not found])])
])
AM_CONDITIONAL([HAVE_NEON], [test "x$have_neon" = "xyes"])
AC_ARG_ENABLE([sse],
AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]),
[if test "x$enableval" = "xno" ; then

View File

@ -1,7 +1,7 @@
# GF-Complete 'examples' AM file
AM_CPPFLAGS=-I./ -I../include
AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
bin_PROGRAMS = gf_example_1 gf_example_2 gf_example_3 gf_example_4 \
gf_example_5 gf_example_6 gf_example_7

View File

@ -33,6 +33,10 @@
#include <wmmintrin.h>
#endif
#if defined(ARM_NEON)
#include <arm_neon.h>
#endif
/* These are the different ways to perform multiplication.
Not all are implemented for all values of w.
@ -61,7 +65,9 @@ typedef enum {GF_MULT_DEFAULT,
#define GF_REGION_DOUBLE_TABLE (0x1)
#define GF_REGION_QUAD_TABLE (0x2)
#define GF_REGION_LAZY (0x4)
#define GF_REGION_SIMD (0x8)
#define GF_REGION_SSE (0x8)
#define GF_REGION_NOSIMD (0x10)
#define GF_REGION_NOSSE (0x10)
#define GF_REGION_ALTMAP (0x20)
#define GF_REGION_CAUCHY (0x40)

View File

@ -113,7 +113,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
GF_E_DIVCOMP, /* Mult == Composite && Div != Default */
GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */
GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */
GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */
GF_E_SIMD_NO, /* Reg == SIMD && Reg == NOSIMD */
GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */
GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/
GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */
@ -129,9 +129,9 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
GF_E_QUAD__J, /* Reg == QUAD && other Reg */
GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/
GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */
GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */
GF_E_SSESHIF, /* Mult == Shift && Reg == SIMD|NOSIMD */
GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */
GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */
GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SIMD|NOSIMD */
GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */
GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */
GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */
@ -148,7 +148,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */
GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
GF_E_TABLE_W, /* Mult == TABLE, w too big */
GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */
GF_E_TAB_SSE, /* Mult == TABLE, SIMD|NOSIMD only apply to w == 4 */
GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */
GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */
GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */
@ -172,7 +172,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */
GF_E_COMP_SS, /* Mult == COMP, SIMD|NOSIMD */
GF_E_COMP__W, /* Mult == COMP, Bad w. */
GF_E_UNKFLAG, /* Unknown flag in create_from.... */
GF_E_UNKNOWN, /* Unknown mult_type. */

66
include/gf_w16.h Normal file
View File

@ -0,0 +1,66 @@
/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w16.h
 *
 * Defines and data structures for 16-bit Galois fields
 */
#ifndef GF_COMPLETE_GF_W16_H
#define GF_COMPLETE_GF_W16_H

#include <stdint.h>

/* NOTE(review): this header uses gf_t in a prototype below but does not
   declare it; it relies on gf_complete.h/gf_int.h being included first by
   the .c file -- TODO confirm the intended include order. */

#define GF_FIELD_WIDTH      (16)
#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH)
/* Fix: parenthesize the expansion; the unparenthesized form (GF_FIELD_SIZE-1)
   mis-associates inside larger expressions, e.g. 2*GF_MULT_GROUP_SIZE. */
#define GF_MULT_GROUP_SIZE  (GF_FIELD_SIZE-1)

#define GF_BASE_FIELD_WIDTH (8)
#define GF_BASE_FIELD_SIZE  (1 << GF_BASE_FIELD_WIDTH)

/* Log/antilog tables for multiplication; antilog_tbl is twice the field
   size, presumably so a log sum can be looked up without a modular
   reduction -- TODO confirm against gf_w16.c. */
struct gf_w16_logtable_data {
  uint16_t      log_tbl[GF_FIELD_SIZE];
  uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
  uint16_t      inv_tbl[GF_FIELD_SIZE];
  uint16_t     *d_antilog;    /* pointer into antilog_tbl (division entry point?) -- verify */
};

/* Variant with a signed log table and a 4x antilog table; antilog_tbl is a
   pointer into the backing _antilog_tbl array. */
struct gf_w16_zero_logtable_data {
  int           log_tbl[GF_FIELD_SIZE];
  uint16_t      _antilog_tbl[GF_FIELD_SIZE * 4];
  uint16_t     *antilog_tbl;
  uint16_t      inv_tbl[GF_FIELD_SIZE];
};

/* Log tables plus a per-call lazily filled multiplication table. */
struct gf_w16_lazytable_data {
  uint16_t      log_tbl[GF_FIELD_SIZE];
  uint16_t      antilog_tbl[GF_FIELD_SIZE * 2];
  uint16_t      inv_tbl[GF_FIELD_SIZE];
  uint16_t     *d_antilog;
  uint16_t      lazytable[GF_FIELD_SIZE];
};

/* Precomputed constants for the BYTWO multiplication methods. */
struct gf_w16_bytwo_data {
  uint64_t prim_poly;
  uint64_t mask1;
  uint64_t mask2;
};

/* SPLIT 8/8: three 256x256 product tables (one per byte-pair combination). */
struct gf_w16_split_8_8_data {
  uint16_t      tables[3][256][256];
};

/* GROUP 4/4 tables: 16-entry reduce/shift lookups. */
struct gf_w16_group_4_4_data {
  uint16_t      reduce[16];
  uint16_t      shift[16];
};

/* Composite-field data: multiplication table of the base field. */
struct gf_w16_composite_data {
  uint8_t *mult_table;
};

/* Installs the NEON SPLIT implementation into gf (defined in neon/gf_w16_neon.c). */
void gf_w16_neon_split_init(gf_t *gf);

#endif /* GF_COMPLETE_GF_W16_H */

71
include/gf_w32.h Normal file
View File

@ -0,0 +1,71 @@
/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w32.h
 *
 * Defines and data structures for 32-bit Galois fields
 */
#ifndef GF_COMPLETE_GF_W32_H
#define GF_COMPLETE_GF_W32_H

#include <stdint.h>

/* NOTE(review): this header uses gf_t in a prototype below but does not
   declare it; it relies on gf_complete.h/gf_int.h being included first by
   the .c file -- TODO confirm the intended include order. */

#define GF_FIELD_WIDTH      (32)
#define GF_FIRST_BIT        (1 << 31)

#define GF_BASE_FIELD_WIDTH (16)
#define GF_BASE_FIELD_SIZE  (1 << GF_BASE_FIELD_WIDTH)
/* Fix: parenthesize the expansion; the unparenthesized form
   (GF_BASE_FIELD_SIZE-1) mis-associates inside larger expressions. */
#define GF_BASE_FIELD_GROUP_SIZE  (GF_BASE_FIELD_SIZE-1)

/* Multiply p by x (i.e. 2) in GF(2^32): shift left, reduce by the primitive
   polynomial when the top bit was set.  Requires a gf_internal_t *h in scope. */
#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)

/* SPLIT 2/32 lazy tables: 16 groups of 4 products, rebuilt when the
   multiplier (last_value) changes. */
struct gf_split_2_32_lazy_data {
    uint32_t      tables[16][4];
    uint32_t      last_value;
};

/* SPLIT 8/8 tables plus per-region tables. */
struct gf_w32_split_8_8_data {
    uint32_t      tables[7][256][256];
    uint32_t      region_tables[4][256];
    uint32_t      last_value;
};

/* GROUP method working storage; reduce/shift point into memory. */
struct gf_w32_group_data {
    uint32_t *reduce;
    uint32_t *shift;
    int tshift;
    uint64_t rmask;
    uint32_t *memory;
};

/* SPLIT 16/32 lazy tables (two 64K-entry tables). */
struct gf_split_16_32_lazy_data {
    uint32_t      tables[2][(1<<16)];
    uint32_t      last_value;
};

/* SPLIT 8/32 lazy tables. */
struct gf_split_8_32_lazy_data {
    uint32_t      tables[4][256];
    uint32_t      last_value;
};

/* SPLIT 4/32 lazy tables. */
struct gf_split_4_32_lazy_data {
    uint32_t      tables[8][16];
    uint32_t      last_value;
};

/* Precomputed constants for the BYTWO multiplication methods. */
struct gf_w32_bytwo_data {
    uint64_t prim_poly;
    uint64_t mask1;
    uint64_t mask2;
};

/* Composite-field data: log/antilog tables of the base field. */
struct gf_w32_composite_data {
  uint16_t *log;
  uint16_t *alog;
};

/* Installs the NEON SPLIT implementation into gf (defined in neon/gf_w32_neon.c). */
void gf_w32_neon_split_init(gf_t *gf);

#endif /* GF_COMPLETE_GF_W32_H */

63
include/gf_w4.h Normal file
View File

@ -0,0 +1,63 @@
/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w4.h
 *
 * Defines and data structures for 4-bit Galois fields
 */
#ifndef GF_COMPLETE_GF_W4_H
#define GF_COMPLETE_GF_W4_H
/* NOTE(review): this header uses gf_t in prototypes below but does not
   declare it; it relies on gf_complete.h/gf_int.h being included first by
   the .c file -- TODO confirm the intended include order. */
#include <stdint.h>
#define GF_FIELD_WIDTH 4
#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)
#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)
/* ------------------------------------------------------------
JSP: Each implementation has its own data, which is allocated
at one time as part of the handle. For that reason, it
shouldn't be hierarchical -- i.e. one should be able to
allocate it with one call to malloc. */
/* Log/antilog tables; antilog_tbl is doubled, presumably so log sums can be
   looked up without a modular reduction -- TODO confirm against gf_w4.c. */
struct gf_logtable_data {
    uint8_t      log_tbl[GF_FIELD_SIZE];
    uint8_t      antilog_tbl[GF_FIELD_SIZE * 2];
    uint8_t      *antilog_tbl_div;  /* pointer into antilog_tbl used for division -- verify */
};
/* Full 16x16 multiplication and division tables. */
struct gf_single_table_data {
    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
};
/* DOUBLE table: products of one element against all byte (two-nibble) values. */
struct gf_double_table_data {
    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
    uint8_t      mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
};
/* QUAD table: products of one element against all 16-bit (four-nibble) values. */
struct gf_quad_table_data {
    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
    uint16_t     mult[GF_FIELD_SIZE][(1<<16)];
};
/* Lazily filled QUAD table: small per-element table plus one shared 64K table. */
struct gf_quad_table_lazy_data {
    uint8_t      div[GF_FIELD_SIZE][GF_FIELD_SIZE];
    uint8_t      smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
    uint16_t     mult[(1 << 16)];
};
/* Precomputed constants for the BYTWO multiplication methods. */
struct gf_bytwo_data {
    uint64_t prim_poly;
    uint64_t mask1;
    uint64_t mask2;
};
// ARM NEON init functions
/* Defined in neon/gf_w4_neon.c; install NEON carry-free-multiply and
   single-table implementations into gf. */
int gf_w4_neon_cfm_init(gf_t *gf);
void gf_w4_neon_single_table_init(gf_t *gf);
#endif /* GF_COMPLETE_GF_W4_H */

50
include/gf_w64.h Normal file
View File

@ -0,0 +1,50 @@
/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w64.h
 *
 * Defines and data structures for 64-bit Galois fields
 */
#ifndef GF_COMPLETE_GF_W64_H
#define GF_COMPLETE_GF_W64_H

#include <stdint.h>

/* NOTE(review): this header uses gf_t in a prototype below but does not
   declare it; it relies on gf_complete.h/gf_int.h being included first by
   the .c file -- TODO confirm the intended include order. */

#define GF_FIELD_WIDTH      (64)
#define GF_FIRST_BIT        (1ULL << 63)

#define GF_BASE_FIELD_WIDTH (32)
#define GF_BASE_FIELD_SIZE  (1ULL << GF_BASE_FIELD_WIDTH)
/* Fix: parenthesize the expansion; the unparenthesized form
   (GF_BASE_FIELD_SIZE-1) mis-associates inside larger expressions. */
#define GF_BASE_FIELD_GROUP_SIZE  (GF_BASE_FIELD_SIZE-1)

/* GROUP method working storage; reduce/shift point into memory. */
struct gf_w64_group_data {
    uint64_t *reduce;
    uint64_t *shift;
    uint64_t *memory;
};

/* SPLIT 4/64 lazy tables, rebuilt when the multiplier (last_value) changes. */
struct gf_split_4_64_lazy_data {
    uint64_t      tables[16][16];
    uint64_t      last_value;
};

/* SPLIT 8/64 lazy tables. */
struct gf_split_8_64_lazy_data {
    uint64_t      tables[8][(1<<8)];
    uint64_t      last_value;
};

/* SPLIT 16/64 lazy tables. */
struct gf_split_16_64_lazy_data {
    uint64_t      tables[4][(1<<16)];
    uint64_t      last_value;
};

/* SPLIT 8/8 tables: 15 256x256 product tables. */
struct gf_split_8_8_data {
    uint64_t      tables[15][256][256];
};

/* Installs the NEON SPLIT implementation into gf (defined in neon/gf_w64_neon.c). */
void gf_w64_neon_split_init(gf_t *gf);

#endif /* GF_COMPLETE_GF_W64_H */

99
include/gf_w8.h Normal file
View File

@ -0,0 +1,99 @@
/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w8.h
 *
 * Defines and data structures for 8-bit Galois fields
 */
#ifndef GF_COMPLETE_GF_W8_H
#define GF_COMPLETE_GF_W8_H

#include "gf_int.h"
#include <stdint.h>

#define GF_FIELD_WIDTH (8)
#define GF_FIELD_SIZE       (1 << GF_FIELD_WIDTH)
#define GF_HALF_SIZE       (1 << (GF_FIELD_WIDTH/2))
/* Fix: parenthesize the expansion; the unparenthesized form (GF_FIELD_SIZE-1)
   mis-associates inside larger expressions, e.g. 2*GF_MULT_GROUP_SIZE. */
#define GF_MULT_GROUP_SIZE  (GF_FIELD_SIZE-1)

#define GF_BASE_FIELD_WIDTH (4)
#define GF_BASE_FIELD_SIZE  (1 << GF_BASE_FIELD_WIDTH)

/* Log/antilog tables; antilog_tbl is doubled, presumably so log sums can be
   looked up without a modular reduction -- TODO confirm against gf_w8.c. */
struct gf_w8_logtable_data {
    uint8_t      log_tbl[GF_FIELD_SIZE];
    uint8_t      antilog_tbl[GF_FIELD_SIZE * 2];
    uint8_t      inv_tbl[GF_FIELD_SIZE];
};

/* LOG_ZERO variant: extended antilog table with sentinel slots for zero. */
struct gf_w8_logzero_table_data {
    short        log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
    uint8_t      antilog_tbl[512+512+1];
    uint8_t      *div_tbl;
    uint8_t      *inv_tbl;
};

/* LOG_ZERO small-table variant. */
struct gf_w8_logzero_small_table_data {
    short        log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
    uint8_t      antilog_tbl[255*3];
    uint8_t      inv_tbl[GF_FIELD_SIZE];
    uint8_t      *div_tbl;
};

/* Composite-field data: multiplication table of the base field. */
struct gf_w8_composite_data {
  uint8_t *mult_table;
};

/* Don't change the order of these relative to gf_w8_half_table_data */
/* Default data: half tables (high/low nibble) followed by full tables; the
   leading members must stay layout-compatible with gf_w8_half_table_data. */
struct gf_w8_default_data {
  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
};

/* Half tables only: products split by high and low nibble of the operand. */
struct gf_w8_half_table_data {
  uint8_t     high[GF_FIELD_SIZE][GF_HALF_SIZE];
  uint8_t     low[GF_FIELD_SIZE][GF_HALF_SIZE];
};

/* Full 256x256 multiplication and division tables. */
struct gf_w8_single_table_data {
  uint8_t     divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  uint8_t     multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
};

/* DOUBLE table: products of one element against all two-byte values. */
struct gf_w8_double_table_data {
  uint8_t     div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  uint16_t    mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
};

/* Lazily filled DOUBLE table: small per-element table plus one shared table. */
struct gf_w8_double_table_lazy_data {
  uint8_t     div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  uint8_t     smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  uint16_t    mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
};

/* Base-field (w=4) log/antilog tables, used by the composite method. */
struct gf_w4_logtable_data {
    uint8_t     log_tbl[GF_BASE_FIELD_SIZE];
    uint8_t     antilog_tbl[GF_BASE_FIELD_SIZE * 2];
    uint8_t     *antilog_tbl_div;
};

/* Base-field (w=4) full multiplication/division tables. */
struct gf_w4_single_table_data {
    uint8_t     div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
    uint8_t     mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
};

/* Precomputed constants for the BYTWO multiplication methods. */
struct gf_w8_bytwo_data {
    uint64_t prim_poly;
    uint64_t mask1;
    uint64_t mask2;
};

/* Defined in neon/gf_w8_neon.c; install NEON carry-free-multiply and SPLIT
   implementations into gf. */
int gf_w8_neon_cfm_init(gf_t *gf);
void gf_w8_neon_split_init(gf_t *gf);

#endif /* GF_COMPLETE_GF_W8_H */

View File

@ -41,6 +41,55 @@ AC_DEFUN([AX_EXT],
AC_REQUIRE([AC_CANONICAL_HOST])
case $host_cpu in
aarch64*)
AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64])
SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64"
AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
[
# TODO: detect / cross-compile
ax_cv_have_neon_ext=yes
])
AC_CACHE_CHECK([whether cryptographic extension is supported], [ax_cv_have_arm_crypt_ext],
[
# TODO: detect / cross-compile
ax_cv_have_arm_crypt_ext=yes
])
if test "$ax_cv_have_arm_crypt_ext" = yes; then
AC_DEFINE(HAVE_ARM_CRYPT_EXT,,[Support ARM cryptographic extension])
fi
if test "$ax_cv_have_neon_ext" = yes; then
AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
fi
if test "$ax_cv_have_arm_crypt_ext" = yes && test "$ax_cv_have_neon_ext" = yes; then
AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd+crypto,
SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd+crypto -DARM_CRYPT -DARM_NEON", [])
elif test "$ax_cv_have_arm_crypt_ext" = yes; then
AX_CHECK_COMPILE_FLAG(-march=armv8-a+crypto,
SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+crypto -DARM_CRYPT", [])
elif test "$ax_cv_have_neon_ext" = yes; then
AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd,
SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON", [])
fi
;;
arm*)
AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
[
# TODO: detect / cross-compile
ax_cv_have_neon_ext=yes
])
if test "$ax_cv_have_neon_ext" = yes; then
AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
AX_CHECK_COMPILE_FLAG(-mfpu=neon,
SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON", [])
fi
;;
powerpc*)
AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext],
[

View File

@ -1,11 +1,22 @@
# GF-Complete 'core' AM file
# Creates the library
AM_CPPFLAGS=-I./ -I../include
AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
AUTOMAKE_OPTIONS = subdir-objects
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
lib_LTLIBRARIES = libgf_complete.la
libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
gf_w64.c gf_w128.c gf_rand.c gf_general.c
if HAVE_NEON
libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
neon/gf_w8_neon.c \
neon/gf_w16_neon.c \
neon/gf_w32_neon.c \
neon/gf_w64_neon.c
endif
libgf_complete_la_LDFLAGS = -version-info 1:0:0

131
src/gf.c
View File

@ -41,7 +41,7 @@ void gf_error()
case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break;
case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
@ -51,23 +51,23 @@ void gf_error()
case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break;
case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break;
case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break;
case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break;
case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break;
case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break;
case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
@ -77,33 +77,33 @@ void gf_error()
case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break;
case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break;
case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break;
case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break;
case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break;
case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break;
case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break;
case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break;
case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break;
case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break;
case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
@ -182,14 +182,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
int sse3 = 0;
int sse2 = 0;
int pclmul = 0;
int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp;
int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
gf_internal_t *sub;
rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
rquad = (region_type & GF_REGION_QUAD_TABLE);
rlazy = (region_type & GF_REGION_LAZY);
rsse = (region_type & GF_REGION_SSE);
rnosse = (region_type & GF_REGION_NOSSE);
rsimd = (region_type & GF_REGION_SIMD);
rnosimd = (region_type & GF_REGION_NOSIMD);
raltmap = (region_type & GF_REGION_ALTMAP);
rcauchy = (region_type & GF_REGION_CAUCHY);
@ -201,7 +201,8 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
}
tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY );
GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
GF_REGION_CAUCHY );
if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
#ifdef INTEL_SSE2
@ -216,6 +217,11 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
pclmul = 1;
#endif
#ifdef ARM_NEON
pclmul = 1;
sse3 = 1;
#endif
if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
@ -230,7 +236,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
return 1;
}
if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; }
if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }
@ -252,7 +258,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
return 1;
}
@ -260,7 +266,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (rquad) {
if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
return 1;
}
@ -268,7 +274,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (mult_type == GF_MULT_SHIFT) {
if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
if (rsse || rnosse) { _gf_errno = GF_E_SSESHIF; return 0; }
if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
return 1;
}
@ -281,7 +287,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; }
if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
return 1;
}
@ -290,21 +296,21 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (w != 4 && w != 8 && w != 16 &&
w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; }
if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
return 1;
}
if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
return 1;
}
if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
|| mult_type == GF_MULT_LOG_ZERO_EXT ) {
if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; }
if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
if (mult_type == GF_MULT_LOG_TABLE) return 1;
@ -324,14 +330,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
(arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; }
if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
return 1;
}
if (mult_type == GF_MULT_TABLE) {
if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; }
if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
return 1;
}
@ -344,46 +350,46 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
}
if (w == 8) {
if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
} else if (w == 16) {
if ((arg1 == 8 && arg2 == 8) ||
(arg1 == 8 && arg2 == 16)) {
if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; }
if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
} else if (arg1 == 4 && arg2 == 16) {
if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
} else { _gf_errno = GF_E_SP_16AR; return 0; }
} else if (w == 32) {
if ((arg1 == 8 && arg2 == 8) ||
(arg1 == 8 && arg2 == 32) ||
(arg1 == 16 && arg2 == 32)) {
if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; }
if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
} else if (arg1 == 4 && arg2 == 32) {
if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
if (raltmap && rnosse) { _gf_errno = GF_E_SP_32AS; return 0; }
if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
} else { _gf_errno = GF_E_SP_32AR; return 0; }
} else if (w == 64) {
if ((arg1 == 8 && arg2 == 8) ||
(arg1 == 8 && arg2 == 64) ||
(arg1 == 16 && arg2 == 64)) {
if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; }
if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
} else if (arg1 == 4 && arg2 == 64) {
if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; }
if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
} else { _gf_errno = GF_E_SP_64AR; return 0; }
} else if (w == 128) {
if (arg1 == 8 && arg2 == 128) {
if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; }
if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
} else if (arg1 == 4 && arg2 == 128) {
if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; }
if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
} else { _gf_errno = GF_E_SP128AR; return 0; }
} else { _gf_errno = GF_E_SPLIT_W; return 0; }
return 1;
@ -395,7 +401,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; }
if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
if (base != NULL) {
sub = (gf_internal_t *) base->scratch;
if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
@ -953,7 +959,42 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
}
return;
#endif
#if defined(ARM_NEON)
s8 = (uint8_t *) src;
d8 = (uint8_t *) dest;
if (uls % 16 == uld % 16) {
gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
while (s8 != rd.s_start) {
*d8 ^= *s8;
s8++;
d8++;
}
while (s8 < (uint8_t *) rd.s_top) {
uint8x16_t vs = vld1q_u8 (s8);
uint8x16_t vd = vld1q_u8 (d8);
uint8x16_t vr = veorq_u8 (vs, vd);
vst1q_u8 (d8, vr);
s8 += 16;
d8 += 16;
}
} else {
while (s8 + 15 < (uint8_t *) src + bytes) {
uint8x16_t vs = vld1q_u8 (s8);
uint8x16_t vd = vld1q_u8 (d8);
uint8x16_t vr = veorq_u8 (vs, vd);
vst1q_u8 (d8, vr);
s8 += 16;
d8 += 16;
}
}
while (s8 < (uint8_t *) src + bytes) {
*d8 ^= *s8;
s8++;
d8++;
}
return;
#endif
if (uls % 8 != uld % 8) {
gf_unaligned_xor(src, dest, bytes);
return;

View File

@ -121,11 +121,17 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting)
} else if (strcmp(argv[starting], "LAZY") == 0) {
region_type |= GF_REGION_LAZY;
starting++;
} else if (strcmp(argv[starting], "SIMD") == 0) {
region_type |= GF_REGION_SIMD;
starting++;
} else if (strcmp(argv[starting], "NOSIMD") == 0) {
region_type |= GF_REGION_NOSIMD;
starting++;
} else if (strcmp(argv[starting], "SSE") == 0) {
region_type |= GF_REGION_SSE;
region_type |= GF_REGION_SIMD;
starting++;
} else if (strcmp(argv[starting], "NOSSE") == 0) {
region_type |= GF_REGION_NOSSE;
region_type |= GF_REGION_NOSIMD;
starting++;
} else if (strcmp(argv[starting], "CAUCHY") == 0) {
region_type |= GF_REGION_CAUCHY;

View File

@ -1527,7 +1527,7 @@ int gf_w128_split_init(gf_t *gf)
gf->multiply.w128 = gf_w128_bytwo_p_multiply;
#if defined(INTEL_SSE4_PCLMUL)
if (!(h->region_type & GF_REGION_NOSSE)){
if (!(h->region_type & GF_REGION_NOSIMD)){
gf->multiply.w128 = gf_w128_clm_multiply;
}
#endif
@ -1546,7 +1546,7 @@ int gf_w128_split_init(gf_t *gf)
if((h->region_type & GF_REGION_ALTMAP))
{
#ifdef INTEL_SSE4
if(!(h->region_type & GF_REGION_NOSSE))
if(!(h->region_type & GF_REGION_NOSIMD))
gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region;
else
return 0;
@ -1556,7 +1556,7 @@ int gf_w128_split_init(gf_t *gf)
}
else {
#ifdef INTEL_SSE4
if(!(h->region_type & GF_REGION_NOSSE))
if(!(h->region_type & GF_REGION_NOSIMD))
gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region;
else
gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;

View File

@ -11,54 +11,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#define GF_FIELD_WIDTH (16)
#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
#define GF_BASE_FIELD_WIDTH (8)
#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
struct gf_w16_logtable_data {
uint16_t log_tbl[GF_FIELD_SIZE];
uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
uint16_t inv_tbl[GF_FIELD_SIZE];
uint16_t *d_antilog;
};
struct gf_w16_zero_logtable_data {
int log_tbl[GF_FIELD_SIZE];
uint16_t _antilog_tbl[GF_FIELD_SIZE * 4];
uint16_t *antilog_tbl;
uint16_t inv_tbl[GF_FIELD_SIZE];
};
struct gf_w16_lazytable_data {
uint16_t log_tbl[GF_FIELD_SIZE];
uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
uint16_t inv_tbl[GF_FIELD_SIZE];
uint16_t *d_antilog;
uint16_t lazytable[GF_FIELD_SIZE];
};
struct gf_w16_bytwo_data {
uint64_t prim_poly;
uint64_t mask1;
uint64_t mask2;
};
struct gf_w16_split_8_8_data {
uint16_t tables[3][256][256];
};
struct gf_w16_group_4_4_data {
uint16_t reduce[16];
uint16_t shift[16];
};
struct gf_w16_composite_data {
uint8_t *mult_table;
};
#include "gf_w16.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@ -1264,6 +1217,7 @@ int gf_w16_split_init(gf_t *gf)
gf_internal_t *h;
struct gf_w16_split_8_8_data *d8;
int i, j, exp, issse3;
int isneon = 0;
uint32_t p, basep;
h = (gf_internal_t *) gf->scratch;
@ -1273,6 +1227,9 @@ int gf_w16_split_init(gf_t *gf)
#else
issse3 = 0;
#endif
#ifdef ARM_NEON
isneon = 1;
#endif
if (h->arg1 == 8 && h->arg2 == 8) {
d8 = (struct gf_w16_split_8_8_data *) h->private;
@ -1317,6 +1274,10 @@ int gf_w16_split_init(gf_t *gf)
if (issse3) {
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region;
} else if (isneon) {
#ifdef ARM_NEON
gf_w16_neon_split_init(gf);
#endif
} else {
gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
}
@ -1326,15 +1287,15 @@ int gf_w16_split_init(gf_t *gf)
gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
} else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
if (issse3) {
if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE)
if (issse3 || isneon) {
if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
else if(h->region_type & GF_REGION_NOSSE)
else if(h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
else if(h->region_type & GF_REGION_ALTMAP)
else if(h->region_type & GF_REGION_ALTMAP && issse3)
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region;
} else {
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
else if(h->region_type & GF_REGION_ALTMAP)
gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
@ -1884,25 +1845,25 @@ int gf_w16_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w16_bytwo_p_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w32 = gf_w16_bytwo_b_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}

View File

@ -12,59 +12,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#define GF_FIELD_WIDTH (32)
#define GF_FIRST_BIT (1 << 31)
#define GF_BASE_FIELD_WIDTH (16)
#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
struct gf_split_2_32_lazy_data {
uint32_t tables[16][4];
uint32_t last_value;
};
struct gf_w32_split_8_8_data {
uint32_t tables[7][256][256];
uint32_t region_tables[4][256];
uint32_t last_value;
};
struct gf_w32_group_data {
uint32_t *reduce;
uint32_t *shift;
int tshift;
uint64_t rmask;
uint32_t *memory;
};
struct gf_split_16_32_lazy_data {
uint32_t tables[2][(1<<16)];
uint32_t last_value;
};
struct gf_split_8_32_lazy_data {
uint32_t tables[4][256];
uint32_t last_value;
};
struct gf_split_4_32_lazy_data {
uint32_t tables[8][16];
uint32_t last_value;
};
struct gf_w32_bytwo_data {
uint64_t prim_poly;
uint64_t mask1;
uint64_t mask2;
};
struct gf_w32_composite_data {
uint16_t *log;
uint16_t *alog;
};
#include "gf_w32.h"
#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
@ -1434,25 +1382,25 @@ int gf_w32_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w32_bytwo_p_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w32 = gf_w32_bytwo_b_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@ -2283,6 +2231,7 @@ int gf_w32_split_init(gf_t *gf)
struct gf_split_16_32_lazy_data *d16;
uint32_t p, basep;
int i, j, exp, ispclmul, issse3;
int isneon = 0;
#if defined(INTEL_SSE4_PCLMUL)
ispclmul = 1;
@ -2295,6 +2244,9 @@ int gf_w32_split_init(gf_t *gf)
#else
issse3 = 0;
#endif
#ifdef ARM_NEON
isneon = 1;
#endif
h = (gf_internal_t *) gf->scratch;
@ -2335,13 +2287,13 @@ int gf_w32_split_init(gf_t *gf)
ld2 = (struct gf_split_2_32_lazy_data *) h->private;
ld2->last_value = 0;
#ifdef INTEL_SSSE3
if (!(h->region_type & GF_REGION_NOSSE))
if (!(h->region_type & GF_REGION_NOSIMD))
gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region;
else
gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
#else
gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
if(h->region_type & GF_REGION_SSE) return 0;
if(h->region_type & GF_REGION_SIMD) return 0;
#endif
return 1;
}
@ -2349,11 +2301,15 @@ int gf_w32_split_init(gf_t *gf)
/* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
(issse3 && h->mult_type == GF_REGION_DEFAULT)) {
((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) {
ld4 = (struct gf_split_4_32_lazy_data *) h->private;
ld4->last_value = 0;
if ((h->region_type & GF_REGION_NOSSE) || !issse3) {
if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
} else if (isneon) {
#ifdef ARM_NEON
gf_w32_neon_split_init(gf);
#endif
} else if (h->region_type & GF_REGION_ALTMAP) {
gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
} else {
@ -2731,10 +2687,14 @@ int gf_w32_composite_init(gf_t *gf)
int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
int issse3 = 0;
int isneon = 0;
#ifdef INTEL_SSSE3
issse3 = 1;
#endif
#ifdef ARM_NEON
isneon = 1;
#endif
switch(mult_type)
{
@ -2760,7 +2720,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg
return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
}
if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
(mult_type == GF_MULT_DEFAULT && !issse3)) {
(mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
}
if ((arg1 == 4 && arg2 == 32) ||

View File

@ -11,49 +11,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#define GF_FIELD_WIDTH 4
#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)
#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)
/* ------------------------------------------------------------
JSP: Each implementation has its own data, which is allocated
at one time as part of the handle. For that reason, it
shouldn't be hierarchical -- i.e. one should be able to
allocate it with one call to malloc. */
struct gf_logtable_data {
uint8_t log_tbl[GF_FIELD_SIZE];
uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
uint8_t *antilog_tbl_div;
};
struct gf_single_table_data {
uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
};
struct gf_double_table_data {
uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
};
struct gf_quad_table_data {
uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint16_t mult[GF_FIELD_SIZE][(1<<16)];
};
struct gf_quad_table_lazy_data {
uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint16_t mult[(1 << 16)];
};
struct gf_bytwo_data {
uint64_t prim_poly;
uint64_t mask1;
uint64_t mask2;
};
#include "gf_w4.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
@ -489,14 +447,18 @@ int gf_w4_single_table_init(gf_t *gf)
gf->inverse.w32 = NULL;
gf->divide.w32 = gf_w4_single_table_divide;
gf->multiply.w32 = gf_w4_single_table_multiply;
#ifdef INTEL_SSSE3
if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY))
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
else
#if defined(INTEL_SSSE3)
gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
#elif defined(ARM_NEON)
gf_w4_neon_single_table_init(gf);
#endif
#else
gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
if (h->region_type & GF_REGION_SSE) return 0;
if (h->region_type & GF_REGION_SIMD) return 0;
#endif
return 1;
@ -774,16 +736,16 @@ int gf_w4_table_init(gf_t *gf)
{
int rt;
gf_internal_t *h;
int issse3 = 0;
int simd = 0;
#ifdef INTEL_SSSE3
issse3 = 1;
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
simd = 1;
#endif
h = (gf_internal_t *) gf->scratch;
rt = (h->region_type);
if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE;
if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
if (rt & GF_REGION_DOUBLE_TABLE) {
return gf_w4_double_table_init(gf);
@ -1905,25 +1867,25 @@ int gf_w4_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w4_bytwo_p_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w4_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w4_bytwo_p_nosse_multiply_region;
if (h->region_type & GF_REGION_SSE)
if (h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w32 = gf_w4_bytwo_b_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w4_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w4_bytwo_b_nosse_multiply_region;
if (h->region_type & GF_REGION_SSE)
if (h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@ -1937,6 +1899,8 @@ int gf_w4_cfm_init(gf_t *gf)
#if defined(INTEL_SSE4_PCLMUL)
gf->multiply.w32 = gf_w4_clm_multiply;
return 1;
#elif defined(ARM_NEON)
return gf_w4_neon_cfm_init(gf);
#endif
return 0;
}
@ -1953,11 +1917,14 @@ int gf_w4_shift_init(gf_t *gf)
int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
int issse3 = 0;
int issse3 = 0, isneon = 0;
#ifdef INTEL_SSSE3
issse3 = 1;
#endif
#ifdef ARM_NEON
isneon = 1;
#endif
switch(mult_type)
{
@ -1971,7 +1938,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
}
if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE;
if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
region_type = GF_REGION_DOUBLE_TABLE;
if (region_type & GF_REGION_DOUBLE_TABLE) {
return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;

View File

@ -11,38 +11,7 @@
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#define GF_FIELD_WIDTH (64)
#define GF_FIRST_BIT (1ULL << 63)
#define GF_BASE_FIELD_WIDTH (32)
#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
struct gf_w64_group_data {
uint64_t *reduce;
uint64_t *shift;
uint64_t *memory;
};
struct gf_split_4_64_lazy_data {
uint64_t tables[16][16];
uint64_t last_value;
};
struct gf_split_8_64_lazy_data {
uint64_t tables[8][(1<<8)];
uint64_t last_value;
};
struct gf_split_16_64_lazy_data {
uint64_t tables[4][(1<<16)];
uint64_t last_value;
};
struct gf_split_8_8_data {
uint64_t tables[15][256][256];
};
#include "gf_w64.h"
static
inline
@ -1488,25 +1457,25 @@ int gf_w64_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w64 = gf_w64_bytwo_p_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w64 = gf_w64_bytwo_b_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@ -2006,7 +1975,7 @@ int gf_w64_split_init(gf_t *gf)
gf->multiply.w64 = gf_w64_bytwo_p_multiply;
#if defined(INTEL_SSE4_PCLMUL)
if ((!(h->region_type & GF_REGION_NOSSE) &&
if ((!(h->region_type & GF_REGION_NOSIMD) &&
(h->arg1 == 64 || h->arg2 == 64)) ||
h->mult_type == GF_MULT_DEFAULT){
@ -2027,11 +1996,15 @@ int gf_w64_split_init(gf_t *gf)
/* Allen: set region pointers for default mult type. Single pointers are
* taken care of above (explicitly for sse, implicitly for no sse). */
#ifdef INTEL_SSE4
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
if (h->mult_type == GF_MULT_DEFAULT) {
d4 = (struct gf_split_4_64_lazy_data *) h->private;
d4->last_value = 0;
#if defined(INTEL_SSE4)
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
#elif defined(ARCH_AARCH64)
gf_w64_neon_split_init(gf);
#endif
}
#else
if (h->mult_type == GF_MULT_DEFAULT) {
@ -2045,25 +2018,31 @@ int gf_w64_split_init(gf_t *gf)
d4 = (struct gf_split_4_64_lazy_data *) h->private;
d4->last_value = 0;
if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSSE)) return 0;
if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0;
if(h->region_type & GF_REGION_ALTMAP)
{
#ifdef INTEL_SSSE3
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region;
#elif defined(ARCH_AARCH64)
gf_w64_neon_split_init(gf);
#else
return 0;
#endif
}
else //no altmap
{
#ifdef INTEL_SSE4
if(h->region_type & GF_REGION_NOSSE)
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
if(h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
else
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
#if defined(INTEL_SSE4)
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
#elif defined(ARCH_AARCH64)
gf_w64_neon_split_init(gf);
#endif
#else
gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@ -2134,7 +2113,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
/* Allen: set the *local* arg1 and arg2, just for scratch size purposes,
* then fall through to split table scratch size code. */
#ifdef INTEL_SSE4
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
arg1 = 64;
arg2 = 4;
#else

View File

@ -9,88 +9,10 @@
*/
#include "gf_int.h"
#include "gf_w8.h"
#include <stdio.h>
#include <stdlib.h>
#define GF_FIELD_WIDTH (8)
#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2))
#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
#define GF_BASE_FIELD_WIDTH (4)
#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
struct gf_w8_logtable_data {
uint8_t log_tbl[GF_FIELD_SIZE];
uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
uint8_t inv_tbl[GF_FIELD_SIZE];
};
struct gf_w8_logzero_table_data {
short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
uint8_t antilog_tbl[512+512+1];
uint8_t *div_tbl;
uint8_t *inv_tbl;
};
struct gf_w8_logzero_small_table_data {
short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
uint8_t antilog_tbl[255*3];
uint8_t inv_tbl[GF_FIELD_SIZE];
uint8_t *div_tbl;
};
struct gf_w8_composite_data {
uint8_t *mult_table;
};
/* Don't change the order of these relative to gf_w8_half_table_data */
struct gf_w8_default_data {
uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
};
struct gf_w8_half_table_data {
uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
};
struct gf_w8_single_table_data {
uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
};
struct gf_w8_double_table_data {
uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
};
struct gf_w8_double_table_lazy_data {
uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
};
struct gf_w4_logtable_data {
uint8_t log_tbl[GF_BASE_FIELD_SIZE];
uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2];
uint8_t *antilog_tbl_div;
};
struct gf_w4_single_table_data {
uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
};
struct gf_w8_bytwo_data {
uint64_t prim_poly;
uint64_t mask1;
uint64_t mask2;
};
#define AB2(ip, am1 ,am2, b, t1, t2) {\
t1 = (b << 1) & am1;\
t2 = b & am2; \
@ -603,6 +525,8 @@ int gf_w8_cfm_init(gf_t *gf)
return 0;
}
return 1;
#elif defined(ARM_NEON)
return gf_w8_neon_cfm_init(gf);
#endif
return 0;
@ -938,7 +862,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
return (ftd->multtable[a][b]);
}
#ifdef INTEL_SSSE3
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
static
gf_val_32_t
gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
@ -1179,14 +1103,18 @@ int gf_w8_split_init(gf_t *gf)
gf->multiply.w32 = gf_w8_split_multiply;
#ifdef INTEL_SSSE3
if (h->region_type & GF_REGION_NOSSE)
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w8_split_multiply_region;
else
#if defined(INTEL_SSSE3)
gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
#elif defined(ARM_NEON)
gf_w8_neon_split_init(gf);
#endif
#else
gf->multiply_region.w32 = gf_w8_split_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
@ -1205,17 +1133,17 @@ int gf_w8_table_init(gf_t *gf)
struct gf_w8_double_table_data *dtd = NULL;
struct gf_w8_double_table_lazy_data *ltd = NULL;
struct gf_w8_default_data *dd = NULL;
int a, b, c, prod, scase, issse;
int a, b, c, prod, scase, use_simd;
h = (gf_internal_t *) gf->scratch;
#ifdef INTEL_SSSE3
issse = 1;
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
use_simd = 1;
#else
issse = 0;
use_simd = 0;
#endif
if (h->mult_type == GF_MULT_DEFAULT && issse) {
if (h->mult_type == GF_MULT_DEFAULT && use_simd) {
dd = (struct gf_w8_default_data *)h->private;
scase = 3;
bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
@ -1290,10 +1218,14 @@ int gf_w8_table_init(gf_t *gf)
gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
break;
case 3:
#ifdef INTEL_SSSE3
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
gf->divide.w32 = gf_w8_default_divide;
gf->multiply.w32 = gf_w8_default_multiply;
#if defined(INTEL_SSSE3)
gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
#elif defined(ARM_NEON)
gf_w8_neon_split_init(gf);
#endif
#endif
break;
}
@ -2259,25 +2191,25 @@ int gf_w8_bytwo_init(gf_t *gf)
if (h->mult_type == GF_MULT_BYTWO_p) {
gf->multiply.w32 = gf_w8_bytwo_p_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w8_bytwo_p_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w8_bytwo_p_nosse_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
} else {
gf->multiply.w32 = gf_w8_bytwo_b_multiply;
#ifdef INTEL_SSE2
if (h->region_type & GF_REGION_NOSSE)
if (h->region_type & GF_REGION_NOSIMD)
gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
else
gf->multiply_region.w32 = gf_w8_bytwo_b_sse_multiply_region;
#else
gf->multiply_region.w32 = gf_w8_bytwo_b_nosse_multiply_region;
if(h->region_type & GF_REGION_SSE)
if(h->region_type & GF_REGION_SIMD)
return 0;
#endif
}
@ -2296,7 +2228,7 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1
switch(mult_type)
{
case GF_MULT_DEFAULT:
#ifdef INTEL_SSSE3
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
#endif
return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;

356
src/neon/gf_w16_neon.c Normal file
View File

@ -0,0 +1,356 @@
/*
* GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
* James S. Plank, Ethan L. Miller, Kevin M. Greenan,
* Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
*
* Copyright (c) 2014: Janne Grunau <j@jannau.net>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of the University of Tennessee nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*
* gf_w16_neon.c
*
* Neon routines for 16-bit Galois fields
*
*/
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w16.h"
#ifdef ARCH_AARCH64
/*
 * AArch64 kernel for the w=16 "split 4,16" lazy-table multiply_region.
 *
 * tbl holds 8 sub-tables of 16 bytes each: the first 4*16 bytes give the
 * LOW result byte for each possible value of source nibble i (i = 0..3),
 * and the next 4*16 bytes ('high') give the corresponding HIGH result
 * byte.  The product of a 16-bit word is the XOR of the four per-nibble
 * table entries.  gf and val are unused here (the caller already built
 * tbl from val); they are kept so all region kernels share one signature.
 *
 * Processes 16 words (32 bytes) per iteration; the caller guarantees
 * [dst, d_end) is a multiple of 32 bytes (region alignment of 32).
 */
static
inline
void
neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
uint16_t *d_end, uint8_t *tbl,
gf_val_32_t val, int xor)
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
uint16x8_t va0, va1, r0, r1;
uint8x16_t loset, rl, rh;
uint8x16x2_t va;
uint8x16_t tbl_h[4], tbl_l[4];
/* Preload the 4 low-byte and 4 high-byte nibble tables into registers. */
for (i = 0; i < 4; i++) {
tbl_l[i] = vld1q_u8(tbl + i*16);
tbl_h[i] = vld1q_u8(high + i*16);
}
loset = vdupq_n_u8(0xf); /* mask selecting the low nibble of each byte */
while (dst < d_end) {
va0 = vld1q_u16(src);
va1 = vld1q_u16(src + 8);
/* Pairwise transpose separates even-indexed bytes (val[0]) from
 * odd-indexed bytes (val[1]) of the 16-bit words — i.e. the low and
 * high byte of each word on a little-endian target (assumed here;
 * the trailing vtrnq restores the original byte order). */
va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
/* Accumulate results: tables 0/1 are indexed by the nibbles of the
 * low byte, tables 2/3 by the nibbles of the high byte; rl collects
 * low result bytes, rh high result bytes. */
rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
va.val[0] = vshrq_n_u8(va.val[0], 4);
va.val[1] = vshrq_n_u8(va.val[1], 4);
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
/* Re-interleave low/high result bytes back into 16-bit words. */
va = vtrnq_u8(rl, rh);
r0 = vreinterpretq_u16_u8(va.val[0]);
r1 = vreinterpretq_u16_u8(va.val[1]);
if (xor) {
/* XOR mode: fold the products into the existing destination. */
va0 = vld1q_u16(dst);
va1 = vld1q_u16(dst + 8);
r0 = veorq_u16(r0, va0);
r1 = veorq_u16(r1, va1);
}
vst1q_u16(dst, r0);
vst1q_u16(dst + 8, r1);
src += 16;
dst += 16;
}
}
/*
 * AArch64 ALTMAP kernel for the w=16 "split 4,16" multiply_region.
 *
 * In ALTMAP layout each 32-byte chunk stores the 16 HIGH bytes of 16
 * words first (loaded into vh), followed by their 16 LOW bytes (vl) —
 * established by the table indexing below: tables 0/1 (low-byte nibbles)
 * are indexed by vl, tables 2/3 (high-byte nibbles) by vh, and results
 * are stored high plane first.  This avoids the byte de-/re-interleaving
 * needed by the standard-layout kernel.  tbl layout and the unused
 * gf/val parameters are as in neon_w16_split_4_multiply_region.
 */
static
inline
void
neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
uint8_t *dst, uint8_t *d_end,
uint8_t *tbl, gf_val_32_t val,
int xor)
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
uint8x16_t vh, vl, rh, rl;
uint8x16_t loset;
uint8x16_t tbl_h[4], tbl_l[4];
/* Preload the 4 low-byte and 4 high-byte nibble tables. */
for (i = 0; i < 4; i++) {
tbl_l[i] = vld1q_u8(tbl + i*16);
tbl_h[i] = vld1q_u8(high + i*16);
}
loset = vdupq_n_u8(0xf); /* low-nibble mask */
while (dst < d_end) {
vh = vld1q_u8(src);        /* high-byte plane */
vl = vld1q_u8(src + 16);   /* low-byte plane */
/* Each result plane XORs contributions from all four source nibbles:
 * low nibbles first, then the shifted-down high nibbles. */
rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
vl = vshrq_n_u8(vl, 4);
vh = vshrq_n_u8(vh, 4);
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
if (xor) {
/* XOR mode: fold into the existing destination planes. */
vh = vld1q_u8(dst);
vl = vld1q_u8(dst + 16);
rh = veorq_u8(rh, vh);
rl = veorq_u8(rl, vl);
}
/* Store high plane first, then low plane, matching the input layout. */
vst1q_u8(dst, rh);
vst1q_u8(dst + 16, rl);
src += 32;
dst += 32;
}
}
#else /* ARCH_AARCH64 */
/*
 * ARMv7 (32-bit NEON) kernel for the w=16 "split 4,16" multiply_region.
 *
 * Same algorithm as the AArch64 variant, but restricted to 64-bit vector
 * table lookups: each 16-byte sub-table is held as a uint8x8x2_t for
 * vtbl2_u8.  Low/high bytes of the 16-bit words are separated with
 * vmovn (low) and vshrn #8 (high) instead of a transpose, and recombined
 * with vmovl/vshll.  Processes 8 words (16 bytes) per iteration; the
 * caller guarantees region alignment.  gf and val are unused here.
 */
static
inline
void
neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
uint16_t *d_end, uint8_t *tbl,
gf_val_32_t val, int xor)
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
uint16x8_t va, r;
uint8x8_t loset, vb, vc, rl, rh;
uint8x8x2_t tbl_h[4], tbl_l[4];
/* Preload each 16-entry nibble table as a pair of 8-byte halves. */
for (i = 0; i < 4; i++) {
tbl_l[i].val[0] = vld1_u8(tbl + i*16);
tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
tbl_h[i].val[0] = vld1_u8(high + i*16);
tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
}
loset = vdup_n_u8(0xf); /* low-nibble mask */
while (dst < d_end) {
va = vld1q_u16(src);
vb = vmovn_u16(va);      /* low bytes of the 8 words */
vc = vshrn_n_u16(va, 8); /* high bytes of the 8 words */
/* Tables 0/1: nibbles of the low byte; tables 2/3: high byte. */
rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
vb = vshr_n_u8(vb, 4);
rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
vc = vshr_n_u8(vc, 4);
rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
/* Widen and merge: result word = rl | (rh << 8). */
r = vmovl_u8(rl);
r = vorrq_u16(r, vshll_n_u8(rh, 8));
if (xor) {
va = vld1q_u16(dst);
r = veorq_u16(r, va);
}
vst1q_u16(dst, r);
src += 8;
dst += 8;
}
}
/*
 * ARMv7 (32-bit NEON) ALTMAP kernel for the w=16 "split 4,16"
 * multiply_region.
 *
 * ALTMAP layout per 32-byte chunk: the 16 HIGH bytes of 16 words first
 * (vh0/vh1), then their 16 LOW bytes (vl0/vl1) — the same layout the
 * AArch64 variant consumes.  Each output byte is the XOR of FOUR table
 * entries: tables 0/1 indexed by the nibbles of the low-byte plane and
 * tables 2/3 by the nibbles of the high-byte plane.
 *
 * BUG FIX: the previous version performed only 8 of the required 16
 * lookups per chunk — it stored the low-plane and high-plane nibble
 * contributions into DIFFERENT destination planes instead of XOR-ing
 * them together, so it computed different products than the AArch64
 * ALTMAP kernel for identical input.  This version mirrors the AArch64
 * kernel exactly, split into 8-byte halves for vtbl2_u8.
 *
 * gf and val are unused here (tables already built); kept for the
 * uniform region-kernel signature.
 */
static
inline
void
neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
uint8_t *dst, uint8_t *d_end,
uint8_t *tbl, gf_val_32_t val,
int xor)
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
uint8x8_t vh0, vh1, vl0, vl1, rh0, rh1, rl0, rl1;
uint8x8_t loset;
uint8x8x2_t tbl_h[4], tbl_l[4];
/* Preload each 16-entry nibble table as a pair of 8-byte halves. */
for (i = 0; i < 4; i++) {
tbl_l[i].val[0] = vld1_u8(tbl + i*16);
tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
tbl_h[i].val[0] = vld1_u8(high + i*16);
tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
}
loset = vdup_n_u8(0xf); /* low-nibble mask */
while (dst < d_end) {
vh0 = vld1_u8(src);        /* high-byte plane, words 0-7  */
vh1 = vld1_u8(src + 8);    /* high-byte plane, words 8-15 */
vl0 = vld1_u8(src + 16);   /* low-byte plane,  words 0-7  */
vl1 = vld1_u8(src + 24);   /* low-byte plane,  words 8-15 */
/* Low nibbles: tables 0 (low-byte plane) and 2 (high-byte plane). */
rl0 = vtbl2_u8(tbl_l[0], vand_u8(vl0, loset));
rl1 = vtbl2_u8(tbl_l[0], vand_u8(vl1, loset));
rh0 = vtbl2_u8(tbl_h[0], vand_u8(vl0, loset));
rh1 = vtbl2_u8(tbl_h[0], vand_u8(vl1, loset));
rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[2], vand_u8(vh0, loset)));
rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[2], vand_u8(vh1, loset)));
rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[2], vand_u8(vh0, loset)));
rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[2], vand_u8(vh1, loset)));
/* High nibbles: tables 1 and 3. */
vl0 = vshr_n_u8(vl0, 4);
vl1 = vshr_n_u8(vl1, 4);
vh0 = vshr_n_u8(vh0, 4);
vh1 = vshr_n_u8(vh1, 4);
rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[1], vl0));
rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[1], vl1));
rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[1], vl0));
rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[1], vl1));
rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[3], vh0));
rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[3], vh1));
rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[3], vh0));
rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[3], vh1));
if (xor) {
/* XOR mode: fold into the existing destination planes. */
vh0 = vld1_u8(dst);
vh1 = vld1_u8(dst + 8);
vl0 = vld1_u8(dst + 16);
vl1 = vld1_u8(dst + 24);
rh0 = veor_u8(rh0, vh0);
rh1 = veor_u8(rh1, vh1);
rl0 = veor_u8(rl0, vl0);
rl1 = veor_u8(rl1, vl1);
}
/* Store high plane first, then low plane, matching the input layout. */
vst1_u8(dst, rh0);
vst1_u8(dst + 8, rh1);
vst1_u8(dst + 16, rl0);
vst1_u8(dst + 24, rl1);
src += 32;
dst += 32;
}
}
#endif /* ARCH_AARCH64 */
/*
 * Shared driver for the NEON w=16 split 4,16 region multiply.
 *
 * Lazily builds the 8 nibble tables for `val` (tbl[i*16+j] = low byte and
 * high[i*16+j] = high byte of (j << 4i) * val in GF(2^16)), then aligns
 * the region to a 32-byte middle section and dispatches to the standard
 * or ALTMAP kernel.  The unaligned head/tail are handled word-by-word by
 * gf_do_initial/final_region_alignment.
 *
 * val == 0 and val == 1 short-circuit through the generic helpers.
 * The xor test is hoisted out of the kernels so each specialization is
 * compiled with a constant xor flag.
 */
static
inline
void
neon_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest,
gf_val_32_t val, int bytes, int xor,
int altmap)
{
gf_region_data rd;
unsigned i, j;
uint64_t c, prod;
uint8_t tbl[2 * 4 * 16];
uint8_t *high = tbl + 4 * 16;
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
/* Build the per-nibble product tables for this val. */
for (i = 0; i < 4; i++) {
for (j = 0; j < 16; j++) {
c = (j << (i*4));
prod = gf->multiply.w32(gf, c, val);
tbl[i*16 + j] = prod & 0xff;
high[i*16 + j] = prod >> 8;
}
}
/* Carve out a 32-byte-aligned middle region for the vector kernels. */
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
gf_do_initial_region_alignment(&rd);
if (altmap) {
uint8_t *s8 = rd.s_start;
uint8_t *d8 = rd.d_start;
uint8_t *end8 = rd.d_top;
if (xor)
neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 1);
else
neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 0);
} else {
uint16_t *s16 = rd.s_start;
uint16_t *d16 = rd.d_start;
uint16_t *end16 = rd.d_top;
if (xor)
neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 1);
else
neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 0);
}
gf_do_final_region_alignment(&rd);
}
static
void
gf_w16_split_4_16_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
                                            gf_val_32_t val, int bytes, int xor)
{
  /* Standard-layout entry point: run the lazy SPLIT(16,4) region multiply
     with altmap disabled. */
  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
}
static
void
gf_w16_split_4_16_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
                                                   void *dest,
                                                   gf_val_32_t val, int bytes,
                                                   int xor)
{
  /* ALTMAP entry point: run the lazy SPLIT(16,4) region multiply on data
     already converted to the alternate (byte-plane) layout. */
  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
}
void gf_w16_neon_split_init(gf_t *gf)
{
  /* Install the region-multiply routine matching the layout that was
     requested when the field was created. */
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  gf->multiply_region.w32 = (h->region_type & GF_REGION_ALTMAP)
      ? gf_w16_split_4_16_lazy_altmap_multiply_region_neon
      : gf_w16_split_4_16_lazy_multiply_region_neon;
}

269
src/neon/gf_w32_neon.c Normal file
View File

@ -0,0 +1,269 @@
/*
* GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
* James S. Plank, Ethan L. Miller, Kevin M. Greenan,
* Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
*
* Copyright (c) 2014: Janne Grunau <j@jannau.net>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of the University of Tennessee nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* gf_w32_neon.c
*
* Neon routines for 32-bit Galois fields
*
*/
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w32.h"
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
vtbl2_u8(tbl, vget_high_u8(v)))
#endif
static
void
neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
                                    uint32_t *d_end, uint8_t btable[8][4][16],
                                    uint32_t val, int xor, int altmap)
{
  /* NEON worker for SPLIT(32,4): multiplies 16 32-bit words per iteration.
   * btable[i][j][k] is byte j of the product (k << 4*i) * val, so each
   * output byte plane is the XOR of eight table lookups, one per source
   * nibble.  With altmap the input is already stored as byte planes;
   * otherwise vtrnq transposes word-interleaved data into byte planes on
   * the way in and back out again.
   */
  int i, j;
#ifdef ARCH_AARCH64
  uint8x16_t tables[8][4];
#else
  /* 32-bit NEON has no vqtbl1q_u8; the macro above emulates it with two
     vtbl2_u8 calls, so the tables are kept as paired d-registers. */
  uint8x8x2_t tables[8][4];
#endif
  uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
  uint8x16_t p0, p1, p2, p3, si, mask1;
  uint16x8x2_t r0, r1;
  uint8x16x2_t q0, q1;

  /* Hoist all 8x4 lookup tables into vector registers once per region. */
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 4; j++) {
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable[i][j]);
#else
      tables[i][j].val[0] = vld1_u8(btable[i][j]);
      tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);

  while (dst < d_end) {
    v0 = vld1q_u32(src); src += 4;
    v1 = vld1q_u32(src); src += 4;
    v2 = vld1q_u32(src); src += 4;
    v3 = vld1q_u32(src); src += 4;

    if (altmap) {
      /* Data already lives as byte planes: q0 = bytes 0/1, q1 = bytes 2/3. */
      q0.val[0] = vreinterpretq_u8_u32(v0);
      q0.val[1] = vreinterpretq_u8_u32(v1);
      q1.val[0] = vreinterpretq_u8_u32(v2);
      q1.val[1] = vreinterpretq_u8_u32(v3);
    } else {
      /* Transpose 16 interleaved words into four byte planes. */
      r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
      r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));
      q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
		    vreinterpretq_u8_u16(r1.val[0]));
      q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
		    vreinterpretq_u8_u16(r1.val[1]));
    }

    /* Accumulate the eight nibble contributions into output planes p0..p3. */
    si = vandq_u8(q0.val[0], mask1);            /* nibble 0 */
    p0 = vqtbl1q_u8(tables[0][0], si);
    p1 = vqtbl1q_u8(tables[0][1], si);
    p2 = vqtbl1q_u8(tables[0][2], si);
    p3 = vqtbl1q_u8(tables[0][3], si);

    si = vshrq_n_u8(q0.val[0], 4);              /* nibble 1 */
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));

    si = vandq_u8(q0.val[1], mask1);            /* nibble 2 */
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));

    si = vshrq_n_u8(q0.val[1], 4);              /* nibble 3 */
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));

    si = vandq_u8(q1.val[0], mask1);            /* nibble 4 */
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));

    si = vshrq_n_u8(q1.val[0], 4);              /* nibble 5 */
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));

    si = vandq_u8(q1.val[1], mask1);            /* nibble 6 */
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));

    si = vshrq_n_u8(q1.val[1], 4);              /* nibble 7 */
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));

    if (altmap) {
      /* Keep byte-plane layout on output. */
      s0 = vreinterpretq_u32_u8(p0);
      s1 = vreinterpretq_u32_u8(p1);
      s2 = vreinterpretq_u32_u8(p2);
      s3 = vreinterpretq_u32_u8(p3);
    } else {
      /* Transpose the byte planes back into interleaved 32-bit words. */
      q0 = vtrnq_u8(p0, p1);
      q1 = vtrnq_u8(p2, p3);
      r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
		     vreinterpretq_u16_u8(q1.val[0]));
      r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
		     vreinterpretq_u16_u8(q1.val[1]));
      s0 = vreinterpretq_u32_u16(r0.val[0]);
      s1 = vreinterpretq_u32_u16(r1.val[0]);
      s2 = vreinterpretq_u32_u16(r0.val[1]);
      s3 = vreinterpretq_u32_u16(r1.val[1]);
    }

    /* Optional accumulate-into-destination, then store 64 bytes. */
    if (xor) {
      v0 = vld1q_u32(dst);
      v1 = vld1q_u32(dst + 4);
      v2 = vld1q_u32(dst + 8);
      v3 = vld1q_u32(dst + 12);
      s0 = veorq_u32(s0, v0);
      s1 = veorq_u32(s1, v1);
      s2 = veorq_u32(s2, v2);
      s3 = veorq_u32(s3, v3);
    }

    vst1q_u32(dst,      s0);
    vst1q_u32(dst + 4,  s1);
    vst1q_u32(dst + 8,  s2);
    vst1q_u32(dst + 12, s3);

    dst += 16;
  }
}
static
inline
void
neon_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor, int altmap)
{
  /* Build the eight 16-entry product tables for val, slice them into byte
     planes, then hand the aligned middle of the region to the NEON worker. */
  gf_internal_t *h;
  int nib, byte, k, m;
  uint32_t poly, factor, *s32, *d32, *top, scratch[16];
  uint8_t btable[8][4][16];
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  poly = h->prim_poly;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
  gf_do_initial_region_alignment(&rd);

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  factor = val;
  for (nib = 0; nib < 8; nib++) {
    /* scratch[k] = (k << 4*nib) * val, built by repeated doubling. */
    scratch[0] = 0;
    for (k = 1; k < 16; k <<= 1) {
      for (m = 0; m < k; m++)
        scratch[m | k] = factor ^ scratch[m];
      factor = (factor & GF_FIRST_BIT) ? ((factor << 1) ^ poly) : (factor << 1);
    }
    /* Peel the 32-bit products into four 16-entry byte planes. */
    for (byte = 0; byte < 4; byte++) {
      for (k = 0; k < 16; k++) {
        btable[nib][byte][k] = (uint8_t) scratch[k];
        scratch[k] >>= 8;
      }
    }
  }

  if (xor)
    neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 1, altmap);
  else
    neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 0, altmap);

  gf_do_final_region_alignment(&rd);
}
static
void
gf_w32_split_4_32_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
                                            gf_val_32_t val, int bytes, int xor)
{
  /* Standard-layout entry point (altmap disabled). */
  neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
}
static
void
gf_w32_split_4_32_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
                                                   void *dest, gf_val_32_t val,
                                                   int bytes, int xor)
{
  /* ALTMAP entry point: input/output already stored as byte planes. */
  neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
}
void gf_w32_neon_split_init(gf_t *gf)
{
  /* Pick the region-multiply routine matching the requested layout. */
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  gf->multiply_region.w32 = (h->region_type & GF_REGION_ALTMAP)
      ? gf_w32_split_4_32_lazy_altmap_multiply_region_neon
      : gf_w32_split_4_32_lazy_multiply_region_neon;
}

247
src/neon/gf_w4_neon.c Normal file
View File

@ -0,0 +1,247 @@
/*
* GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
* James S. Plank, Ethan L. Miller, Kevin M. Greenan,
* Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
*
* Copyright (c) 2014: Janne Grunau <j@jannau.net>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of the University of Tennessee nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* gf_w4_neon.c
*
* Neon routines for 4-bit Galois fields
*
*/
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w4.h"
static
gf_val_32_t
gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
{
  /* Single GF(2^4) multiply via NEON carry-less polynomial multiply.
     The 8-bit product (degree <= 6) is reduced once by multiplying its
     top nibble by the primitive polynomial and xoring it back in.
     NOTE(review): one reduction step assumes the polynomial leaves enough
     leading zero coefficients; this matches the region kernel below and
     the polynomials this init is selected for — confirm against gf_w4.c. */
  gf_val_32_t rv = 0;
  poly8x8_t result, prim_poly;
  poly8x8_t a, b, w;
  uint8x8_t v;
  gf_internal_t * h = gf->scratch;

  a = vdup_n_p8 (a4);
  b = vdup_n_p8 (b4);
  /* Keep the x^4..x^0 coefficients of the primitive polynomial. */
  prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL));

  /* Do the initial multiply */
  result = vmul_p8 (a, b);
  /* Reduce: fold the bits above x^3 back down through prim_poly. */
  v = vshr_n_u8 (vreinterpret_u8_p8(result), 4);
  w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
  result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w)));

  /* Extracts 32 bit value from result. */
  rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0);

  return rv;
}
static inline void
neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8,
                                      gf_val_32_t val, uint8_t *d_end, int xor)
{
  /* Region multiply for GF(2^4): each byte packs two 4-bit values
     (low nibble = "even", high nibble = "odd").  Both nibbles are
     carry-less multiplied by val, reduced once, and repacked.
     The xor flag is expected to be a compile-time constant so the
     destination load folds away when unused. */
  gf_internal_t * h = gf->scratch;
  poly8x8_t prim_poly;
  poly8x8_t a, w, even, odd;
  uint8x8_t b, c, v, mask;

  a         = vdup_n_p8 (val);
  mask      = vdup_n_u8 (0xf);
  prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL));

  while (d8 < d_end) {
    b = vld1_u8 (s8);

    /* Split each byte into its two packed 4-bit operands. */
    even = vreinterpret_p8_u8 (vand_u8 (b, mask));
    odd  = vreinterpret_p8_u8 (vshr_n_u8 (b, 4));

    if (xor)
      c = vld1_u8 (d8);

    even = vmul_p8 (a, even);
    odd  = vmul_p8 (a, odd);

    /* One reduction step per half: fold bits above x^3 through prim_poly. */
    v = vshr_n_u8 (vreinterpret_u8_p8(even), 4);
    w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
    even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w)));

    v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4);
    w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
    odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w)));

    /* Repack the two 4-bit results into one byte. */
    v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4));

    if (xor)
      v = veor_u8 (c, v);

    vst1_u8 (d8, v);

    d8 += 8;
    s8 += 8;
  }
}
static void
gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest,
                                            gf_val_32_t val, int bytes, int xor)
{
  /* Region entry point: handle the trivial multipliers, align the region
     to 16 bytes, then run the carry-less-multiply kernel with a constant
     xor flag so the per-iteration branch can fold away. */
  gf_region_data rd;
  uint8_t *sp, *dp;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  sp = (uint8_t *) rd.s_start;
  dp = (uint8_t *) rd.d_start;

  if (xor)
    neon_clm_multiply_region_from_single (gf, sp, dp, val, rd.d_top, 1);
  else
    neon_clm_multiply_region_from_single (gf, sp, dp, val, rd.d_top, 0);

  gf_do_final_region_alignment(&rd);
}
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
vtbl2_u8(tbl, vget_high_u8(v)))
#endif
static
inline
void
w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst,
                                     uint8_t * d_end, gf_val_32_t val, int xor)
{
  /* GF(2^4) region multiply via a single 16-entry lookup table.
     tl maps the low nibble; th is the same table pre-shifted left 4 so
     the high-nibble product lands back in the high nibble. */
  struct gf_single_table_data *std;
  uint8_t *base;
  uint8x16_t r, va, vh, vl, loset;
#ifdef ARCH_AARCH64
  uint8x16_t th, tl;
#else
  uint8x8x2_t th, tl;
#endif

  std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  /* Row of the multiplication table for this val (16 entries per row). */
  base = (uint8_t *) std->mult;
  base += (val << GF_FIELD_WIDTH);

#ifdef ARCH_AARCH64
  tl = vld1q_u8 (base);
  th = vshlq_n_u8 (tl, 4);
#else
  tl.val[0] = vld1_u8 (base);
  tl.val[1] = vld1_u8 (base + 8);
  th.val[0] = vshl_n_u8 (tl.val[0], 4);
  th.val[1] = vshl_n_u8 (tl.val[1], 4);
#endif

  loset = vdupq_n_u8(0xf);

  while (dst < d_end) {
    va = vld1q_u8 (src);
    /* Split packed nibbles, look each half up, and recombine. */
    vh = vshrq_n_u8 (va, 4);
    vl = vandq_u8 (va, loset);

    if (xor)
      va = vld1q_u8 (dst);

    vh = vqtbl1q_u8 (th, vh);
    vl = vqtbl1q_u8 (tl, vl);

    r = veorq_u8 (vh, vl);

    if (xor)
      r = veorq_u8 (va, r);

    vst1q_u8 (dst, r);

    dst += 16;
    src += 16;
  }
}
static
void
gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest,
                                        gf_val_32_t val, int bytes, int xor)
{
  /* Region entry point for the single-table kernel: short-circuit the
     trivial multipliers, 16-byte-align the region, then dispatch with a
     constant xor flag. */
  gf_region_data rd;
  uint8_t *sp, *dp, *end;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  sp  = rd.s_start;
  dp  = rd.d_start;
  end = rd.d_top;

  if (xor)
    w4_single_table_multiply_region_neon(gf, sp, dp, end, val, 1);
  else
    w4_single_table_multiply_region_neon(gf, sp, dp, end, val, 0);

  gf_do_final_region_alignment(&rd);
}
/* Install the NEON carry-less-multiply implementations for GF(2^4).
   Always succeeds (returns 1). */
int gf_w4_neon_cfm_init(gf_t *gf)
{
  // single clm multiplication probably pointless
  gf->multiply.w32 = gf_w4_neon_clm_multiply;
  gf->multiply_region.w32 = gf_w4_neon_clm_multiply_region_from_single;

  return 1;
}
/* Install the NEON single-table region multiply for GF(2^4). */
void gf_w4_neon_single_table_init(gf_t *gf)
{
  gf->multiply_region.w32 = gf_w4_single_table_multiply_region_neon;
}

333
src/neon/gf_w64_neon.c Normal file
View File

@ -0,0 +1,333 @@
/*
* GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
* James S. Plank, Ethan L. Miller, Kevin M. Greenan,
* Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
*
* Copyright (c) 2014: Janne Grunau <j@jannau.net>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of the University of Tennessee nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* gf_w64_neon.c
*
* Neon routines for 64-bit Galois fields
*
*/
#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w64.h"
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
vtbl2_u8(tbl, vget_high_u8(v)))
#endif
static
inline
void
neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src,
                                             uint64_t *dst, uint64_t *d_end,
                                             uint64_t val, int xor)
{
  /* ALTMAP worker for SPLIT(64,4): data is stored as eight 16-byte planes
     per 128-byte chunk, so no shuffling is needed.  The val parameter is
     unused here — the caller has already expanded val into ld->tables. */
  unsigned i, j, k;
  uint8_t btable[16];
#ifdef ARCH_AARCH64
  uint8x16_t tables[16][8];
#else
  uint8x8x2_t tables[16][8];
#endif
  uint8x16_t p[8], mask1, si;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;

  /* Slice each 16-entry 64-bit product table into eight byte planes.
     NOTE: this consumes ld->tables in place (shifts them down to zero),
     so the caller must rebuild them before every call. */
  for (i = 0; i < 16; i++) {
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable);
#else
      tables[i][j].val[0] = vld1_u8(btable);
      tables[i][j].val[1] = vld1_u8(btable + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);

  while (dst < d_end) {
    /* p[j] accumulates output byte plane j; seed from dst when xoring. */
    if (xor) {
      for (i = 0; i < 8; i++)
        p[i] = vld1q_u8((uint8_t *) (dst + i * 2));
    } else {
      for (i = 0; i < 8; i++)
        p[i] = vdupq_n_u8(0);
    }

    i = 0;
    for (k = 0; k < 8; k++) {
      uint8x16_t v0 = vld1q_u8((uint8_t *) src);
      src += 2;

      /* Low then high nibble of source byte plane k — two of the 16
         nibble tables contribute to every output plane. */
      si = vandq_u8(v0, mask1);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
      }
      i++;

      si = vshrq_n_u8(v0, 4);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
      }
      i++;
    }

    for (i = 0; i < 8; i++) {
      vst1q_u8((uint8_t *) dst, p[i]);
      dst += 2;
    }
  }
}
static
inline
void
neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst,
                                      uint64_t *d_end, uint64_t val, int xor)
{
  /* Standard-layout worker for SPLIT(64,4): per 128-byte chunk it
     de-interleaves sixteen 64-bit words into eight byte planes with a
     vuzp cascade, does the 16-nibble table accumulation, then re-interleaves
     with the mirror vzip cascade.  val is unused — the caller has already
     expanded it into ld->tables. */
  unsigned i, j, k;
  uint8_t btable[16];
#ifdef ARCH_AARCH64
  uint8x16_t tables[16][8];
#else
  uint8x8x2_t tables[16][8];
#endif
  uint8x16_t p[8], mask1, si;
  uint64x2_t st[8];
  uint32x4x2_t s32[4];
  uint16x8x2_t s16[4];
  uint8x16x2_t  s8[4];

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;

  /* Slice the 64-bit product tables into byte planes (consumes ld->tables;
     the caller rebuilds them before each call). */
  for (i = 0; i < 16; i++) {
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable);
#else
      tables[i][j].val[0] = vld1_u8(btable);
      tables[i][j].val[1] = vld1_u8(btable + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);

  while (dst < d_end) {
    /* Load 16 words and zero the output accumulators. */
    for (k = 0; k < 8; k++) {
      st[k]  = vld1q_u64(src);
      src += 2;
      p[k] = vdupq_n_u8(0);
    }

    /* vuzp cascade: 64-bit words -> 32-bit -> 16-bit -> byte planes. */
    s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]),
                       vreinterpretq_u32_u64(st[1]));
    s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]),
                       vreinterpretq_u32_u64(st[3]));
    s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]),
                       vreinterpretq_u32_u64(st[5]));
    s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]),
                       vreinterpretq_u32_u64(st[7]));

    s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]),
                       vreinterpretq_u16_u32(s32[1].val[0]));
    s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]),
                       vreinterpretq_u16_u32(s32[3].val[0]));
    s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]),
                       vreinterpretq_u16_u32(s32[1].val[1]));
    s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]),
                       vreinterpretq_u16_u32(s32[3].val[1]));

    s8[0]  = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]),
                      vreinterpretq_u8_u16(s16[1].val[0]));
    s8[1]  = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]),
                      vreinterpretq_u8_u16(s16[1].val[1]));
    s8[2]  = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]),
                      vreinterpretq_u8_u16(s16[3].val[0]));
    s8[3]  = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]),
                      vreinterpretq_u8_u16(s16[3].val[1]));

    /* Accumulate: low and high nibble of each source byte plane feed all
       eight output byte planes (16 nibble tables total). */
    i = 0;
    for (k = 0; k < 8; k++) {
      si = vandq_u8(s8[k >> 1].val[k & 1], mask1);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
      }
      i++;
      si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
      }
      i++;
    }

    /* vzip cascade: byte planes back to interleaved 64-bit words. */
    s8[0] = vzipq_u8(p[0], p[1]);
    s8[1] = vzipq_u8(p[2], p[3]);
    s8[2] = vzipq_u8(p[4], p[5]);
    s8[3] = vzipq_u8(p[6], p[7]);

    s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]),
                       vreinterpretq_u16_u8(s8[1].val[0]));
    s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]),
                       vreinterpretq_u16_u8(s8[3].val[0]));
    s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]),
                       vreinterpretq_u16_u8(s8[1].val[1]));
    s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]),
                       vreinterpretq_u16_u8(s8[3].val[1]));

    s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]),
                       vreinterpretq_u32_u16(s16[1].val[0]));
    s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]),
                       vreinterpretq_u32_u16(s16[1].val[1]));
    s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]),
                       vreinterpretq_u32_u16(s16[3].val[0]));
    s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]),
                       vreinterpretq_u32_u16(s16[3].val[1]));

    for (k = 0; k < 8; k ++) {
      st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]);
    }

    /* Store, xoring against the destination when requested. */
    if (xor) {
      for (i = 0; i < 8; i++) {
        uint64x2_t t1 = vld1q_u64(dst);
        vst1q_u64(dst, veorq_u64(st[i], t1));
        dst += 2;
      }
    } else {
      for (i = 0; i < 8; i++) {
        vst1q_u64(dst, st[i]);
        dst += 2;
      }
    }
  }
}
static
void
gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest,
                                         uint64_t val, int bytes, int xor,
                                         int altmap)
{
  /* Build the sixteen 16-entry nibble tables for val in ld->tables, then
     run the matching NEON worker over the aligned middle of the region. */
  gf_internal_t *h;
  int nib, bit, idx;
  uint64_t poly, factor, *s64, *d64, *top;
  struct gf_split_4_64_lazy_data *ld;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  h = (gf_internal_t *) gf->scratch;
  poly = h->prim_poly;
  ld = (struct gf_split_4_64_lazy_data *) h->private;

  /* tables[nib][k] = (k << 4*nib) * val, built by repeated doubling. */
  factor = val;
  for (nib = 0; nib < 16; nib++) {
    ld->tables[nib][0] = 0;
    for (bit = 1; bit < 16; bit <<= 1) {
      for (idx = 0; idx < bit; idx++)
        ld->tables[nib][idx ^ bit] = factor ^ ld->tables[nib][idx];
      factor = (factor & GF_FIRST_BIT) ? ((factor << 1) ^ poly) : (factor << 1);
    }
  }

  if (altmap) {
    if (xor)
      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1);
    else
      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0);
  } else {
    if (xor)
      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1);
    else
      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0);
  }

  gf_do_final_region_alignment(&rd);
}
static
void
gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
                                            uint64_t val, int bytes, int xor)
{
  /* Standard-layout entry point (altmap disabled). */
  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
}
static
void
gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
                                                   void *dest, uint64_t val,
                                                   int bytes, int xor)
{
  /* ALTMAP entry point: input/output already stored as byte planes. */
  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
}
void gf_w64_neon_split_init(gf_t *gf)
{
  /* Pick the region-multiply routine matching the requested layout. */
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  gf->multiply_region.w64 = (h->region_type & GF_REGION_ALTMAP)
      ? gf_w64_split_4_64_lazy_altmap_multiply_region_neon
      : gf_w64_split_4_64_lazy_multiply_region_neon;
}

302
src/neon/gf_w8_neon.c Normal file
View File

@ -0,0 +1,302 @@
/*
* GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
* James S. Plank, Ethan L. Miller, Kevin M. Greenan,
* Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
*
* Copyright (c) 2014: Janne Grunau <j@jannau.net>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* - Neither the name of the University of Tennessee nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* gf_w8_neon.c
*
* Neon optimized routines for 8-bit Galois fields
*
*/
#include "gf_int.h"
#include "gf_w8.h"
#include <stdio.h>
#include <stdlib.h>
/* ARM NEON reducing macro for the carry free multiplication
* vmull_p8 is the carryless multiply operation. Here vshrn_n_u16 shifts
* the result to the right by 1 byte. This allows us to multiply
* the prim_poly by the leading bits of the result. We then xor the result
* of that operation back with the result. */
#define NEON_CFM_REDUCE(v, w, result, prim_poly, initial) \
do { \
if (initial) \
v = vshrn_n_u16 (vreinterpretq_u16_p16(result), 8); \
else \
v = veor_u8 (v, vshrn_n_u16 (vreinterpretq_u16_p16(result), 8)); \
w = vmull_p8 (prim_poly, vreinterpret_p8_u8(v)); \
result = vreinterpretq_p16_u16 (veorq_u16 (vreinterpretq_u16_p16(result), vreinterpretq_u16_p16(w))); \
} while (0)
static
inline
gf_val_32_t
gf_w8_neon_clm_multiply_x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8, int x)
{
  /* Single GF(2^8) multiply via NEON carry-less polynomial multiply.
     x is the number of reduction passes needed for this field's primitive
     polynomial (chosen by gf_w8_neon_cfm_init below); it is a compile-time
     constant at each instantiation, so the ifs fold away. */
  gf_val_32_t rv = 0;
  poly8x8_t       a, b;
  uint8x8_t       v;
  poly16x8_t      result;
  poly8x8_t     prim_poly;
  poly16x8_t    w;
  gf_internal_t * h = gf->scratch;

  a =  vdup_n_p8 (a8);
  b =  vdup_n_p8 (b8);

  prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1ffULL));

  /* Do the initial multiply */
  result = vmull_p8 (a, b);

  /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only
     have to do the reduction at most twice, because (w-2)/z == 2. Where
     z is equal to the number of zeros after the leading 1 */
  NEON_CFM_REDUCE (v, w, result, prim_poly, 1);
  NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  if (x >= 3) {
    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  }
  if (x >= 4) {
    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  }
  /* Extracts 32 bit value from result. */
  rv = (gf_val_32_t)vget_lane_u8 (vmovn_u16 (vreinterpretq_u16_p16 (result)), 0);

  return rv;
}
/* Stamp out one single-multiply entry point per reduction depth
   (gf_w8_neon_clm_multiply_2/3/4), each a thin wrapper around
   gf_w8_neon_clm_multiply_x with a constant x. */
#define CLM_MULTIPLY(x) \
static gf_val_32_t gf_w8_neon_clm_multiply_ ## x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) \
{\
return gf_w8_neon_clm_multiply_x (gf, a8, b8, x);\
}

CLM_MULTIPLY(2)
CLM_MULTIPLY(3)
CLM_MULTIPLY(4)
static inline void
neon_clm_multiply_region_from_single_x(gf_t *gf, uint8_t *s8, uint8_t *d8,
                                       gf_val_32_t val, uint8_t *d_end,
                                       int xor, int x)
{
  /* Region version of the GF(2^8) carry-less multiply: 8 bytes per
     iteration, x reduction passes (constant at each instantiation).
     vdup_n_p8 takes only 8 bits, so masking with 0xff here is equivalent
     to the 0x1ff mask used in the single multiply — the implicit x^8 term
     is handled by the shift/xor structure of NEON_CFM_REDUCE. */
  gf_internal_t * h = gf->scratch;
  poly8x8_t       a, b;
  uint8x8_t       c, v;
  poly16x8_t      result;
  poly8x8_t       prim_poly;
  poly16x8_t      w;

  a         = vdup_n_p8 (val);
  prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0xffULL));

  while (d8 < d_end) {
    b = vld1_p8 ((poly8_t *) s8);

    if (xor)
        c = vld1_u8 (d8);

    result = vmull_p8 (a, b);

    NEON_CFM_REDUCE(v, w, result, prim_poly, 1);
    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);

    if (x >= 3) {
      NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
    }

    if (x >= 4) {
      NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
    }

    /* Narrow the reduced 16-bit lanes back to bytes. */
    v = vmovn_u16 (vreinterpretq_u16_p16 (result));

    if (xor)
      v = veor_u8 (c, v);

    vst1_u8 (d8, v);

    d8 += 8;
    s8 += 8;
  }
}
/* Stamp out one region-multiply entry point per reduction depth
   (gf_w8_neon_clm_multiply_region_from_single_2/3/4): trivial-val
   shortcuts, 16-byte region alignment, then the kernel above with
   constant xor and x so both fold at compile time. */
#define CLM_MULT_REGION(x) \
static void \
gf_w8_neon_clm_multiply_region_from_single_ ## x (gf_t *gf, void *src, \
                                                  void *dest, \
                                                  gf_val_32_t val, int bytes, \
                                                  int xor) \
{ \
  gf_region_data rd; \
  uint8_t *s8; \
  uint8_t *d8; \
 \
  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } \
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } \
 \
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); \
  gf_do_initial_region_alignment(&rd); \
  s8 = (uint8_t *) rd.s_start; \
  d8 = (uint8_t *) rd.d_start; \
 \
  if (xor) \
    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 1, x); \
  else \
    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 0, x);\
  gf_do_final_region_alignment(&rd); \
}

CLM_MULT_REGION(2)
CLM_MULT_REGION(3)
CLM_MULT_REGION(4)
/* Install the GF(2^8) carry-less-multiply routines, choosing the
   reduction depth from the primitive polynomial: the more leading zero
   coefficients below x^8 (bits 7..5), the fewer reduction passes needed.
   Returns 0 when the polynomial needs more than 4 passes (unsupported). */
int gf_w8_neon_cfm_init(gf_t *gf)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  if ((0xe0 & h->prim_poly) == 0){
    gf->multiply.w32 = gf_w8_neon_clm_multiply_2;
    gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_2;
  }else if ((0xc0 & h->prim_poly) == 0){
    gf->multiply.w32 = gf_w8_neon_clm_multiply_3;
    gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_3;
  }else if ((0x80 & h->prim_poly) == 0){
    gf->multiply.w32 = gf_w8_neon_clm_multiply_4;
    gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_4;
  }else{
    return 0;
  }
  return 1;
}
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
vtbl2_u8(tbl, vget_high_u8(v)))
#endif
static
void
gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  /* GF(2^8) region multiply with two 16-entry half tables: htd->low maps
     the low nibble of each source byte and htd->high the high nibble;
     the product is their XOR.  Processes 16 bytes per iteration on the
     aligned middle of the region. */
  uint8_t *bh, *bl, *sptr, *dptr;
  uint8x16_t r, va, vh, vl, loset;
#ifdef ARCH_AARCH64
  uint8x16_t mth, mtl;
#else
  uint8x8x2_t mth, mtl;
#endif
  struct gf_w8_half_table_data *htd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  /* Rows of the half tables for this val (16 entries per row). */
  bh = (uint8_t *) htd->high;
  bh += (val << 4);
  bl = (uint8_t *) htd->low;
  bl += (val << 4);

  sptr = rd.s_start;
  dptr = rd.d_start;

#ifdef ARCH_AARCH64
  mth = vld1q_u8 (bh);
  mtl = vld1q_u8 (bl);
#else
  mth.val[0] = vld1_u8 (bh);
  mtl.val[0] = vld1_u8 (bl);
  mth.val[1] = vld1_u8 (bh + 8);
  mtl.val[1] = vld1_u8 (bl + 8);
#endif

  loset = vdupq_n_u8(0xf);

  /* Two specialized loops so the xor test stays out of the hot path. */
  if (xor) {
    while (sptr < (uint8_t *) rd.s_top) {
      va = vld1q_u8 (sptr);
      vh = vshrq_n_u8 (va, 4);
      vl = vandq_u8 (va, loset);
      va = vld1q_u8 (dptr);      /* reuse va for the destination bytes */

      vh = vqtbl1q_u8 (mth, vh);
      vl = vqtbl1q_u8 (mtl, vl);

      r = veorq_u8 (vh, vl);

      vst1q_u8 (dptr, veorq_u8 (va, r));

      dptr += 16;
      sptr += 16;
    }
  } else {
    while (sptr < (uint8_t *) rd.s_top) {
      va = vld1q_u8 (sptr);
      vh = vshrq_n_u8 (va, 4);
      vl = vandq_u8 (va, loset);
#ifdef ARCH_AARCH64
      vh = vqtbl1q_u8 (mth, vh);
      vl = vqtbl1q_u8 (mtl, vl);
#else
      /* 32-bit NEON: emulate the q-register table lookup with vtbl2. */
      vh = vcombine_u8 (vtbl2_u8 (mth, vget_low_u8 (vh)),
			vtbl2_u8 (mth, vget_high_u8 (vh)));
      vl = vcombine_u8 (vtbl2_u8 (mtl, vget_low_u8 (vl)),
			vtbl2_u8 (mtl, vget_high_u8 (vl)));
#endif

      r = veorq_u8 (vh, vl);

      vst1q_u8(dptr, r);

      dptr += 16;
      sptr += 16;
    }
  }

  gf_do_final_region_alignment(&rd);
}
/* Hook the NEON split (4,4) table implementation into a GF(2^8) object.
 * Replaces the generic multiply_region.w32 with the vectorized version;
 * all other function pointers set up by the caller are left untouched. */
void gf_w8_neon_split_init(gf_t *gf)
{
  gf->multiply_region.w32 = gf_w8_split_multiply_region_neon;
}

View File

@ -1,7 +1,7 @@
# GF-Complete 'test' AM file
AM_CPPFLAGS=-I./ -I../include
AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
bin_PROGRAMS = gf_unit

View File

@ -8,6 +8,14 @@
* Performs unit testing for gf arithmetic
*/
#include "config.h"
#ifdef HAVE_POSIX_MEMALIGN
#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE 600
#endif
#endif
#include <stdio.h>
#include <getopt.h>
#include <stdint.h>
@ -82,6 +90,9 @@ int main(int argc, char **argv)
uint32_t mask = 0;
char *ra, *rb, *rc, *rd, *target;
int align;
#ifndef HAVE_POSIX_MEMALIGN
char *malloc_ra, *malloc_rb, *malloc_rc, *malloc_rd;
#endif
if (argc < 4) usage(NULL);
@ -116,18 +127,26 @@ int main(int argc, char **argv)
c = (gf_general_t *) malloc(sizeof(gf_general_t));
d = (gf_general_t *) malloc(sizeof(gf_general_t));
#if HAVE_POSIX_MEMALIGN
if (posix_memalign((void **) &ra, 16, sizeof(char)*REGION_SIZE))
ra = NULL;
if (posix_memalign((void **) &rb, 16, sizeof(char)*REGION_SIZE))
rb = NULL;
if (posix_memalign((void **) &rc, 16, sizeof(char)*REGION_SIZE))
rc = NULL;
if (posix_memalign((void **) &rd, 16, sizeof(char)*REGION_SIZE))
rd = NULL;
#else
//15 bytes extra to make sure it's 16byte aligned
ra = (char *) malloc(sizeof(char)*REGION_SIZE+15);
rb = (char *) malloc(sizeof(char)*REGION_SIZE+15);
rc = (char *) malloc(sizeof(char)*REGION_SIZE+15);
rd = (char *) malloc(sizeof(char)*REGION_SIZE+15);
//this still assumes 8 byte aligned pointer from malloc
//(which is usual on 32-bit machines)
ra += (uint64_t)ra & 0xf;
rb += (uint64_t)rb & 0xf;
rc += (uint64_t)rc & 0xf;
rd += (uint64_t)rd & 0xf;
malloc_ra = (char *) malloc(sizeof(char)*REGION_SIZE+15);
malloc_rb = (char *) malloc(sizeof(char)*REGION_SIZE+15);
malloc_rc = (char *) malloc(sizeof(char)*REGION_SIZE+15);
malloc_rd = (char *) malloc(sizeof(char)*REGION_SIZE+15);
ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf));
rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf));
rc = (uint8_t *) (((uintptr_t) malloc_rc + 15) & ~((uintptr_t) 0xf));
rd = (uint8_t *) (((uintptr_t) malloc_rd + 15) & ~((uintptr_t) 0xf));
#endif
if (w <= 32) {
mask = 0;
@ -423,10 +442,17 @@ int main(int argc, char **argv)
free(b);
free(c);
free(d);
#ifdef HAVE_POSIX_MEMALIGN
free(ra);
free(rb);
free(rc);
free(rd);
#else
free(malloc_ra);
free(malloc_rb);
free(malloc_rc);
free(malloc_rd);
#endif
return 0;
}

View File

@ -1,9 +1,7 @@
# GF-Complete 'tools' AM file
AM_CPPFLAGS=-I./ -I../include
AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
TESTS=run-tests.sh
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
bin_PROGRAMS = gf_mult gf_div gf_add gf_time gf_methods gf_poly gf_inline_time
@ -35,3 +33,19 @@ gf_inline_time_SOURCES = gf_inline_time.c
#gf_inline_time_LDFLAGS = -lgf_complete
gf_inline_time_LDADD = ../src/libgf_complete.la
# gf_unit tests as generated by gf_methods
gf_unit_w%.sh: gf_methods
./$^ $(@:gf_unit_w%.sh=%) -A -U > $@ || rm $@
TESTS = gf_unit_w128.sh \
gf_unit_w64.sh \
gf_unit_w32.sh \
gf_unit_w16.sh \
gf_unit_w8.sh \
gf_unit_w4.sh
TEST_EXTENSIONS = .sh
SH_LOG_COMPILER = $(SHELL)
AM_SH_LOG_FLAGS = -e
CLEANFILES = $(TESTS)

View File

@ -28,7 +28,7 @@ static char *MULTS[NMULTS] = { "SHIFT", "CARRY_FREE", "CARRY_FREE_GK", "GROUP44"
/* Make sure CAUCHY is last */
#define NREGIONS (7)
static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SSE", "NOSSE",
static char *REGIONS[NREGIONS] = { "DOUBLE", "QUAD", "LAZY", "SIMD", "NOSIMD",
"ALTMAP", "CAUCHY" };
#define BNREGIONS (4)

View File

@ -8,6 +8,14 @@
* Performs timing for gf arithmetic
*/
#include "config.h"
#ifdef HAVE_POSIX_MEMALIGN
#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE 600
#endif
#endif
#include <stdio.h>
#include <getopt.h>
#include <stdint.h>
@ -95,6 +103,9 @@ int main(int argc, char **argv)
time_t t0;
uint8_t *ra, *rb;
gf_general_t a;
#ifndef HAVE_POSIX_MEMALIGN
uint8_t *malloc_ra, *malloc_rb;
#endif
if (argc < 6) usage(NULL);
@ -155,8 +166,17 @@ int main(int argc, char **argv)
printf("Seed: %ld\n", t0);
ra = (uint8_t *) malloc(size);
rb = (uint8_t *) malloc(size);
#ifdef HAVE_POSIX_MEMALIGN
if (posix_memalign((void **) &ra, 16, size))
ra = NULL;
if (posix_memalign((void **) &rb, 16, size))
rb = NULL;
#else
malloc_ra = (uint8_t *) malloc(size + 15);
malloc_rb = (uint8_t *) malloc(size + 15);
ra = (uint8_t *) (((uintptr_t) malloc_ra + 15) & ~((uintptr_t) 0xf));
rb = (uint8_t *) (((uintptr_t) malloc_rb + 15) & ~((uintptr_t) 0xf));
#endif
if (ra == NULL || rb == NULL) { perror("malloc"); exit(1); }

View File

@ -1,9 +0,0 @@
#!/bin/bash
# Run the gf_unit test battery for every supported word size.
# gf_methods emits a shell script of gf_unit invocations; pipe it into
# `sh -e` so the first failing test aborts that word size's run.
for width in 4 8 16 32 64 128 ; do
  if ! ./gf_methods $width -A -U | sh -e ; then
    echo "Failed unit tests for w=$width"
    break
  fi
done