Support for runtime SIMD detection
This commit adds support for runtime detection of SIMD instructions. The idea is that you build once with all supported SIMD functions, and the same binaries can then run on different machines with varying support for SIMD. At runtime gf-complete selects the right functions based on the processor. gf_cpu.c has the logic to detect SIMD instructions. On Intel processors this is done through cpuid. For ARM on Linux we read the auxiliary vector (/proc/self/auxv). The logic in gf_w*.c has been changed to check for runtime SIMD support and fall back to generic code. Also, a new test has been added. It compares the functions selected by gf_init when we enable/disable SIMD support through build flags with those selected when enabling/disabling at runtime. The test checks that the results are identical.
master
parent
7761438c63
commit
4339569f14
|
@ -75,4 +75,4 @@ tools/gf_time
|
|||
tools/gf_unit_w*
|
||||
tools/test-suite.log
|
||||
tools/.qemu/
|
||||
tools/test_simd*.results
|
||||
tools/test_simd*.results*
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
/*
|
||||
* GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
|
||||
* James S. Plank, Ethan L. Miller, Kevin M. Greenan,
|
||||
* Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
|
||||
*
|
||||
* gf_cpu.h
|
||||
*
|
||||
* Identifies whether the CPU supports SIMD instructions at runtime.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
extern int gf_cpu_supports_intel_pclmul;
|
||||
extern int gf_cpu_supports_intel_sse4;
|
||||
extern int gf_cpu_supports_intel_ssse3;
|
||||
extern int gf_cpu_supports_intel_sse3;
|
||||
extern int gf_cpu_supports_intel_sse2;
|
||||
extern int gf_cpu_supports_arm_neon;
|
||||
|
||||
void gf_cpu_identify(void);
|
|
@ -4,11 +4,21 @@
|
|||
AUTOMAKE_OPTIONS = subdir-objects
|
||||
|
||||
AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
|
||||
AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
|
||||
|
||||
# avoid using SIMD_FLAGS for code that calls strcmp as new gcc
|
||||
# versions will use SIMD for the strcmp implementation. Instead
|
||||
# we create a static library just for gf_method that is not compiled
|
||||
# with SIMD_FLAGS, this static library will get linked into gf_complete.so
|
||||
noinst_LTLIBRARIES = libgf_util.la
|
||||
libgf_util_la_SOURCES = gf_method.c
|
||||
libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare
|
||||
|
||||
# we narrowly use SIMD_FLAGS for code that needs it
|
||||
lib_LTLIBRARIES = libgf_complete.la
|
||||
libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
|
||||
gf_w64.c gf_w128.c gf_rand.c gf_general.c
|
||||
libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
|
||||
gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c
|
||||
libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
|
||||
libgf_complete_la_LIBADD = libgf_util.la
|
||||
|
||||
if HAVE_NEON
|
||||
libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
|
||||
|
|
21
src/gf.c
21
src/gf.c
|
@ -12,6 +12,7 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include "gf_cpu.h"
|
||||
|
||||
int _gf_errno = GF_E_DEFAULT;
|
||||
|
||||
|
@ -207,20 +208,28 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
|
|||
if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
|
||||
|
||||
#ifdef INTEL_SSE2
|
||||
sse2 = 1;
|
||||
if (gf_cpu_supports_intel_sse2) {
|
||||
sse2 = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_SSSE3
|
||||
sse3 = 1;
|
||||
if (gf_cpu_supports_intel_ssse3) {
|
||||
sse3 = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_SSE4_PCLMUL
|
||||
pclmul = 1;
|
||||
if (gf_cpu_supports_intel_pclmul) {
|
||||
pclmul = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ARM_NEON
|
||||
pclmul = (w == 4 || w == 8);
|
||||
sse3 = 1;
|
||||
if (gf_cpu_supports_arm_neon) {
|
||||
pclmul = (w == 4 || w == 8);
|
||||
sse3 = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -473,6 +482,8 @@ int gf_init_hard(gf_t *gf, int w, int mult_type,
|
|||
int sz;
|
||||
gf_internal_t *h;
|
||||
|
||||
gf_cpu_identify();
|
||||
|
||||
if (gf_error_check(w, mult_type, region_type, divide_type,
|
||||
arg1, arg2, prim_poly, base_gf) == 0) return 0;
|
||||
|
||||
|
|
|
@ -0,0 +1,153 @@
|
|||
/*
|
||||
* GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
|
||||
* James S. Plank, Ethan L. Miller, Kevin M. Greenan,
|
||||
* Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
|
||||
*
|
||||
* gf_cpu.c
|
||||
*
|
||||
* Identifies whether the CPU supports SIMD instructions at runtime.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int gf_cpu_identified = 0;
|
||||
|
||||
int gf_cpu_supports_intel_pclmul = 0;
|
||||
int gf_cpu_supports_intel_sse4 = 0;
|
||||
int gf_cpu_supports_intel_ssse3 = 0;
|
||||
int gf_cpu_supports_intel_sse3 = 0;
|
||||
int gf_cpu_supports_intel_sse2 = 0;
|
||||
int gf_cpu_supports_arm_neon = 0;
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
void gf_cpu_identify(void)
|
||||
{
|
||||
if (gf_cpu_identified) {
|
||||
return;
|
||||
}
|
||||
|
||||
int op = 1, eax, ebx, ecx, edx;
|
||||
|
||||
__asm__("cpuid"
|
||||
: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
|
||||
: "a" (op));
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
if ((ecx & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE4_PCLMUL")) {
|
||||
gf_cpu_supports_intel_pclmul = 1;
|
||||
#ifdef DEBUG_CPU_DETECTION
|
||||
printf("#gf_cpu_supports_intel_pclmul\n");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE4)
|
||||
if (((ecx & (1<<20)) != 0 || (ecx & (1<<19)) != 0) && !getenv("GF_COMPLETE_DISABLE_SSE4")) {
|
||||
gf_cpu_supports_intel_sse4 = 1;
|
||||
#ifdef DEBUG_CPU_DETECTION
|
||||
printf("#gf_cpu_supports_intel_sse4\n");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSSE3)
|
||||
if ((ecx & (1<<9)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSSE3")) {
|
||||
gf_cpu_supports_intel_ssse3 = 1;
|
||||
#ifdef DEBUG_CPU_DETECTION
|
||||
printf("#gf_cpu_supports_intel_ssse3\n");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE3)
|
||||
if ((ecx & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE3")) {
|
||||
gf_cpu_supports_intel_sse3 = 1;
|
||||
#ifdef DEBUG_CPU_DETECTION
|
||||
printf("#gf_cpu_supports_intel_sse3\n");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE2)
|
||||
if ((edx & (1<<26)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE2")) {
|
||||
gf_cpu_supports_intel_sse2 = 1;
|
||||
#ifdef DEBUG_CPU_DETECTION
|
||||
printf("#gf_cpu_supports_intel_sse2\n");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
gf_cpu_identified = 1;
|
||||
}
|
||||
|
||||
#elif defined(__arm__) || defined(__aarch64__)
|
||||
|
||||
#ifdef __linux__
|
||||
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <elf.h>
|
||||
#include <linux/auxvec.h>
|
||||
#include <asm/hwcap.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
/*
 * get_hwcap
 *
 * Scans /proc/self/auxv for the entry whose a_type equals `type` and
 * returns its a_val. Returns 0 when the file cannot be opened or no
 * matching entry is found.
 *
 * Entries are read as Elf32_auxv_t records; the only caller visible in
 * this file invokes it under `__linux__ && __arm__`.
 */
unsigned long get_hwcap(unsigned long type) {
  unsigned long hwcap = 0;
  int fd = open("/proc/self/auxv", O_RDONLY);

  /* 0 is a valid descriptor (e.g. when stdin was closed), so test >= 0. */
  if (fd >= 0) {
    Elf32_auxv_t auxv;

    /* Stop on EOF (0), on error (-1) and on a truncated trailing record. */
    while (read(fd, &auxv, sizeof auxv) == (ssize_t) sizeof auxv) {
      if (auxv.a_type == type) {
        hwcap = auxv.a_un.a_val;
        break;
      }
    }
    close(fd);
  }

  return hwcap;
}
|
||||
|
||||
#endif // linux
|
||||
|
||||
void gf_cpu_identify(void)
|
||||
{
|
||||
if (gf_cpu_identified) {
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(ARM_NEON)
|
||||
if (!getenv("GF_COMPLETE_DISABLE_NEON")) {
|
||||
#if __linux__ && __arm__
|
||||
gf_cpu_supports_arm_neon = (get_hwcap(AT_HWCAP) & HWCAP_NEON) > 0;
|
||||
#elif __aarch64__
|
||||
// ASIMD is supported on all aarch64 architectures
|
||||
gf_cpu_supports_arm_neon = 1;
|
||||
#else
|
||||
// we assume that NEON is supported if the compiler supports
|
||||
// NEON and we dont have a reliable way to detect runtime support.
|
||||
gf_cpu_supports_arm_neon = 1;
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_CPU_DETECTION
|
||||
if (gf_cpu_supports_arm_neon) {
|
||||
printf("#gf_cpu_supports_arm_neon\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif // defined(ARM_NEON)
|
||||
|
||||
gf_cpu_identified = 1;
|
||||
}
|
||||
|
||||
#else // defined(__arm__) || defined(__aarch64__)
|
||||
|
||||
int gf_cpu_identify(void)
|
||||
{
|
||||
gf_cpu_identified = 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -11,6 +11,7 @@
|
|||
#include "gf_int.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "gf_cpu.h"
|
||||
|
||||
#define GF_FIELD_WIDTH (128)
|
||||
|
||||
|
@ -290,11 +291,11 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
|
|||
return;
|
||||
}
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
void
|
||||
gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
|
||||
{
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a,b;
|
||||
__m128i result0,result1;
|
||||
__m128i prim_poly;
|
||||
|
@ -338,9 +339,8 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
|
|||
|
||||
c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
|
||||
c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
void
|
||||
gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
|
||||
|
@ -376,10 +376,10 @@ gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_
|
|||
return;
|
||||
}
|
||||
|
||||
#if defined(INTEL_SSE4)
|
||||
void
|
||||
gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
|
||||
{
|
||||
#if defined(INTEL_SSE4)
|
||||
int i;
|
||||
__m128i a, b, pp, prod, amask, u_middle_one;
|
||||
/*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
|
||||
|
@ -427,16 +427,16 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
|
|||
}
|
||||
c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
|
||||
c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* Ben: This slow function implements sse instructions for bytwo_b because why not */
|
||||
#if defined(INTEL_SSE4)
|
||||
void
|
||||
gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
|
||||
{
|
||||
#if defined(INTEL_SSE4)
|
||||
__m128i a, b, lmask, hmask, pp, c, middle_one;
|
||||
gf_internal_t *h;
|
||||
uint64_t topbit, middlebit;
|
||||
|
@ -471,8 +471,8 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
|
|||
if (middlebit) b = _mm_xor_si128(b, middle_one);
|
||||
if (topbit) b = _mm_xor_si128(b, pp);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
void
|
||||
gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
|
||||
|
@ -1146,7 +1146,7 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
|
|||
}
|
||||
|
||||
/* a^-1 -> b */
|
||||
void
|
||||
void
|
||||
gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
|
||||
{
|
||||
uint64_t e_i[2], e_im1[2], e_ip1[2];
|
||||
|
@ -1239,7 +1239,7 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
|
|||
return;
|
||||
}
|
||||
|
||||
void
|
||||
void
|
||||
gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
|
||||
{
|
||||
uint64_t d[2];
|
||||
|
@ -1248,7 +1248,7 @@ gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val
|
|||
return;
|
||||
}
|
||||
|
||||
void
|
||||
void
|
||||
gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
|
||||
{
|
||||
uint64_t one128[2];
|
||||
|
@ -1260,7 +1260,7 @@ gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
|
|||
|
||||
|
||||
static
|
||||
void
|
||||
void
|
||||
gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
|
||||
{
|
||||
gf_internal_t *h = (gf_internal_t *) gf->scratch;
|
||||
|
@ -1421,10 +1421,12 @@ static
|
|||
int gf_w128_cfm_init(gf_t *gf)
|
||||
{
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
|
||||
SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
|
||||
SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
|
||||
return 1;
|
||||
if (gf_cpu_supports_intel_pclmul) {
|
||||
SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
|
||||
SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
|
||||
SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
@ -1527,7 +1529,7 @@ int gf_w128_split_init(gf_t *gf)
|
|||
|
||||
SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
if (!(h->region_type & GF_REGION_NOSIMD)){
|
||||
if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
|
||||
SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
|
||||
}
|
||||
#endif
|
||||
|
@ -1546,23 +1548,19 @@ int gf_w128_split_init(gf_t *gf)
|
|||
if((h->region_type & GF_REGION_ALTMAP))
|
||||
{
|
||||
#ifdef INTEL_SSE4
|
||||
if(!(h->region_type & GF_REGION_NOSIMD))
|
||||
if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
|
||||
SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)
|
||||
else
|
||||
return 0;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
#ifdef INTEL_SSE4
|
||||
if(!(h->region_type & GF_REGION_NOSIMD))
|
||||
if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
|
||||
SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)
|
||||
else
|
||||
SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
|
||||
#else
|
||||
SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
|
|
129
src/gf_w16.c
129
src/gf_w16.c
|
@ -12,6 +12,7 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "gf_w16.h"
|
||||
#include "gf_cpu.h"
|
||||
|
||||
#define AB2(ip, am1 ,am2, b, t1, t2) {\
|
||||
t1 = (b << 1) & am1;\
|
||||
|
@ -391,6 +392,7 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
|
|||
extra memory.
|
||||
*/
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
static
|
||||
inline
|
||||
gf_val_32_t
|
||||
|
@ -398,8 +400,6 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -433,11 +433,11 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
|
|||
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
|
||||
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
static
|
||||
inline
|
||||
gf_val_32_t
|
||||
|
@ -445,8 +445,6 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -473,11 +471,11 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
|
|||
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
|
||||
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
static
|
||||
inline
|
||||
gf_val_32_t
|
||||
|
@ -485,8 +483,6 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -515,10 +511,9 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
|
|||
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
|
||||
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static
|
||||
|
@ -556,25 +551,27 @@ static
|
|||
int gf_w16_cfm_init(gf_t *gf)
|
||||
{
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
gf_internal_t *h;
|
||||
if (gf_cpu_supports_intel_pclmul) {
|
||||
gf_internal_t *h;
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
/*Ben: Determining how many reductions to do */
|
||||
|
||||
if ((0xfe00 & h->prim_poly) == 0) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
|
||||
} else if((0xf000 & h->prim_poly) == 0) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
|
||||
} else if ((0xe000 & h->prim_poly) == 0) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
/*Ben: Determining how many reductions to do */
|
||||
|
||||
if ((0xfe00 & h->prim_poly) == 0) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
|
||||
} else if((0xf000 & h->prim_poly) == 0) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
|
||||
} else if ((0xe000 & h->prim_poly) == 0) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
@ -688,10 +685,9 @@ int gf_w16_log_init(gf_t *gf)
|
|||
|
||||
if (check) {
|
||||
if (h->mult_type != GF_MULT_LOG_TABLE) {
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
return gf_w16_cfm_init(gf);
|
||||
#endif
|
||||
if (gf_cpu_supports_intel_pclmul) {
|
||||
return gf_w16_cfm_init(gf);
|
||||
}
|
||||
return gf_w16_shift_init(gf);
|
||||
} else {
|
||||
_gf_errno = GF_E_LOGPOLY;
|
||||
|
@ -948,11 +944,11 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
|
|||
gf_do_final_region_alignment(&rd);
|
||||
}
|
||||
|
||||
#ifdef INTEL_SSSE3
|
||||
static
|
||||
void
|
||||
gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
|
||||
{
|
||||
#ifdef INTEL_SSSE3
|
||||
uint64_t i, j, *s64, *d64, *top64;;
|
||||
uint64_t c, prod;
|
||||
uint8_t low[4][16];
|
||||
|
@ -1078,14 +1074,14 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v
|
|||
}
|
||||
|
||||
gf_do_final_region_alignment(&rd);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_SSSE3
|
||||
static
|
||||
void
|
||||
gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
|
||||
{
|
||||
#ifdef INTEL_SSSE3
|
||||
uint64_t i, j, *s64, *d64, *top64;;
|
||||
uint64_t c, prod;
|
||||
uint8_t low[4][16];
|
||||
|
@ -1187,8 +1183,8 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
|
|||
}
|
||||
gf_do_final_region_alignment(&rd);
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32_t
|
||||
gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
|
||||
|
@ -1216,21 +1212,11 @@ int gf_w16_split_init(gf_t *gf)
|
|||
{
|
||||
gf_internal_t *h;
|
||||
struct gf_w16_split_8_8_data *d8;
|
||||
int i, j, exp, issse3;
|
||||
int isneon = 0;
|
||||
int i, j, exp;
|
||||
uint32_t p, basep, tmp;
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
#ifdef INTEL_SSSE3
|
||||
issse3 = 1;
|
||||
#else
|
||||
issse3 = 0;
|
||||
#endif
|
||||
#ifdef ARM_NEON
|
||||
isneon = 1;
|
||||
#endif
|
||||
|
||||
if (h->arg1 == 8 && h->arg2 == 8) {
|
||||
d8 = (struct gf_w16_split_8_8_data *) h->private;
|
||||
basep = 1;
|
||||
|
@ -1273,36 +1259,45 @@ int gf_w16_split_init(gf_t *gf)
|
|||
|
||||
/* Defaults */
|
||||
|
||||
if (issse3) {
|
||||
#ifdef INTEL_SSSE3
|
||||
if (gf_cpu_supports_intel_ssse3) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_multiply_region)
|
||||
} else if (isneon) {
|
||||
#ifdef ARM_NEON
|
||||
gf_w16_neon_split_init(gf);
|
||||
#endif
|
||||
} else {
|
||||
#elif ARM_NEON
|
||||
if (gf_cpu_supports_arm_neon) {
|
||||
gf_w16_neon_split_init(gf);
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
|
||||
|
||||
} else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
|
||||
if (issse3 || isneon) {
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
|
||||
if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
|
||||
else if(h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
|
||||
else if(h->region_type & GF_REGION_ALTMAP && issse3)
|
||||
#if defined(INTEL_SSSE3)
|
||||
else if(h->region_type & GF_REGION_ALTMAP && gf_cpu_supports_intel_ssse3)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_altmap_multiply_region)
|
||||
#endif
|
||||
} else {
|
||||
#endif
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
else if(h->region_type & GF_REGION_ALTMAP)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
|
||||
else
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
@ -1846,26 +1841,28 @@ int gf_w16_bytwo_init(gf_t *gf)
|
|||
if (h->mult_type == GF_MULT_BYTWO_p) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_p_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
|
||||
else
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
|
||||
#else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_b_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
|
||||
else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_sse_multiply_region)
|
||||
#else
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
181
src/gf_w32.c
181
src/gf_w32.c
|
@ -13,6 +13,7 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "gf_w32.h"
|
||||
#include "gf_cpu.h"
|
||||
|
||||
#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
|
||||
|
||||
|
@ -347,6 +348,8 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
|
|||
extra memory.
|
||||
*/
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
static
|
||||
inline
|
||||
gf_val_32_t
|
||||
|
@ -354,8 +357,6 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i w;
|
||||
|
@ -378,9 +379,9 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
|
|||
|
||||
/* Extracts 32 bit value from result. */
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
|
@ -435,6 +436,8 @@ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32
|
|||
#endif
|
||||
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
static
|
||||
inline
|
||||
gf_val_32_t
|
||||
|
@ -442,8 +445,6 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -476,9 +477,11 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
|
|||
|
||||
/* Extracts 32 bit value from result. */
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
static
|
||||
inline
|
||||
|
@ -487,8 +490,6 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -515,9 +516,11 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
|
|||
/* Extracts 32 bit value from result. */
|
||||
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
static
|
||||
inline
|
||||
|
@ -526,8 +529,6 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -556,9 +557,9 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
|
|||
/* Extracts 32 bit value from result. */
|
||||
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static
|
||||
|
@ -593,29 +594,31 @@ int gf_w32_cfmgk_init(gf_t *gf)
|
|||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
gf_internal_t *h;
|
||||
if (gf_cpu_supports_intel_pclmul) {
|
||||
gf_internal_t *h;
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
|
||||
|
||||
uint64_t *q_plus = (uint64_t *) h->private;
|
||||
uint64_t *g_star = (uint64_t *) h->private + 1;
|
||||
uint64_t *q_plus = (uint64_t *) h->private;
|
||||
uint64_t *g_star = (uint64_t *) h->private + 1;
|
||||
|
||||
uint64_t tmp = h->prim_poly << 32;
|
||||
*q_plus = 1ULL << 32;
|
||||
uint64_t tmp = h->prim_poly << 32;
|
||||
*q_plus = 1ULL << 32;
|
||||
|
||||
int i;
|
||||
for(i = 63; i >= 32; i--)
|
||||
if((1ULL << i) & tmp)
|
||||
{
|
||||
*q_plus |= 1ULL << (i-32);
|
||||
tmp ^= h->prim_poly << (i-32);
|
||||
}
|
||||
int i;
|
||||
for(i = 63; i >= 32; i--)
|
||||
if((1ULL << i) & tmp)
|
||||
{
|
||||
*q_plus |= 1ULL << (i-32);
|
||||
tmp ^= h->prim_poly << (i-32);
|
||||
}
|
||||
|
||||
*g_star = h->prim_poly & ((1ULL << 32) - 1);
|
||||
*g_star = h->prim_poly & ((1ULL << 32) - 1);
|
||||
|
||||
return 1;
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
@ -631,23 +634,25 @@ int gf_w32_cfm_init(gf_t *gf)
|
|||
/*Ben: Check to see how many reduction steps it will take*/
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
gf_internal_t *h;
|
||||
if (gf_cpu_supports_intel_pclmul) {
|
||||
gf_internal_t *h;
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
if ((0xfffe0000 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
|
||||
}else if ((0xffc00000 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
|
||||
}else if ((0xfe000000 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
|
||||
} else {
|
||||
return 0;
|
||||
if ((0xfffe0000 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
|
||||
}else if ((0xffc00000 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
|
||||
}else if ((0xfe000000 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
@ -1382,26 +1387,28 @@ int gf_w32_bytwo_init(gf_t *gf)
|
|||
if (h->mult_type == GF_MULT_BYTWO_p) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
|
||||
else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region)
|
||||
#else
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
|
||||
else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region)
|
||||
#else
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -1755,11 +1762,11 @@ gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t
|
|||
gf_do_final_region_alignment(&rd);
|
||||
}
|
||||
|
||||
#ifdef INTEL_SSSE3
|
||||
static
|
||||
void
|
||||
gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
|
||||
{
|
||||
#ifdef INTEL_SSSE3
|
||||
gf_internal_t *h;
|
||||
int i, j, k;
|
||||
uint32_t pp, v, *s32, *d32, *top;
|
||||
|
@ -1942,16 +1949,15 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
|
|||
}
|
||||
|
||||
gf_do_final_region_alignment(&rd);
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef INTEL_SSSE3
|
||||
static
|
||||
void
|
||||
gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
|
||||
{
|
||||
#ifdef INTEL_SSSE3
|
||||
gf_internal_t *h;
|
||||
int i, j, k;
|
||||
uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
|
||||
|
@ -2216,9 +2222,8 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint
|
|||
}
|
||||
}
|
||||
gf_do_final_region_alignment(&rd);
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
static
|
||||
int gf_w32_split_init(gf_t *gf)
|
||||
|
@ -2230,23 +2235,7 @@ int gf_w32_split_init(gf_t *gf)
|
|||
struct gf_split_8_32_lazy_data *d32;
|
||||
struct gf_split_16_32_lazy_data *d16;
|
||||
uint32_t p, basep;
|
||||
int i, j, exp, ispclmul, issse3;
|
||||
int isneon = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
ispclmul = 1;
|
||||
#else
|
||||
ispclmul = 0;
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_SSSE3
|
||||
issse3 = 1;
|
||||
#else
|
||||
issse3 = 0;
|
||||
#endif
|
||||
#ifdef ARM_NEON
|
||||
isneon = 1;
|
||||
#endif
|
||||
int i, j, exp;
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
|
@ -2262,7 +2251,8 @@ int gf_w32_split_init(gf_t *gf)
|
|||
|
||||
if (h->arg1 == 8 && h->arg2 == 8) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
|
||||
} else if (ispclmul) {
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
} else if (gf_cpu_supports_intel_pclmul) {
|
||||
if ((0xfffe0000 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
|
||||
} else if ((0xffc00000 & h->prim_poly) == 0){
|
||||
|
@ -2270,6 +2260,7 @@ int gf_w32_split_init(gf_t *gf)
|
|||
} else if ((0xfe000000 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
|
||||
}
|
||||
|
@ -2287,33 +2278,39 @@ int gf_w32_split_init(gf_t *gf)
|
|||
ld2 = (struct gf_split_2_32_lazy_data *) h->private;
|
||||
ld2->last_value = 0;
|
||||
#ifdef INTEL_SSSE3
|
||||
if (!(h->region_type & GF_REGION_NOSIMD))
|
||||
if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region)
|
||||
else
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
|
||||
#else
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD) return 0;
|
||||
if(h->region_type & GF_REGION_SIMD) return 0;
|
||||
#ifdef INTEL_SSSE3
|
||||
}
|
||||
#endif
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
|
||||
|
||||
|
||||
if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
|
||||
((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) {
|
||||
((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_REGION_DEFAULT)) {
|
||||
ld4 = (struct gf_split_4_32_lazy_data *) h->private;
|
||||
ld4->last_value = 0;
|
||||
if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
|
||||
if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region)
|
||||
} else if (isneon) {
|
||||
} else if (gf_cpu_supports_arm_neon) {
|
||||
#ifdef ARM_NEON
|
||||
gf_w32_neon_split_init(gf);
|
||||
#endif
|
||||
} else if (h->region_type & GF_REGION_ALTMAP) {
|
||||
#ifdef INTEL_SSSE3
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region)
|
||||
#endif
|
||||
} else {
|
||||
#ifdef INTEL_SSSE3
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region)
|
||||
#endif
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
@ -2686,16 +2683,6 @@ int gf_w32_composite_init(gf_t *gf)
|
|||
|
||||
int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
|
||||
{
|
||||
int issse3 = 0;
|
||||
int isneon = 0;
|
||||
|
||||
#ifdef INTEL_SSSE3
|
||||
issse3 = 1;
|
||||
#endif
|
||||
#ifdef ARM_NEON
|
||||
isneon = 1;
|
||||
#endif
|
||||
|
||||
switch(mult_type)
|
||||
{
|
||||
case GF_MULT_BYTWO_p:
|
||||
|
@ -2720,7 +2707,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg
|
|||
return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
|
||||
}
|
||||
if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
|
||||
(mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
|
||||
(mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) {
|
||||
return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
|
||||
}
|
||||
if ((arg1 == 4 && arg2 == 32) ||
|
||||
|
|
92
src/gf_w4.c
92
src/gf_w4.c
|
@ -12,6 +12,7 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "gf_w4.h"
|
||||
#include "gf_cpu.h"
|
||||
|
||||
#define AB2(ip, am1 ,am2, b, t1, t2) {\
|
||||
t1 = (b << 1) & am1;\
|
||||
|
@ -134,6 +135,7 @@ gf_w4_shift_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
|
|||
|
||||
/* Ben: This function works, but it is 33% slower than the normal shift mult */
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
static
|
||||
inline
|
||||
gf_val_32_t
|
||||
|
@ -141,8 +143,6 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -173,9 +173,9 @@ gf_w4_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
|
|||
/* Extracts 32 bit value from result. */
|
||||
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
static
|
||||
void
|
||||
|
@ -447,18 +447,19 @@ int gf_w4_single_table_init(gf_t *gf)
|
|||
SET_FUNCTION(gf,inverse,w32,NULL)
|
||||
SET_FUNCTION(gf,divide,w32,gf_w4_single_table_divide)
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w4_single_table_multiply)
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
|
||||
else
|
||||
#if defined(INTEL_SSSE3)
|
||||
#if defined(INTEL_SSSE3)
|
||||
if (gf_cpu_supports_intel_ssse3 && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_sse_multiply_region)
|
||||
#elif defined(ARM_NEON)
|
||||
} else {
|
||||
#elif defined(ARM_NEON)
|
||||
if (gf_cpu_supports_arm_neon && !(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))) {
|
||||
gf_w4_neon_single_table_init(gf);
|
||||
#endif
|
||||
#else
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
|
||||
if (h->region_type & GF_REGION_SIMD) return 0;
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region)
|
||||
if (h->region_type & GF_REGION_SIMD) return 0;
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
}
|
||||
#endif
|
||||
|
||||
return 1;
|
||||
|
@ -736,16 +737,13 @@ int gf_w4_table_init(gf_t *gf)
|
|||
{
|
||||
int rt;
|
||||
gf_internal_t *h;
|
||||
int simd = 0;
|
||||
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
simd = 1;
|
||||
#endif
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
rt = (h->region_type);
|
||||
|
||||
if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
|
||||
if (h->mult_type == GF_MULT_DEFAULT &&
|
||||
!(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))
|
||||
rt |= GF_REGION_DOUBLE_TABLE;
|
||||
|
||||
if (rt & GF_REGION_DOUBLE_TABLE) {
|
||||
return gf_w4_double_table_init(gf);
|
||||
|
@ -929,11 +927,11 @@ gf_w4_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
|
|||
#endif
|
||||
|
||||
/*
|
||||
#ifdef INTEL_SSE2
|
||||
static
|
||||
void
|
||||
gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
|
||||
{
|
||||
#ifdef INTEL_SSE2
|
||||
uint8_t *d8, *s8, tb;
|
||||
__m128i pp, m1, m2, t1, t2, va, vb;
|
||||
struct gf_bytwo_data *btd;
|
||||
|
@ -990,8 +988,8 @@ gf_w4_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
|
|||
}
|
||||
}
|
||||
gf_do_final_region_alignment(&rd);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
*/
|
||||
|
||||
#ifdef INTEL_SSE2
|
||||
|
@ -1867,26 +1865,28 @@ int gf_w4_bytwo_init(gf_t *gf)
|
|||
if (h->mult_type == GF_MULT_BYTWO_p) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_p_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
|
||||
else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_sse_multiply_region)
|
||||
#else
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
|
||||
if (h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_p_nosse_multiply_region)
|
||||
if (h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w4_bytwo_b_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
|
||||
else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_sse_multiply_region)
|
||||
#else
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
|
||||
if (h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w4_bytwo_b_nosse_multiply_region)
|
||||
if (h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return 1;
|
||||
|
@ -1897,10 +1897,14 @@ static
|
|||
/*
 * Select the carry-free-multiply implementation for w=4.
 *
 * A SIMD code path is installed only when it was compiled in (the
 * INTEL_SSE4_PCLMUL / ARM_NEON build macros) AND the matching
 * gf_cpu_supports_* flag is set; the support check has to come before
 * any assignment or return, otherwise it is dead code and a binary built
 * with SIMD would use those instructions on a CPU that lacks them.
 *
 * Returns 1 when the PCLMUL multiply was installed, the result of
 * gf_w4_neon_cfm_init() when NEON is compiled in and available, and 0
 * when no carry-free multiply can be used.
 */
int gf_w4_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
  if (gf_cpu_supports_intel_pclmul) {
    SET_FUNCTION(gf,multiply,w32,gf_w4_clm_multiply)
    return 1;
  }
#elif defined(ARM_NEON)
  if (gf_cpu_supports_arm_neon) {
    return gf_w4_neon_cfm_init(gf);
  }
#endif
  return 0;
}
|
||||
|
@ -1917,15 +1921,6 @@ int gf_w4_shift_init(gf_t *gf)
|
|||
|
||||
int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
|
||||
{
|
||||
int issse3 = 0, isneon = 0;
|
||||
|
||||
#ifdef INTEL_SSSE3
|
||||
issse3 = 1;
|
||||
#endif
|
||||
#ifdef ARM_NEON
|
||||
isneon = 1;
|
||||
#endif
|
||||
|
||||
switch(mult_type)
|
||||
{
|
||||
case GF_MULT_BYTWO_p:
|
||||
|
@ -1938,7 +1933,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
|
|||
return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
|
||||
}
|
||||
|
||||
if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
|
||||
if (mult_type == GF_MULT_DEFAULT &&
|
||||
!(gf_cpu_supports_arm_neon || gf_cpu_supports_intel_ssse3))
|
||||
region_type = GF_REGION_DOUBLE_TABLE;
|
||||
|
||||
if (region_type & GF_REGION_DOUBLE_TABLE) {
|
||||
|
|
164
src/gf_w64.c
164
src/gf_w64.c
|
@ -12,6 +12,7 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "gf_w64.h"
|
||||
#include "gf_cpu.h"
|
||||
|
||||
static
|
||||
inline
|
||||
|
@ -338,6 +339,8 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
* ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
|
||||
*/
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
static
|
||||
inline
|
||||
gf_val_64_t
|
||||
|
@ -345,8 +348,6 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
{
|
||||
gf_val_64_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -376,10 +377,12 @@ gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
result = _mm_xor_si128 (result, w);
|
||||
|
||||
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
static
|
||||
inline
|
||||
gf_val_64_t
|
||||
|
@ -387,8 +390,6 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
{
|
||||
gf_val_64_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -418,15 +419,15 @@ gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
result = _mm_xor_si128 (result, w);
|
||||
|
||||
rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
void
|
||||
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
|
||||
{
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
gf_internal_t *h;
|
||||
uint8_t *s8, *d8, *dtop;
|
||||
gf_region_data rd;
|
||||
|
@ -504,8 +505,8 @@ gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int by
|
|||
}
|
||||
}
|
||||
gf_do_final_region_alignment(&rd);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
void
|
||||
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
|
||||
|
@ -709,21 +710,23 @@ int gf_w64_cfm_init(gf_t *gf)
|
|||
SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
gf_internal_t *h;
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
if (gf_cpu_supports_intel_pclmul) {
|
||||
gf_internal_t *h;
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
|
||||
}else if((0xfffe000000000000ULL & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
|
||||
} else {
|
||||
return 0;
|
||||
if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
|
||||
}else if((0xfffe000000000000ULL & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
@ -1261,9 +1264,9 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_
|
|||
v = _mm_srli_epi64(v, 1); }
|
||||
|
||||
|
||||
#ifdef INTEL_SSE2
|
||||
void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
|
||||
{
|
||||
#ifdef INTEL_SSE2
|
||||
int i;
|
||||
uint8_t *s8, *d8;
|
||||
uint64_t vrev, one64;
|
||||
|
@ -1322,8 +1325,8 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_
|
|||
s8 += 16;
|
||||
}
|
||||
gf_do_final_region_alignment(&rd);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef INTEL_SSE2
|
||||
static
|
||||
|
@ -1457,26 +1460,28 @@ int gf_w64_bytwo_init(gf_t *gf)
|
|||
if (h->mult_type == GF_MULT_BYTWO_p) {
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
|
||||
else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region)
|
||||
#else
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
|
||||
else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region)
|
||||
#else
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
}
|
||||
SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
|
||||
|
@ -1975,18 +1980,20 @@ int gf_w64_split_init(gf_t *gf)
|
|||
SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
if ((!(h->region_type & GF_REGION_NOSIMD) &&
|
||||
(h->arg1 == 64 || h->arg2 == 64)) ||
|
||||
h->mult_type == GF_MULT_DEFAULT){
|
||||
|
||||
if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
|
||||
}else if((0xfffe000000000000ULL & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
|
||||
}else{
|
||||
return 0;
|
||||
if (gf_cpu_supports_intel_pclmul) {
|
||||
if ((!(h->region_type & GF_REGION_NOSIMD) &&
|
||||
(h->arg1 == 64 || h->arg2 == 64)) ||
|
||||
h->mult_type == GF_MULT_DEFAULT){
|
||||
|
||||
if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
|
||||
}else if((0xfffe000000000000ULL & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
|
||||
}else{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1996,23 +2003,27 @@ int gf_w64_split_init(gf_t *gf)
|
|||
/* Allen: set region pointers for default mult type. Single pointers are
|
||||
* taken care of above (explicitly for sse, implicitly for no sse). */
|
||||
|
||||
if (h->mult_type == GF_MULT_DEFAULT) {
|
||||
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
|
||||
if (h->mult_type == GF_MULT_DEFAULT) {
|
||||
d4 = (struct gf_split_4_64_lazy_data *) h->private;
|
||||
d4->last_value = 0;
|
||||
if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
|
||||
d4 = (struct gf_split_4_64_lazy_data *) h->private;
|
||||
d4->last_value = 0;
|
||||
#if defined(INTEL_SSE4)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
|
||||
if (gf_cpu_supports_intel_sse4)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
|
||||
#elif defined(ARCH_AARCH64)
|
||||
gf_w64_neon_split_init(gf);
|
||||
if (gf_cpu_supports_arm_neon)
|
||||
gf_w64_neon_split_init(gf);
|
||||
#endif
|
||||
} else {
|
||||
#endif
|
||||
d8 = (struct gf_split_8_64_lazy_data *) h->private;
|
||||
d8->last_value = 0;
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
|
||||
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
if (h->mult_type == GF_MULT_DEFAULT) {
|
||||
d8 = (struct gf_split_8_64_lazy_data *) h->private;
|
||||
d8->last_value = 0;
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
|
||||
}
|
||||
#endif
|
||||
|
||||
if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
|
||||
d4 = (struct gf_split_4_64_lazy_data *) h->private;
|
||||
|
@ -2022,28 +2033,35 @@ int gf_w64_split_init(gf_t *gf)
|
|||
if(h->region_type & GF_REGION_ALTMAP)
|
||||
{
|
||||
#ifdef INTEL_SSSE3
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
|
||||
if (gf_cpu_supports_intel_ssse3) {
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
|
||||
} else
|
||||
#elif defined(ARCH_AARCH64)
|
||||
gf_w64_neon_split_init(gf);
|
||||
#else
|
||||
return 0;
|
||||
if (gf_cpu_supports_arm_neon) {
|
||||
gf_w64_neon_split_init(gf);
|
||||
} else
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
else //no altmap
|
||||
{
|
||||
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
|
||||
if(h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
|
||||
else
|
||||
#if defined(INTEL_SSE4)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
|
||||
#elif defined(ARCH_AARCH64)
|
||||
gf_w64_neon_split_init(gf);
|
||||
#endif
|
||||
#else
|
||||
if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
|
||||
if (h->region_type & GF_REGION_NOSIMD) {
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
|
||||
} else
|
||||
#if defined(INTEL_SSE4)
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
|
||||
#elif defined(ARCH_AARCH64)
|
||||
gf_w64_neon_split_init(gf);
|
||||
#endif
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -2114,11 +2132,15 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
|
|||
* then fall through to split table scratch size code. */
|
||||
|
||||
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
|
||||
if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
|
||||
arg1 = 64;
|
||||
arg2 = 4;
|
||||
#else
|
||||
} else {
|
||||
#endif
|
||||
arg1 = 64;
|
||||
arg2 = 8;
|
||||
#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
|
||||
}
|
||||
#endif
|
||||
|
||||
case GF_MULT_SPLIT_TABLE:
|
||||
|
|
126
src/gf_w8.c
126
src/gf_w8.c
|
@ -13,6 +13,7 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include "gf_cpu.h"
|
||||
|
||||
#define AB2(ip, am1 ,am2, b, t1, t2) {\
|
||||
t1 = (b << 1) & am1;\
|
||||
|
@ -127,6 +128,7 @@ uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
|
|||
}
|
||||
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
static
|
||||
inline
|
||||
gf_val_32_t
|
||||
|
@ -134,8 +136,6 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -169,10 +169,11 @@ gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
|
|||
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
static
|
||||
inline
|
||||
gf_val_32_t
|
||||
|
@ -180,8 +181,6 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -208,10 +207,11 @@ gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
|
|||
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
static
|
||||
inline
|
||||
gf_val_32_t
|
||||
|
@ -219,8 +219,6 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
|
|||
{
|
||||
gf_val_32_t rv = 0;
|
||||
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -248,9 +246,9 @@ gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
|
|||
/* Extracts 32 bit value from result. */
|
||||
rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
|
||||
|
||||
#endif
|
||||
return rv;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static
|
||||
|
@ -509,25 +507,29 @@ static
|
|||
int gf_w8_cfm_init(gf_t *gf)
|
||||
{
|
||||
#if defined(INTEL_SSE4_PCLMUL)
|
||||
gf_internal_t *h;
|
||||
if (gf_cpu_supports_intel_pclmul) {
|
||||
gf_internal_t *h;
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
if ((0xe0 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
|
||||
}else if ((0xc0 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
|
||||
}else if ((0x80 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
|
||||
}else{
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
if ((0xe0 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
|
||||
}else if ((0xc0 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
|
||||
}else if ((0x80 & h->prim_poly) == 0){
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
|
||||
}else{
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
#elif defined(ARM_NEON)
|
||||
return gf_w8_neon_cfm_init(gf);
|
||||
if (gf_cpu_supports_arm_neon) {
|
||||
return gf_w8_neon_cfm_init(gf);
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
@ -1103,20 +1105,21 @@ int gf_w8_split_init(gf_t *gf)
|
|||
}
|
||||
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply)
|
||||
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
|
||||
else
|
||||
#if defined(INTEL_SSSE3)
|
||||
|
||||
#if defined(INTEL_SSSE3)
|
||||
if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
|
||||
#elif defined(ARM_NEON)
|
||||
} else {
|
||||
#elif defined(ARM_NEON)
|
||||
if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
gf_w8_neon_split_init(gf);
|
||||
#endif
|
||||
#else
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
}
|
||||
#endif
|
||||
|
||||
return 1;
|
||||
|
@ -1134,17 +1137,12 @@ int gf_w8_table_init(gf_t *gf)
|
|||
struct gf_w8_double_table_data *dtd = NULL;
|
||||
struct gf_w8_double_table_lazy_data *ltd = NULL;
|
||||
struct gf_w8_default_data *dd = NULL;
|
||||
int a, b, c, prod, scase, use_simd;
|
||||
int a, b, c, prod, scase;
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
use_simd = 1;
|
||||
#else
|
||||
use_simd = 0;
|
||||
#endif
|
||||
|
||||
if (h->mult_type == GF_MULT_DEFAULT && use_simd) {
|
||||
if (h->mult_type == GF_MULT_DEFAULT &&
|
||||
(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
|
||||
dd = (struct gf_w8_default_data *)h->private;
|
||||
scase = 3;
|
||||
bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
|
||||
|
@ -1220,13 +1218,19 @@ int gf_w8_table_init(gf_t *gf)
|
|||
break;
|
||||
case 3:
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
|
||||
if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
|
||||
SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
|
||||
#if defined(INTEL_SSSE3)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
|
||||
if (gf_cpu_supports_intel_ssse3) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
|
||||
}
|
||||
#elif defined(ARM_NEON)
|
||||
gf_w8_neon_split_init(gf);
|
||||
if (gf_cpu_supports_arm_neon) {
|
||||
gf_w8_neon_split_init(gf);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
@ -2192,26 +2196,28 @@ int gf_w8_bytwo_init(gf_t *gf)
|
|||
if (h->mult_type == GF_MULT_BYTWO_p) {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
|
||||
else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region)
|
||||
#else
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply)
|
||||
#ifdef INTEL_SSE2
|
||||
if (h->region_type & GF_REGION_NOSIMD)
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
|
||||
else
|
||||
if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region)
|
||||
#else
|
||||
} else {
|
||||
#endif
|
||||
SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
|
||||
if(h->region_type & GF_REGION_SIMD)
|
||||
return 0;
|
||||
#ifdef INTEL_SSE2
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return 1;
|
||||
|
@ -2229,9 +2235,9 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1
|
|||
switch(mult_type)
|
||||
{
|
||||
case GF_MULT_DEFAULT:
|
||||
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
|
||||
return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
|
||||
#endif
|
||||
if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
|
||||
return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
|
||||
}
|
||||
return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
|
||||
case GF_MULT_TABLE:
|
||||
if (region_type == GF_REGION_CAUCHY) {
|
||||
|
|
|
@ -118,6 +118,237 @@ test_compile() {
|
|||
esac
|
||||
}
|
||||
|
||||
# Disable NEON through build flags.
#
# Builds the tree twice -- first configured with --disable-neon, then with the
# default configure options -- and after each build appends the output of
# gf_methods for every word size to the report file.
#
#   $1  report file that stage banners and gf_methods output are appended to
#
# Returns 1 as soon as a build fails; otherwise returns the number of
# gf_methods invocations that failed.
runtime_arm_flags() {
  local report=${1}
  local stage
  failed=0

  # Each entry is "<banner prefix>:<extra configure argument>".
  for stage in "NO:--disable-neon" "FULL:"; do
    echo "====${stage%%:*} SIMD support..." >> ${report}

    # The configure argument is left unquoted on purpose so that the empty
    # FULL entry expands to no argument at all.
    if ! { ./configure ${stage#*:} && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; }; then
      echo "Compile FAILED" >> ${report}
      return 1
    fi

    for i in 128 64 32 16 8 4; do
      if ! ${script_dir}/gf_methods $i -ACD -X >> ${report}; then
        echo "gf_methods $i FAILED" >> ${report}
        ((++failed))
      fi
    done
  done

  return ${failed}
}
|
||||
|
||||
# Build once with the default (full SIMD) configuration and toggle NEON at
# runtime through the GF_COMPLETE_DISABLE_NEON environment variable.
#
#   $1  report file that stage banners and gf_methods output are appended to
#
# Returns 1 if the build fails; otherwise returns the number of gf_methods
# invocations that failed. GF_COMPLETE_DISABLE_NEON is left unset on a normal
# return.
runtime_arm_env() {
  local report=${1}
  local stage
  failed=0

  if ! { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; }; then
    echo "Compile FAILED" >> ${report}
    return 1
  fi

  for stage in NO FULL; do
    echo "====${stage} SIMD support..." >> ${report}

    # Only the environment differs between the two stages; the binaries are
    # the same.
    case ${stage} in
      NO)   export GF_COMPLETE_DISABLE_NEON=1 ;;
      FULL) unset GF_COMPLETE_DISABLE_NEON ;;
    esac

    for i in 128 64 32 16 8 4; do
      if ! ${script_dir}/gf_methods $i -ACD -X >> ${report}; then
        echo "gf_methods $i FAILED" >> ${report}
        ((++failed))
      fi
    done
  done

  return ${failed}
}
|
||||
|
||||
# Restrict Intel SIMD support at build time and record which functions
# gf_methods reports for each configuration.
#
# Stages, in order: a --disable-sse build, five builds in which the
# ax_cv_have_*_ext configure cache variables are exported to force a specific
# subset of extensions, and finally a build with no overrides at all.
#
#   $1  report file that stage banners and gf_methods output are appended to
#
# A failed build is recorded as "FAIL" in the report and counted; the stage's
# gf_methods runs are still attempted. Returns the total number of failures.
runtime_intel_flags() {
  local report=${1}
  local stage label sse2 sse3 ssse3 sse41 sse42
  failed=0

  echo "====NO SIMD support..." >> ${report}
  if ! { ./configure --disable-sse && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; }; then
    echo "FAIL" >> ${report}
    ((++failed))
  fi
  for i in 128 64 32 16 8 4; do
    if ! ${script_dir}/gf_methods $i -ACD -X >> ${report}; then
      echo "gf_methods $i FAILED" >> ${report}
      ((++failed))
    fi
  done

  # Each row is "<banner>:<sse2>:<sse3>:<ssse3>:<sse41>:<sse42>".
  # sse and pclmuldq are forced to "no" in every restricted stage.
  local -a stages=(
    "SSE2:yes:no:no:no:no"
    "SSE2,SSE3:yes:yes:no:no:no"
    "SSE2,SSE3,SSSE3:yes:yes:yes:no:no"
    "SSE2,SSE3,SSSE3,SSE4_1:yes:yes:yes:yes:no"
    "SSE2,SSE3,SSSE3,SSE4_2:yes:yes:yes:no:yes"
  )

  for stage in "${stages[@]}"; do
    IFS=: read -r label sse2 sse3 ssse3 sse41 sse42 <<< "${stage}"

    echo "====${label} support..." >> ${report}
    export ax_cv_have_sse_ext=no
    export ax_cv_have_sse2_ext=${sse2}
    export ax_cv_have_sse3_ext=${sse3}
    export ax_cv_have_ssse3_ext=${ssse3}
    export ax_cv_have_sse41_ext=${sse41}
    export ax_cv_have_sse42_ext=${sse42}
    export ax_cv_have_pclmuldq_ext=no

    if ! { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; }; then
      echo "FAIL" >> ${report}
      ((++failed))
    fi
    for i in 128 64 32 16 8 4; do
      if ! ${script_dir}/gf_methods $i -ACD -X >> ${report}; then
        echo "gf_methods $i FAILED" >> ${report}
        ((++failed))
      fi
    done
  done

  echo "====FULL SIMD support..." >> ${report}
  # Drop the overrides exported above. Otherwise this configure run would
  # still see the values from the last restricted stage instead of probing
  # the machine, and the exports would also leak into any configure run a
  # later function performs.
  unset ax_cv_have_sse_ext ax_cv_have_sse2_ext ax_cv_have_sse3_ext
  unset ax_cv_have_ssse3_ext ax_cv_have_sse41_ext ax_cv_have_sse42_ext
  unset ax_cv_have_pclmuldq_ext

  if ! { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; }; then
    echo "FAIL" >> ${report}
    ((++failed))
  fi
  for i in 128 64 32 16 8 4; do
    if ! ${script_dir}/gf_methods $i -ACD -X >> ${report}; then
      echo "gf_methods $i FAILED" >> ${report}
      ((++failed))
    fi
  done

  return ${failed}
}
|
||||
|
||||
# Build once with the default configuration and restrict Intel SIMD support
# at runtime through the GF_COMPLETE_DISABLE_* environment variables,
# recording which functions gf_methods reports at each stage.
#
#   $1  report file that stage banners and gf_methods output are appended to
#
# Returns 1 if the build fails; otherwise returns the number of gf_methods
# invocations that failed. All GF_COMPLETE_DISABLE_* variables touched here
# are left unset on a normal return.
runtime_intel_env() {
  local report=${1}
  local stage label disabled knob
  failed=0

  # compile a build with full SIMD support
  if ! { ./configure && make clean && make CFLAGS="-DDEBUG_FUNCTIONS"; }; then
    echo "Compile FAILED" >> ${report}
    return 1
  fi

  # Each row is "<banner>|<space-separated GF_COMPLETE_DISABLE_ suffixes>".
  # The SSE4_1 and SSE4_2 rows disable the same set: only one SSE4 switch is
  # toggled by this script.
  local -a stages=(
    "NO SIMD|SSE2 SSE3 SSSE3 SSE4 SSE4_PCLMUL"
    "SSE2|SSE3 SSSE3 SSE4 SSE4_PCLMUL"
    "SSE2,SSE3|SSSE3 SSE4 SSE4_PCLMUL"
    "SSE2,SSE3,SSSE3|SSE4 SSE4_PCLMUL"
    "SSE2,SSE3,SSSE3,SSE4_1|SSE4_PCLMUL"
    "SSE2,SSE3,SSSE3,SSE4_2|SSE4_PCLMUL"
    "FULL SIMD|"
  )

  for stage in "${stages[@]}"; do
    label=${stage%%|*}
    disabled=${stage#*|}

    echo "====${label} support..." >> ${report}

    # Start every stage from a clean slate, then switch off what the row asks.
    unset GF_COMPLETE_DISABLE_SSE2 GF_COMPLETE_DISABLE_SSE3
    unset GF_COMPLETE_DISABLE_SSSE3 GF_COMPLETE_DISABLE_SSE4
    unset GF_COMPLETE_DISABLE_SSE4_PCLMUL
    for knob in ${disabled}; do
      export "GF_COMPLETE_DISABLE_${knob}=1"
    done

    for i in 128 64 32 16 8 4; do
      if ! ${script_dir}/gf_methods $i -ACD -X >> ${report}; then
        echo "gf_methods $i FAILED" >> ${report}
        ((++failed))
      fi
    done
  done

  return ${failed}
}
|
||||
|
||||
# Compare SIMD selection done at build time with SIMD selection done at
# runtime.
#
# Depending on $host_cpu, runs the ARM or Intel pair of helpers: the *_flags
# helper writes to ${results}.left and the *_env helper writes to
# ${results}.right. Both files are then copied into ${results} under LEFT and
# RIGHT banners and compared with diff.
#
# Returns 0 and records SUCCESS when the two files are identical; returns 1
# and records FAILED otherwise.
test_runtime() {
  rm -f ${results}.left
  rm -f ${results}.right

  case $host_cpu in
    aarch64*|arm*)
      runtime_arm_flags ${results}.left
      runtime_arm_env ${results}.right
      ;;
    i[[3456]]86*|x86_64*|amd64*)
      runtime_intel_flags ${results}.left
      runtime_intel_env ${results}.right
      ;;
  esac

  echo "======LEFT======" > ${results}
  cat ${results}.left >> ${results}
  echo "======RIGHT======" >> ${results}
  cat ${results}.right >> ${results}
  echo "======RESULT======" >> ${results}
  if diff "${results}.left" "${results}.right"; then
    echo SUCCESS >> ${results}
    return 0
  else
    # The two runs disagree: record the failure so the results file matches
    # the non-zero return status.
    echo FAILED >> ${results}
    return 1
  fi
}
|
||||
|
||||
cd ${script_dir}/..
|
||||
rm -f ${results}
|
||||
|
||||
|
|
|
@ -224,6 +224,8 @@ run_test_simd_basic() {
|
|||
{ run_test $arch $cpu "unit" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
|
||||
echo "=====running functions test"
|
||||
{ run_test $arch $cpu "functions" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
|
||||
echo "=====running runtime test"
|
||||
{ run_test $arch $cpu "runtime" && echo "SUCCESS"; } || { echo "FAILED"; ((++failed)); }
|
||||
stop_qemu
|
||||
|
||||
return ${failed}
|
||||
|
|
Loading…
Reference in New Issue