Removed GROUP/128/SSE. It wasn't compiling, and it needed an overhaul.
I'll do it someday when I'm bored.
master
parent
fb0bbdcf62
commit
f0c32c94bc
|
@ -144,7 +144,6 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
|
|||
GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */
|
||||
GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */
|
||||
GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */
|
||||
GF_E_GR_SSE4, /* Mult == GROUP, w == 128, No SSE4 */
|
||||
GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */
|
||||
GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */
|
||||
GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
|
||||
|
|
2
src/gf.c
2
src/gf.c
|
@ -75,7 +75,6 @@ void gf_error()
|
|||
case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
|
||||
case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
|
||||
case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
|
||||
case GF_E_GR_SSE4: s = "With -m GROUP, w == 128, you need SSE4."; break;
|
||||
case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
|
||||
case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
|
||||
case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break;
|
||||
|
@ -320,7 +319,6 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
|
|||
if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; }
|
||||
if (w == 128 && (arg1 != 4 ||
|
||||
(arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
|
||||
if (w == 128 && !sse4) { _gf_errno = GF_E_GR_SSE4; return 0; }
|
||||
if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
|
||||
if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
|
||||
if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; }
|
||||
|
|
|
@ -144,7 +144,6 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
|
|||
GF_E_GR_W_48, /* Mult == GROUP, w \in { 4, 8 } */
|
||||
GF_E_GR_W_16, /* Mult == GROUP, w == 16, arg1 != 4 || arg2 != 4 */
|
||||
GF_E_GR_128A, /* Mult == GROUP, w == 128, bad args */
|
||||
GF_E_GR_SSE4, /* Mult == GROUP, w == 128, No SSE4 */
|
||||
GF_E_GR_A_27, /* Mult == GROUP, either arg > 27 */
|
||||
GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */
|
||||
GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
|
||||
|
|
298
src/gf_w128.c
298
src/gf_w128.c
|
@ -984,45 +984,6 @@ void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128)
|
|||
return;
|
||||
}
|
||||
|
||||
static
|
||||
void gf_w128_group_m_sse_init(gf_t *gf, gf_val_128_t b128)
|
||||
{
|
||||
#if defined(INTEL_SSE4)
|
||||
int i, j;
|
||||
int g_m;
|
||||
uint64_t lbit, middlebit;
|
||||
gf_internal_t *scratch;
|
||||
gf_group_tables_t *gt;
|
||||
scratch = (gf_internal_t *) gf->scratch;
|
||||
gt = scratch->private;
|
||||
g_m = scratch->arg1;
|
||||
|
||||
__m128i *table = (__m128i *)(gt->m_table), b, a, ubit, prim_poly;
|
||||
prim_poly = _mm_insert_epi64(_mm_setzero_si128(), scratch->prim_poly, 0);
|
||||
b = _mm_loadu_si128((__m128i *)(b128));
|
||||
|
||||
table[0] = _mm_setzero_si128();
|
||||
table[1] = table[0];
|
||||
table[1] = _mm_insert_epi64(table[1],b128[0],1);
|
||||
table[1] = _mm_insert_epi64(table[1],b128[1],0);
|
||||
lbit = 1;
|
||||
lbit <<= 63;
|
||||
ubit = _mm_set_epi32(0, 1, 0, 0);
|
||||
for (i = 2; i < (1 << g_m); i <<= 1) {
|
||||
a = table[(i >> 1)];
|
||||
middlebit = (_mm_extract_epi64(a, 0x0) & lbit);
|
||||
a = _mm_slli_epi64(a, 1);
|
||||
if (middlebit) a = _mm_xor_si128(a, ubit);
|
||||
table[i] = a;
|
||||
if (_mm_extract_epi64(table[i >> 1], 0x1) & lbit) table[i] = _mm_xor_si128(table[i], prim_poly);
|
||||
for (j = 0; j < i; j++) {
|
||||
table[i + j] = _mm_xor_si128(table[i], table[j]);
|
||||
}
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
|
||||
{
|
||||
|
@ -1095,88 +1056,6 @@ gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
|
|||
c128[1] = p_i[1];
|
||||
}
|
||||
|
||||
void
|
||||
gf_w128_group_sse_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
|
||||
{
|
||||
#if defined(INTEL_SSE4) && defined(ARCH_64)
|
||||
int i,j;
|
||||
int i_r, i_m, t_m;
|
||||
int mask_m, mask_r, mask_s;
|
||||
int g_m, g_r;
|
||||
uint32_t shiftbits;
|
||||
uint64_t a[2], tbit = 1;
|
||||
tbit <<= 63;
|
||||
gf_internal_t *scratch;
|
||||
gf_group_tables_t *gt;
|
||||
__m128i p_i, *m_table, *r_table, zero;
|
||||
|
||||
zero = _mm_setzero_si128();
|
||||
scratch = (gf_internal_t *) gf->scratch;
|
||||
gt = scratch->private;
|
||||
m_table = (__m128i *)(gt->m_table);
|
||||
r_table = (__m128i *)(gt->r_table);
|
||||
g_m = scratch->arg1;
|
||||
g_r = scratch->arg2;
|
||||
|
||||
mask_m = (1 << g_m) - 1;
|
||||
mask_r = (1 << g_r) - 1;
|
||||
mask_s = mask_m << (32-g_m); /*sets g_m leftmost bits to 1*/
|
||||
if (b128[0] != _mm_extract_epi64(m_table[1], 1) || b128[1] != _mm_extract_epi64(m_table[1], 0)) {
|
||||
gf_w128_group_m_sse_init(gf, b128);
|
||||
}
|
||||
|
||||
p_i = zero;
|
||||
a[0] = a128[0];
|
||||
a[1] = a128[1];
|
||||
|
||||
t_m = 0;
|
||||
i_r = 0;
|
||||
|
||||
/* Top 64 bits */
|
||||
for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
|
||||
i_m = (a[0] >> (i * g_m)) & mask_m;
|
||||
i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
|
||||
|
||||
shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
|
||||
shiftbits >>= 32-g_m;
|
||||
p_i = _mm_slli_epi64(p_i, g_m);
|
||||
p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
|
||||
|
||||
p_i = _mm_xor_si128(p_i, m_table[i_m]);
|
||||
t_m += g_m;
|
||||
if (t_m == g_r) {
|
||||
p_i = _mm_xor_si128(p_i, r_table[i_r]);
|
||||
t_m = 0;
|
||||
i_r = 0;
|
||||
} else {
|
||||
i_r <<= g_m;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
|
||||
i_m = (a[1] >> (i * g_m)) & mask_m;
|
||||
i_r ^= (((uint64_t)_mm_extract_epi64(p_i,1)) >> (64 - g_m)) & mask_r;
|
||||
|
||||
shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
|
||||
shiftbits >>= 32-g_m;
|
||||
p_i = _mm_slli_epi64(p_i, g_m);
|
||||
p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
|
||||
|
||||
p_i = _mm_xor_si128(p_i, m_table[i_m]);
|
||||
t_m += g_m;
|
||||
if (t_m == g_r) {
|
||||
p_i = _mm_xor_si128(p_i, r_table[i_r]);
|
||||
t_m = 0;
|
||||
i_r = 0;
|
||||
} else {
|
||||
i_r <<= g_m;
|
||||
}
|
||||
}
|
||||
c128[0] = _mm_extract_epi64(p_i, 1);
|
||||
c128[1] = _mm_extract_epi64(p_i, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
static
|
||||
void
|
||||
gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
|
||||
|
@ -1273,160 +1152,6 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
|
|||
}
|
||||
}
|
||||
|
||||
static
|
||||
void
|
||||
gf_w128_group_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
|
||||
{
|
||||
#if defined(INTEL_SSE4)
|
||||
int i;
|
||||
int i_r, i_m, t_m;
|
||||
int mask_m, mask_r, mask_s;
|
||||
int g_m, g_r;
|
||||
uint32_t shiftbits;
|
||||
uint64_t a[2];
|
||||
gf_internal_t *scratch;
|
||||
gf_group_tables_t *gt;
|
||||
gf_region_data rd;
|
||||
uint64_t *a128, *c128, *top;
|
||||
__m128i *m_table, *r_table, p_i, zero;
|
||||
zero = _mm_setzero_si128();
|
||||
/* We only do this to check on alignment. */
|
||||
gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
|
||||
|
||||
if (val[0] == 0) {
|
||||
if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
|
||||
if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
|
||||
}
|
||||
|
||||
scratch = (gf_internal_t *) gf->scratch;
|
||||
gt = scratch->private;
|
||||
m_table = (__m128i *)(gt->m_table);
|
||||
r_table = (__m128i *)(gt->r_table);
|
||||
g_m = scratch->arg1;
|
||||
g_r = scratch->arg2;
|
||||
|
||||
mask_m = (1 << g_m) - 1;
|
||||
mask_r = (1 << g_r) - 1;
|
||||
mask_s = mask_m << (32-g_m);
|
||||
|
||||
if (val[0] != _mm_extract_epi64(m_table[1], 1) || val[1] != _mm_extract_epi64(m_table[1], 0)) {
|
||||
gf_w128_group_m_sse_init(gf, val);
|
||||
}
|
||||
|
||||
a128 = (uint64_t *) src;
|
||||
c128 = (uint64_t *) dest;
|
||||
top = (uint64_t *) rd.d_top;
|
||||
|
||||
if (xor){
|
||||
while (c128 < top) {
|
||||
p_i = zero;
|
||||
a[0] = a128[0];
|
||||
a[1] = a128[1];
|
||||
|
||||
t_m = 0;
|
||||
i_r = 0;
|
||||
/* Top 64 bits */
|
||||
for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
|
||||
i_m = (a[0] >> (i * g_m)) & mask_m;
|
||||
i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
|
||||
|
||||
shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
|
||||
shiftbits >>= 32-g_m;
|
||||
p_i = _mm_slli_epi64(p_i, g_m);
|
||||
p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
|
||||
p_i = _mm_xor_si128(p_i, m_table[i_m]);
|
||||
t_m += g_m;
|
||||
if (t_m == g_r) {
|
||||
p_i = _mm_xor_si128(p_i, r_table[i_r]);
|
||||
t_m = 0;
|
||||
i_r = 0;
|
||||
} else {
|
||||
i_r <<= g_m;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
|
||||
i_m = (a[1] >> (i * g_m)) & mask_m;
|
||||
i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
|
||||
|
||||
shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
|
||||
shiftbits >>= 32-g_m;
|
||||
p_i = _mm_slli_epi64(p_i, g_m);
|
||||
p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
|
||||
|
||||
p_i = _mm_xor_si128(p_i, m_table[i_m]);
|
||||
t_m += g_m;
|
||||
if (t_m == g_r) {
|
||||
p_i = _mm_xor_si128(p_i, r_table[i_r]);
|
||||
t_m = 0;
|
||||
i_r = 0;
|
||||
} else {
|
||||
i_r <<= g_m;
|
||||
}
|
||||
}
|
||||
|
||||
c128[0] ^= _mm_extract_epi64(p_i, 1);
|
||||
c128[1] ^= _mm_extract_epi64(p_i, 0);
|
||||
a128 += 2;
|
||||
c128 += 2;
|
||||
}
|
||||
}else{
|
||||
while (c128 < top) {
|
||||
p_i = zero;
|
||||
a[0] = a128[0];
|
||||
a[1] = a128[1];
|
||||
|
||||
t_m = 0;
|
||||
i_r = 0;
|
||||
/* Top 64 bits */
|
||||
for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
|
||||
i_m = (a[0] >> (i * g_m)) & mask_m;
|
||||
i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
|
||||
|
||||
shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
|
||||
shiftbits >>= 32-g_m;
|
||||
p_i = _mm_slli_epi64(p_i, g_m);
|
||||
p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
|
||||
p_i = _mm_xor_si128(p_i, m_table[i_m]);
|
||||
t_m += g_m;
|
||||
if (t_m == g_r) {
|
||||
p_i = _mm_xor_si128(p_i, r_table[i_r]);
|
||||
t_m = 0;
|
||||
i_r = 0;
|
||||
} else {
|
||||
i_r <<= g_m;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
|
||||
i_m = (a[1] >> (i * g_m)) & mask_m;
|
||||
i_r ^= ((uint64_t)_mm_extract_epi64(p_i, 1) >> (64 - g_m)) & mask_r;
|
||||
|
||||
shiftbits = _mm_extract_epi32(p_i, 1) & mask_s;
|
||||
shiftbits >>= 32-g_m;
|
||||
p_i = _mm_slli_epi64(p_i, g_m);
|
||||
p_i = _mm_xor_si128(p_i, _mm_insert_epi32(zero, shiftbits, 2));
|
||||
|
||||
p_i = _mm_xor_si128(p_i, m_table[i_m]);
|
||||
t_m += g_m;
|
||||
if (t_m == g_r) {
|
||||
p_i = _mm_xor_si128(p_i, r_table[i_r]);
|
||||
t_m = 0;
|
||||
i_r = 0;
|
||||
} else {
|
||||
i_r <<= g_m;
|
||||
}
|
||||
}
|
||||
|
||||
c128[0] = _mm_extract_epi64(p_i, 1);
|
||||
c128[1] = _mm_extract_epi64(p_i, 0);
|
||||
a128 += 2;
|
||||
c128 += 2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* a^-1 -> b */
|
||||
void
|
||||
gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
|
||||
|
@ -1880,28 +1605,7 @@ int gf_w128_group_init(gf_t *gf)
|
|||
gf->inverse.w128 = gf_w128_euclid;
|
||||
gf->multiply_region.w128 = gf_w128_group_multiply_region;
|
||||
|
||||
/* JSP: I've got a problem compiling here -- something about "vmovq", and
|
||||
I don't have the time to chase it down right now. */
|
||||
|
||||
#if defined(INTEL_SSE4) && defined(ARCH_64)
|
||||
if(!(scratch->region_type & GF_REGION_NOSSE))
|
||||
{
|
||||
if ((g_m != 4) && ((g_r != 4) || (g_r != 8)))
|
||||
return 0;
|
||||
gt->r_table = (void *)(((uint64_t)gt->r_table + 15) & (~0xfULL)); /* aligns gt->r_table on a 16-bit boundary*/
|
||||
gt->m_table = gt->r_table + 2*size_r;
|
||||
gt->m_table[2] = 0;
|
||||
gt->m_table[3] = 0;
|
||||
gf->multiply.w128 = gf_w128_group_sse_multiply;
|
||||
gf->multiply_region.w128 = gf_w128_group_sse_multiply_region;
|
||||
gf_w128_group_r_sse_init(gf);
|
||||
}
|
||||
else
|
||||
gf_w128_group_r_init(gf);
|
||||
#else
|
||||
if(scratch->region_type & GF_REGION_SSE) return 0;
|
||||
else gf_w128_group_r_init(gf);
|
||||
#endif
|
||||
gf_w128_group_r_init(gf);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
|
|
@ -20,8 +20,8 @@
|
|||
#define BNMULTS (8)
|
||||
static char *BMULTS[BNMULTS] = { "CARRY_FREE", "GROUP48",
|
||||
"TABLE", "LOG", "SPLIT4", "SPLIT8", "SPLIT88", "COMPOSITE" };
|
||||
#define NMULTS (15)
|
||||
static char *MULTS[NMULTS] = { "CARRY_FREE", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
|
||||
#define NMULTS (16)
|
||||
static char *MULTS[NMULTS] = { "SHIFT", "CARRY_FREE", "GROUP44", "GROUP48", "BYTWO_p", "BYTWO_b",
|
||||
"TABLE", "LOG", "LOG_ZERO", "LOG_ZERO_EXT", "SPLIT2",
|
||||
"SPLIT4", "SPLIT8", "SPLIT16", "SPLIT88", "COMPOSITE" };
|
||||
|
||||
|
|
Loading…
Reference in New Issue