Merged in jimplank/gf-complete-unit-bytwo (pull request #1)
Unit & w=64 bytwo. Made a few more changes.master
commit
5c719db2b9
|
@ -190,6 +190,8 @@ int main(int argc, char **argv)
|
|||
gf_general_set_zero(a, w);
|
||||
} else if (i < 40) {
|
||||
gf_general_set_one(a, w);
|
||||
} else if (i < 60) {
|
||||
gf_general_set_two(a, w);
|
||||
} else {
|
||||
gf_general_set_random(a, w, 1);
|
||||
}
|
||||
|
|
110
gf_w64.c
110
gf_w64.c
|
@ -57,6 +57,8 @@ gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a)
|
|||
return gf->divide.w64(gf, 1, a);
|
||||
}
|
||||
|
||||
#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); }
|
||||
|
||||
static
|
||||
inline
|
||||
gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
|
||||
|
@ -163,7 +165,7 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
pl ^= bl;
|
||||
pr ^= br;
|
||||
}
|
||||
/* printf("P: %016llx %016llx ", pl, pr); printf("B: %016llx %016llx\n", bl, br); */
|
||||
/* printf("P: %016llx %016llx ", pl, pr); printf("B: %016llx %016llx\n", bl, br); */
|
||||
bl <<= 1;
|
||||
if (br & lbit) bl ^= 1;
|
||||
br <<= 1;
|
||||
|
@ -185,7 +187,6 @@ gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
return pr;
|
||||
}
|
||||
|
||||
#ifdef INTEL_PCLMUL
|
||||
/*
|
||||
* ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
|
||||
*/
|
||||
|
@ -194,6 +195,7 @@ inline
|
|||
gf_val_64_t
|
||||
gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
||||
{
|
||||
#ifdef INTEL_PCLMUL
|
||||
__m128i a, b;
|
||||
__m128i result;
|
||||
__m128i prim_poly;
|
||||
|
@ -201,7 +203,7 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
gf_internal_t * h = gf->scratch;
|
||||
|
||||
a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
|
||||
b = _mm_insert_epi64 (_mm_setzero_si128(), b64, 0);
|
||||
b = _mm_insert_epi64 (a, b64, 0);
|
||||
prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
|
||||
/* Do the initial multiply */
|
||||
result = _mm_clmulepi64_si128 (a, b, 0);
|
||||
|
@ -216,8 +218,8 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
result = _mm_xor_si128 (result, w);
|
||||
|
||||
return ((gf_val_64_t)_mm_extract_epi64(result, 0));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
|
||||
|
@ -409,8 +411,15 @@ gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_
|
|||
static
|
||||
int gf_w64_shift_init(gf_t *gf)
|
||||
{
|
||||
/* gf->multiply.w64 = gf_w64_shift_multiply; */
|
||||
gf->multiply.w64 = gf_w64_clm_multiply;
|
||||
gf_internal_t *h;
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
gf->multiply.w64 = gf_w64_shift_multiply;
|
||||
#ifdef INTEL_PCLMUL
|
||||
if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply;
|
||||
#endif
|
||||
|
||||
gf->inverse.w64 = gf_w64_euclid;
|
||||
gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
|
||||
return 1;
|
||||
|
@ -950,7 +959,6 @@ gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_
|
|||
}
|
||||
gf_do_final_region_alignment(&rd);
|
||||
}
|
||||
#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); }
|
||||
|
||||
#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
|
||||
t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
|
||||
|
@ -1031,6 +1039,72 @@ void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_
|
|||
#endif
|
||||
}
|
||||
|
||||
static
|
||||
void
|
||||
gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd)
|
||||
{
|
||||
#ifdef INTEL_SSE4
|
||||
int i;
|
||||
uint64_t one64, amask;
|
||||
uint8_t *d8, *s8, tb;
|
||||
__m128i pp, m1, m2, t1, t2, va, vb;
|
||||
gf_internal_t *h;
|
||||
|
||||
s8 = (uint8_t *) rd->s_start;
|
||||
d8 = (uint8_t *) rd->d_start;
|
||||
|
||||
h = (gf_internal_t *) rd->gf->scratch;
|
||||
one64 = 1;
|
||||
amask = -1;
|
||||
amask ^= 1;
|
||||
pp = _mm_set1_epi64x(h->prim_poly);
|
||||
m1 = _mm_set1_epi64x(amask);
|
||||
m2 = _mm_set1_epi64x(one64 << 63);
|
||||
|
||||
while (d8 < (uint8_t *) rd->d_top) {
|
||||
va = _mm_load_si128 ((__m128i *)(s8));
|
||||
SSE_AB2(pp, m1, m2, va, t1, t2);
|
||||
vb = _mm_load_si128 ((__m128i *)(d8));
|
||||
vb = _mm_xor_si128(vb, va);
|
||||
_mm_store_si128((__m128i *)d8, vb);
|
||||
d8 += 16;
|
||||
s8 += 16;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static
|
||||
void
|
||||
gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd)
|
||||
{
|
||||
#ifdef INTEL_SSE4
|
||||
int i;
|
||||
uint64_t one64, amask;
|
||||
uint8_t *d8, *s8, tb;
|
||||
__m128i pp, m1, m2, t1, t2, va;
|
||||
gf_internal_t *h;
|
||||
|
||||
s8 = (uint8_t *) rd->s_start;
|
||||
d8 = (uint8_t *) rd->d_start;
|
||||
|
||||
h = (gf_internal_t *) rd->gf->scratch;
|
||||
one64 = 1;
|
||||
amask = -1;
|
||||
amask ^= 1;
|
||||
pp = _mm_set1_epi64x(h->prim_poly);
|
||||
m1 = _mm_set1_epi64x(amask);
|
||||
m2 = _mm_set1_epi64x(one64 << 63);
|
||||
|
||||
while (d8 < (uint8_t *) rd->d_top) {
|
||||
va = _mm_load_si128 ((__m128i *)(s8));
|
||||
SSE_AB2(pp, m1, m2, va, t1, t2);
|
||||
_mm_store_si128((__m128i *)d8, va);
|
||||
d8 += 16;
|
||||
s8 += 16;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static
|
||||
void
|
||||
gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
|
||||
|
@ -1049,6 +1123,16 @@ gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t
|
|||
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
|
||||
gf_do_initial_region_alignment(&rd);
|
||||
|
||||
if (val == 2) {
|
||||
if (xor) {
|
||||
gf_w64_bytwo_b_sse_region_2_xor(&rd);
|
||||
} else {
|
||||
gf_w64_bytwo_b_sse_region_2_noxor(&rd);
|
||||
}
|
||||
gf_do_final_region_alignment(&rd);
|
||||
return;
|
||||
}
|
||||
|
||||
s8 = (uint8_t *) rd.s_start;
|
||||
d8 = (uint8_t *) rd.d_start;
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
@ -1417,8 +1501,13 @@ int gf_w64_split_init(gf_t *gf)
|
|||
|
||||
/* Defaults */
|
||||
gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
|
||||
gf->multiply.w64 = gf_w64_clm_multiply;
|
||||
/* gf->multiply.w64 = gf_w64_shift_multiply; */
|
||||
|
||||
gf->multiply.w64 = gf_w64_shift_multiply;
|
||||
|
||||
#ifdef INTEL_PCLMUL
|
||||
if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply;
|
||||
#endif
|
||||
|
||||
gf->inverse.w64 = gf_w64_euclid;
|
||||
|
||||
if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
|
||||
|
@ -1489,7 +1578,8 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
|
|||
switch(mult_type)
|
||||
{
|
||||
case GF_MULT_SHIFT:
|
||||
if (arg1 != 0 || arg2 != 0 || region_type != 0) return -1;
|
||||
if (arg1 != 0 || arg2 != 0) return -1;
|
||||
if (region_type != GF_REGION_NOSSE && region_type != GF_REGION_SSE && region_type != GF_REGION_DEFAULT) return -1;
|
||||
return sizeof(gf_internal_t);
|
||||
break;
|
||||
case GF_MULT_BYTWO_p:
|
||||
|
|
Loading…
Reference in New Issue