Added clm region multiplication for w=64. I should make this the default,
but I haven't yet. Speed is nice and fast, but not as fast as SPLIT 64 4 SSE,ALTMAP:

UNIX> gf_time 64 R 0 10240 10240 -
Seed: 0
Region-Random: XOR: 0 0.661736 s MB: 100.000 151.118 MB/s
Region-Random: XOR: 1 0.659374 s MB: 100.000 151.659 MB/s
Region-By-Zero: XOR: 0 0.002128 s MB: 100.000 46989.738 MB/s
Region-By-Zero: XOR: 1 0.000248 s MB: 100.000 402911.047 MB/s
Region-By-One: XOR: 0 0.002168 s MB: 100.000 46131.808 MB/s
Region-By-One: XOR: 1 0.003946 s MB: 100.000 25344.758 MB/s
Region-By-Two: XOR: 0 0.377993 s MB: 100.000 264.555 MB/s
Region-By-Two: XOR: 1 0.382269 s MB: 100.000 261.596 MB/s
UNIX> gf_time 64 R 0 10240 10240 SPLIT 64 4 SSE,ALTMAP -
Seed: 0
Region-Random: XOR: 0 0.050045 s MB: 100.000 1998.211 MB/s
Region-Random: XOR: 1 0.049198 s MB: 100.000 2032.597 MB/s
Region-By-Zero: XOR: 0 0.002100 s MB: 100.000 47619.255 MB/s
Region-By-Zero: XOR: 1 0.000260 s MB: 100.000 384445.830 MB/s
Region-By-One: XOR: 0 0.002139 s MB: 100.000 46743.609 MB/s
Region-By-One: XOR: 1 0.003928 s MB: 100.000 25457.053 MB/s
Region-By-Two: XOR: 0 0.048678 s MB: 100.000 2054.330 MB/s
Region-By-Two: XOR: 1 0.048800 s MB: 100.000 2049.161 MB/s
UNIX> gf_time 64 R 0 10240 10240 SHIFT SSE -
Seed: 0
Region-Random: XOR: 0 0.108492 s MB: 100.000 921.724 MB/s
Region-Random: XOR: 1 0.110783 s MB: 100.000 902.663 MB/s
Region-By-Zero: XOR: 0 0.002077 s MB: 100.000 48155.040 MB/s
Region-By-Zero: XOR: 1 0.000254 s MB: 100.000 393461.914 MB/s
Region-By-One: XOR: 0 0.002088 s MB: 100.000 47902.056 MB/s
Region-By-One: XOR: 1 0.003885 s MB: 100.000 25739.822 MB/s
Region-By-Two: XOR: 0 0.107280 s MB: 100.000 932.142 MB/s
Region-By-Two: XOR: 1 0.110782 s MB: 100.000 902.676 MB/s
UNIX>
master
parent
5c719db2b9
commit
b2d6666ed7
100
gf_w64.c
100
gf_w64.c
|
@ -203,7 +203,7 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
gf_internal_t * h = gf->scratch;
|
||||
|
||||
a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
|
||||
b = _mm_insert_epi64 (a, b64, 0);
|
||||
b = _mm_insert_epi64 (a, b64, 0);
|
||||
prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
|
||||
/* Do the initial multiply */
|
||||
result = _mm_clmulepi64_si128 (a, b, 0);
|
||||
|
@ -221,6 +221,92 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
|
|||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
|
||||
{
|
||||
gf_internal_t *h;
|
||||
int i, j, k;
|
||||
uint8_t *s8, *d8, *dtop;
|
||||
uint64_t *s64, *d64;
|
||||
gf_region_data rd;
|
||||
__m128i v, b, m, prim_poly, c, fr, w, result;
|
||||
|
||||
if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
|
||||
if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
|
||||
|
||||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
|
||||
gf_do_initial_region_alignment(&rd);
|
||||
|
||||
s8 = (uint8_t *) rd.s_start;
|
||||
d8 = (uint8_t *) rd.d_start;
|
||||
dtop = (uint8_t *) rd.d_top;
|
||||
|
||||
v = _mm_insert_epi64(_mm_setzero_si128(), val, 0);
|
||||
m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
|
||||
prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
|
||||
|
||||
if (xor) {
|
||||
while (d8 != dtop) {
|
||||
s64 = (uint64_t *) s8;
|
||||
b = _mm_load_si128((__m128i *) s8);
|
||||
result = _mm_clmulepi64_si128 (b, v, 0);
|
||||
c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
|
||||
w = _mm_clmulepi64_si128 (prim_poly, c, 0);
|
||||
result = _mm_xor_si128 (result, w);
|
||||
c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
|
||||
w = _mm_clmulepi64_si128 (prim_poly, c, 0);
|
||||
fr = _mm_xor_si128 (result, w);
|
||||
fr = _mm_and_si128 (fr, m);
|
||||
|
||||
result = _mm_clmulepi64_si128 (b, v, 1);
|
||||
c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
|
||||
w = _mm_clmulepi64_si128 (prim_poly, c, 0);
|
||||
result = _mm_xor_si128 (result, w);
|
||||
c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
|
||||
w = _mm_clmulepi64_si128 (prim_poly, c, 0);
|
||||
result = _mm_xor_si128 (result, w);
|
||||
result = _mm_slli_si128 (result, 8);
|
||||
fr = _mm_xor_si128 (result, fr);
|
||||
result = _mm_load_si128((__m128i *) d8);
|
||||
fr = _mm_xor_si128 (result, fr);
|
||||
|
||||
_mm_store_si128((__m128i *) d8, fr);
|
||||
d8 += 16;
|
||||
s8 += 16;
|
||||
}
|
||||
} else {
|
||||
while (d8 < dtop) {
|
||||
s64 = (uint64_t *) s8;
|
||||
b = _mm_load_si128((__m128i *) s8);
|
||||
result = _mm_clmulepi64_si128 (b, v, 0);
|
||||
c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
|
||||
w = _mm_clmulepi64_si128 (prim_poly, c, 0);
|
||||
result = _mm_xor_si128 (result, w);
|
||||
c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
|
||||
w = _mm_clmulepi64_si128 (prim_poly, c, 0);
|
||||
fr = _mm_xor_si128 (result, w);
|
||||
fr = _mm_and_si128 (fr, m);
|
||||
|
||||
result = _mm_clmulepi64_si128 (b, v, 1);
|
||||
c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
|
||||
w = _mm_clmulepi64_si128 (prim_poly, c, 0);
|
||||
result = _mm_xor_si128 (result, w);
|
||||
c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
|
||||
w = _mm_clmulepi64_si128 (prim_poly, c, 0);
|
||||
result = _mm_xor_si128 (result, w);
|
||||
result = _mm_slli_si128 (result, 8);
|
||||
fr = _mm_xor_si128 (result, fr);
|
||||
|
||||
_mm_store_si128((__m128i *) d8, fr);
|
||||
d8 += 16;
|
||||
s8 += 16;
|
||||
}
|
||||
}
|
||||
gf_do_final_region_alignment(&rd);
|
||||
}
|
||||
|
||||
void
|
||||
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
|
||||
{
|
||||
|
@ -238,7 +324,7 @@ gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t
|
|||
|
||||
ld = (struct gf_split_4_64_lazy_data *) h->private;
|
||||
|
||||
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
|
||||
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
|
||||
gf_do_initial_region_alignment(&rd);
|
||||
|
||||
if (ld->last_value != val) {
|
||||
|
@ -416,12 +502,14 @@ int gf_w64_shift_init(gf_t *gf)
|
|||
h = (gf_internal_t *) gf->scratch;
|
||||
|
||||
gf->multiply.w64 = gf_w64_shift_multiply;
|
||||
#ifdef INTEL_PCLMUL
|
||||
if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply;
|
||||
#endif
|
||||
|
||||
gf->inverse.w64 = gf_w64_euclid;
|
||||
gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
|
||||
|
||||
#ifdef INTEL_PCLMUL
|
||||
if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply;
|
||||
if (h->region_type != GF_REGION_NOSSE) gf->multiply_region.w64 = gf_w64_clm_multiply_region;
|
||||
#endif
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue