Added clm region multiplication for w=64. I should make this the default,

but I haven't yet.  Speed is nice and fast, but not as fast as SPLIT 64 4 SSE,ALTMAP:

UNIX> gf_time 64 R 0 10240 10240 -
Seed: 0
 Region-Random: XOR: 0      0.661736 s     MB:    100.000       151.118 MB/s
 Region-Random: XOR: 1      0.659374 s     MB:    100.000       151.659 MB/s
Region-By-Zero: XOR: 0      0.002128 s     MB:    100.000     46989.738 MB/s
Region-By-Zero: XOR: 1      0.000248 s     MB:    100.000    402911.047 MB/s
 Region-By-One: XOR: 0      0.002168 s     MB:    100.000     46131.808 MB/s
 Region-By-One: XOR: 1      0.003946 s     MB:    100.000     25344.758 MB/s
 Region-By-Two: XOR: 0      0.377993 s     MB:    100.000       264.555 MB/s
 Region-By-Two: XOR: 1      0.382269 s     MB:    100.000       261.596 MB/s
UNIX> gf_time 64 R 0 10240 10240 SPLIT 64 4 SSE,ALTMAP -
Seed: 0
 Region-Random: XOR: 0      0.050045 s     MB:    100.000      1998.211 MB/s
 Region-Random: XOR: 1      0.049198 s     MB:    100.000      2032.597 MB/s
Region-By-Zero: XOR: 0      0.002100 s     MB:    100.000     47619.255 MB/s
Region-By-Zero: XOR: 1      0.000260 s     MB:    100.000    384445.830 MB/s
 Region-By-One: XOR: 0      0.002139 s     MB:    100.000     46743.609 MB/s
 Region-By-One: XOR: 1      0.003928 s     MB:    100.000     25457.053 MB/s
 Region-By-Two: XOR: 0      0.048678 s     MB:    100.000      2054.330 MB/s
 Region-By-Two: XOR: 1      0.048800 s     MB:    100.000      2049.161 MB/s
UNIX> gf_time 64 R 0 10240 10240 SHIFT SSE -
Seed: 0
 Region-Random: XOR: 0      0.108492 s     MB:    100.000       921.724 MB/s
 Region-Random: XOR: 1      0.110783 s     MB:    100.000       902.663 MB/s
Region-By-Zero: XOR: 0      0.002077 s     MB:    100.000     48155.040 MB/s
Region-By-Zero: XOR: 1      0.000254 s     MB:    100.000    393461.914 MB/s
 Region-By-One: XOR: 0      0.002088 s     MB:    100.000     47902.056 MB/s
 Region-By-One: XOR: 1      0.003885 s     MB:    100.000     25739.822 MB/s
 Region-By-Two: XOR: 0      0.107280 s     MB:    100.000       932.142 MB/s
 Region-By-Two: XOR: 1      0.110782 s     MB:    100.000       902.676 MB/s
UNIX>
master
Jim Plank 2013-03-04 17:02:24 -05:00
parent 5c719db2b9
commit b2d6666ed7
2 changed files with 6179 additions and 116 deletions

100
gf_w64.c
View File

@ -203,7 +203,7 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
gf_internal_t * h = gf->scratch;
a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
b = _mm_insert_epi64 (a, b64, 0);
b = _mm_insert_epi64 (a, b64, 0);
prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
/* Do the initial multiply */
result = _mm_clmulepi64_si128 (a, b, 0);
@ -221,6 +221,92 @@ gf_w64_clm_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
#endif
}
#ifdef INTEL_PCLMUL
/*
 * Fold the upper 64 bits of a 128-bit carry-less product back into the
 * lower 64 bits, in two passes.
 *
 *   product   - 128-bit result of a carry-less multiply.
 *   prim_poly - vector whose low 64-bit lane holds the reduction polynomial
 *               (the caller loads only the low 32 bits of h->prim_poly).
 *
 * Returns the product XORed with both correction terms.  Only the low
 * 64 bits of the return value are meant to be used; the caller masks or
 * shifts away the upper half.
 */
static __m128i
gf_w64_clm_reduce (__m128i product, __m128i prim_poly)
{
  __m128i c, w;

  /* Pass 1: upper half of the product with its 32-bit lane 0 cleared. */
  c = _mm_insert_epi32 (_mm_srli_si128 (product, 8), 0, 0);
  w = _mm_clmulepi64_si128 (prim_poly, c, 0);
  product = _mm_xor_si128 (product, w);

  /* Pass 2: upper half of the updated product with its 32-bit lane 1 cleared. */
  c = _mm_insert_epi32 (_mm_srli_si128 (product, 8), 0, 1);
  w = _mm_clmulepi64_si128 (prim_poly, c, 0);
  return _mm_xor_si128 (product, w);
}

/*
 * Multiply both 64-bit words held in b by the 64-bit value in the low lane
 * of v and return the two reduced products packed in one 128-bit vector
 * (low word of b -> low lane, high word of b -> high lane).
 *
 *   low_mask - vector with the low 64 bits set and the high 64 bits clear.
 */
static __m128i
gf_w64_clm_mult_block (__m128i b, __m128i v, __m128i prim_poly, __m128i low_mask)
{
  __m128i lo, hi;

  /* Low word: selector 0 multiplies the low qword of b by the low qword of v. */
  lo = gf_w64_clm_reduce (_mm_clmulepi64_si128 (b, v, 0), prim_poly);
  lo = _mm_and_si128 (lo, low_mask);

  /* High word: selector 1 multiplies the high qword of b by the low qword of v. */
  hi = gf_w64_clm_reduce (_mm_clmulepi64_si128 (b, v, 1), prim_poly);
  hi = _mm_slli_si128 (hi, 8);   /* move the reduced word into the high lane */

  return _mm_xor_si128 (hi, lo);
}
#endif

/*
 * Region multiply for w=64 built on the carry-less multiply instruction.
 *
 * Multiplies the 64-bit words of src by val, 16 bytes (two words) per
 * iteration.  When xor is non-zero the products are XORed into dest,
 * otherwise dest is overwritten.
 *
 *   gf    - field descriptor; gf->scratch supplies prim_poly.
 *   src   - source region.
 *   dest  - destination region.
 *   val   - multiplier; 0 and 1 are handed to gf_multby_zero / gf_multby_one.
 *   bytes - region size in bytes.
 *   xor   - non-zero to accumulate into dest, zero to overwrite it.
 *
 * The bytes before rd.d_start and after rd.d_top are left to
 * gf_do_initial_region_alignment / gf_do_final_region_alignment
 * (presumably processed word-by-word there; see their definitions).
 *
 * The body is compiled only when INTEL_PCLMUL is defined, matching the only
 * place in this file that installs the function as a region multiplier.
 */
void
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
#ifdef INTEL_PCLMUL
  gf_internal_t *h;
  uint8_t *s8, *d8, *dtop;
  gf_region_data rd;
  __m128i v, b, m, prim_poly, fr;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;

  /* Request 16-byte alignment so the aligned SSE loads/stores below are valid. */
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;
  dtop = (uint8_t *) rd.d_top;

  v = _mm_insert_epi64(_mm_setzero_si128(), val, 0);
  m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));

  /* Both loops use "<" so a mis-sized region can never run past dtop. */
  if (xor) {
    while (d8 < dtop) {
      b = _mm_load_si128((__m128i *) s8);
      fr = gf_w64_clm_mult_block (b, v, prim_poly, m);
      fr = _mm_xor_si128 (_mm_load_si128((__m128i *) d8), fr);
      _mm_store_si128((__m128i *) d8, fr);
      d8 += 16;
      s8 += 16;
    }
  } else {
    while (d8 < dtop) {
      b = _mm_load_si128((__m128i *) s8);
      fr = gf_w64_clm_mult_block (b, v, prim_poly, m);
      _mm_store_si128((__m128i *) d8, fr);
      d8 += 16;
      s8 += 16;
    }
  }

  gf_do_final_region_alignment(&rd);
#endif
}
void
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
@ -238,7 +324,7 @@ gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t
ld = (struct gf_split_4_64_lazy_data *) h->private;
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
gf_do_initial_region_alignment(&rd);
if (ld->last_value != val) {
@ -416,12 +502,14 @@ int gf_w64_shift_init(gf_t *gf)
h = (gf_internal_t *) gf->scratch;
gf->multiply.w64 = gf_w64_shift_multiply;
#ifdef INTEL_PCLMUL
if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply;
#endif
gf->inverse.w64 = gf_w64_euclid;
gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
#ifdef INTEL_PCLMUL
if (h->region_type != GF_REGION_NOSSE) gf->multiply.w64 = gf_w64_clm_multiply;
if (h->region_type != GF_REGION_NOSSE) gf->multiply_region.w64 = gf_w64_clm_multiply_region;
#endif
return 1;
}

6195
junk.txt

File diff suppressed because it is too large Load Diff