Merge branch 'neon_fixes' into 'master'

NEON fixes/tweaks

This merge request fixes some issues in the NEON code and adds some tweaks:

* The SPLIT(16,4) ALTMAP implementation was broken: it only processed half the amount of data. Because the fix makes it do the full amount of work, the corrected implementation is significantly slower than the old code, which is to be expected. Fixes #2
* The SPLIT(16,4) implementations now merge the ARMv8 and older code paths, similar to SPLIT(32,4). This fixes the ALTMAP variant and also gives the non-ALTMAP version consistent region sizing
* Removed an unnecessary VTRN in non-ALTMAP SPLIT(16,4), since NEON can (de)interleave during load/store (see the sketch after this list); because of this, ALTMAP isn't so useful on NEON
  * This could also be done for SPLIT(32,4), but I have not implemented it
* I also hoisted the `if (xor)` conditional of non-ALTMAP SPLIT(16,4) outside the loop (shown in the same sketch). It seems to improve performance a bit on my Cortex-A7
  * It probably should be implemented everywhere else, but I have not done this
* CARRY_FREE was incorrectly enabled for all sizes of w, when it's only available for w=4 and w=8 (see the second sketch below)
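
As a rough sketch of the two non-ALTMAP loop changes above (illustration only, not the library's actual code; `do_lookups` is a placeholder for the per-nibble `vqtbl1q_u8` table lookups): `vld2q_u8` de-interleaves the low and high bytes of each `uint16_t` on load and `vst2q_u8` re-interleaves them on store, so no explicit `VTRN` is needed, and the `xor` test is taken once instead of on every iteration:

```c
#include <arm_neon.h>
#include <stdint.h>

/* Placeholder for the per-nibble table lookups (not the real routine). */
static inline uint8x16x2_t do_lookups(uint8x16x2_t v) { return v; }

static void sketch_region(uint16_t *src, uint16_t *dst, uint16_t *d_end, int xor)
{
  uint8x16x2_t va, vb;
  if (xor) {                            /* branch hoisted out of the hot loop */
    while (dst < d_end) {
      va = vld2q_u8((uint8_t*)src);     /* val[0] = low bytes, val[1] = high bytes */
      vb = vld2q_u8((uint8_t*)dst);
      va = do_lookups(va);
      va.val[0] = veorq_u8(va.val[0], vb.val[0]);
      va.val[1] = veorq_u8(va.val[1], vb.val[1]);
      vst2q_u8((uint8_t*)dst, va);      /* re-interleaves on store; no VTRN required */
      src += 16; dst += 16;
    }
  } else {
    while (dst < d_end) {
      va = do_lookups(vld2q_u8((uint8_t*)src));
      vst2q_u8((uint8_t*)dst, va);
      src += 16; dst += 16;
    }
  }
}
```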

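On the CARRY_FREE point, a minimal illustration (just the intrinsic, not gf-complete's actual multiply path): NEON's carry-less multiply `vmull_p8` only produces 8x8 -> 16-bit polynomial products, so it can back w=4 and w=8 but not the wider field sizes without extra splitting and reduction work, which is why `gf_error_check` now only reports carry-free support for those two sizes:

```c
#include <arm_neon.h>
#include <stdint.h>

/* Sketch: carry-less (polynomial) multiply of two 8-bit values.
 * vmull_p8 performs eight independent 8x8 -> 16-bit products, so a
 * single instruction covers w=4 and w=8; larger w would overflow the
 * 16-bit result and is therefore not advertised as CARRY_FREE. */
static inline uint16_t clmul8(uint8_t a, uint8_t b)
{
  poly16x8_t p = vmull_p8(vdup_n_p8(a), vdup_n_p8(b));
  return vgetq_lane_u16(vreinterpretq_u16_p16(p), 0);
}
```
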
See merge request !16
master
Loic Dachary 2016-09-13 10:34:23 +00:00
commit 51a1abb918
2 changed files with 60 additions and 140 deletions

@@ -219,7 +219,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
#endif
#ifdef ARM_NEON
pclmul = 1;
pclmul = (w == 4 || w == 8);
sse3 = 1;
#endif

@@ -46,7 +4,11 @@
#include <stdlib.h>
#include "gf_w16.h"
#ifdef ARCH_AARCH64
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
vtbl2_u8(tbl, vget_high_u8(v)))
#endif
static
inline
void
@@ -56,23 +60,32 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
uint16x8_t va0, va1, r0, r1;
uint8x16_t loset, rl, rh;
uint8x16x2_t va;
#ifdef ARCH_AARCH64
uint8x16_t tbl_h[4], tbl_l[4];
for (i = 0; i < 4; i++) {
tbl_l[i] = vld1q_u8(tbl + i*16);
tbl_h[i] = vld1q_u8(high + i*16);
}
#else
uint8x8x2_t tbl_h[4], tbl_l[4];
for (i = 0; i < 4; i++) {
tbl_l[i].val[0] = vld1_u8(tbl + i*16);
tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
tbl_h[i].val[0] = vld1_u8(high + i*16);
tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
}
#endif
loset = vdupq_n_u8(0xf);
while (dst < d_end) {
va0 = vld1q_u16(src);
va1 = vld1q_u16(src + 8);
va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
if (xor) {
uint8x16x2_t vb;
while (dst < d_end) {
va = vld2q_u8((uint8_t*)src);
vb = vld2q_u8((uint8_t*)dst);
rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
@@ -84,24 +97,38 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
va = vtrnq_u8(rl, rh);
r0 = vreinterpretq_u16_u8(va.val[0]);
r1 = vreinterpretq_u16_u8(va.val[1]);
if (xor) {
va0 = vld1q_u16(dst);
va1 = vld1q_u16(dst + 8);
r0 = veorq_u16(r0, va0);
r1 = veorq_u16(r1, va1);
}
vst1q_u16(dst, r0);
vst1q_u16(dst + 8, r1);
va.val[0] = veorq_u8(va.val[0], vb.val[0]);
va.val[1] = veorq_u8(va.val[1], vb.val[1]);
vst2q_u8((uint8_t*)dst, va);
src += 16;
dst += 16;
}
} else {
while (dst < d_end) {
va = vld2q_u8((uint8_t*)src);
rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
va.val[0] = vshrq_n_u8(va.val[0], 4);
va.val[1] = vshrq_n_u8(va.val[1], 4);
rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
vst2q_u8((uint8_t*)dst, va);
src += 16;
dst += 16;
}
}
}
@@ -118,10 +145,21 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
uint8x16_t vh, vl, rh, rl;
uint8x16_t loset;
#ifdef ARCH_AARCH64
uint8x16_t tbl_h[4], tbl_l[4];
#else
uint8x8x2_t tbl_h[4], tbl_l[4];
#endif
for (i = 0; i < 4; i++) {
#ifdef ARCH_AARCH64
tbl_l[i] = vld1q_u8(tbl + i*16);
tbl_h[i] = vld1q_u8(high + i*16);
#else
tbl_l[i].val[0] = vld1_u8(tbl + i*16);
tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
tbl_h[i].val[0] = vld1_u8(high + i*16);
tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
#endif
}
loset = vdupq_n_u8(0xf);
@@ -157,125 +195,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
}
}
#else /* ARCH_AARCH64 */
static
inline
void
neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
uint16_t *d_end, uint8_t *tbl,
gf_val_32_t val, int xor)
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
uint16x8_t va, r;
uint8x8_t loset, vb, vc, rl, rh;
uint8x8x2_t tbl_h[4], tbl_l[4];
for (i = 0; i < 4; i++) {
tbl_l[i].val[0] = vld1_u8(tbl + i*16);
tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
tbl_h[i].val[0] = vld1_u8(high + i*16);
tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
}
loset = vdup_n_u8(0xf);
while (dst < d_end) {
va = vld1q_u16(src);
vb = vmovn_u16(va);
vc = vshrn_n_u16(va, 8);
rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
vb = vshr_n_u8(vb, 4);
rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
vc = vshr_n_u8(vc, 4);
rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
r = vmovl_u8(rl);
r = vorrq_u16(r, vshll_n_u8(rh, 8));
if (xor) {
va = vld1q_u16(dst);
r = veorq_u16(r, va);
}
vst1q_u16(dst, r);
src += 8;
dst += 8;
}
}
static
inline
void
neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
uint8_t *dst, uint8_t *d_end,
uint8_t *tbl, gf_val_32_t val,
int xor)
{
unsigned i;
uint8_t *high = tbl + 4 * 16;
uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3;
uint8x8_t loset;
uint8x8x2_t tbl_h[4], tbl_l[4];
for (i = 0; i < 4; i++) {
tbl_l[i].val[0] = vld1_u8(tbl + i*16);
tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
tbl_h[i].val[0] = vld1_u8(high + i*16);
tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
}
loset = vdup_n_u8(0xf);
while (dst < d_end) {
vh0 = vld1_u8(src);
vh1 = vld1_u8(src + 8);
vl0 = vld1_u8(src + 16);
vl1 = vld1_u8(src + 24);
r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));
r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));
r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));
r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset));
vh0 = vshr_n_u8(vh0, 4);
vh1 = vshr_n_u8(vh1, 4);
vl0 = vshr_n_u8(vl0, 4);
vl1 = vshr_n_u8(vl1, 4);
r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));
r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));
r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));
r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1));
if (xor) {
vh0 = vld1_u8(dst);
vh1 = vld1_u8(dst + 8);
vl0 = vld1_u8(dst + 16);
vl1 = vld1_u8(dst + 24);
r0 = veor_u8(r0, vh0);
r1 = veor_u8(r1, vh1);
r2 = veor_u8(r2, vl0);
r3 = veor_u8(r3, vl1);
}
vst1_u8(dst, r0);
vst1_u8(dst + 8, r1);
vst1_u8(dst + 16, r2);
vst1_u8(dst + 24, r3);
src += 32;
dst += 32;
}
}
#endif /* ARCH_AARCH64 */
static
inline