diff --git a/src/gf.c b/src/gf.c
index 835fb12..b9caa26 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -219,7 +219,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
 #endif
 
 #ifdef ARM_NEON
-  pclmul = 1;
+  pclmul = (w == 4 || w == 8);
   sse3 = 1;
 #endif
 
diff --git a/src/neon/gf_w16_neon.c b/src/neon/gf_w16_neon.c
index 95bfd80..2bd3f30 100644
--- a/src/neon/gf_w16_neon.c
+++ b/src/neon/gf_w16_neon.c
@@ -46,7 +46,11 @@
 #include <arm_neon.h>
 #include "gf_w16.h"
 
-#ifdef ARCH_AARCH64
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
 static
 inline
 void
@@ -56,23 +60,32 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
 {
   unsigned i;
   uint8_t *high = tbl + 4 * 16;
-  uint16x8_t va0, va1, r0, r1;
   uint8x16_t loset, rl, rh;
   uint8x16x2_t va;
 
+#ifdef ARCH_AARCH64
   uint8x16_t tbl_h[4], tbl_l[4];
   for (i = 0; i < 4; i++) {
       tbl_l[i] = vld1q_u8(tbl + i*16);
      tbl_h[i] = vld1q_u8(high + i*16);
   }
+#else
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+      tbl_h[i].val[0] = vld1_u8(high + i*16);
+      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+  }
+#endif
 
   loset = vdupq_n_u8(0xf);
 
-  while (dst < d_end) {
-      va0 = vld1q_u16(src);
-      va1 = vld1q_u16(src + 8);
-
-      va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
+  if (xor) {
+    uint8x16x2_t vb;
+    while (dst < d_end) {
+      va = vld2q_u8((uint8_t*)src);
+      vb = vld2q_u8((uint8_t*)dst);
 
       rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
       rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
@@ -84,24 +97,38 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
 
       rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
       rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
-      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
-      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+      va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+      va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
 
-      va = vtrnq_u8(rl, rh);
-      r0 = vreinterpretq_u16_u8(va.val[0]);
-      r1 = vreinterpretq_u16_u8(va.val[1]);
-
-      if (xor) {
-          va0 = vld1q_u16(dst);
-          va1 = vld1q_u16(dst + 8);
-          r0 = veorq_u16(r0, va0);
-          r1 = veorq_u16(r1, va1);
-      }
-      vst1q_u16(dst, r0);
-      vst1q_u16(dst + 8, r1);
+      va.val[0] = veorq_u8(va.val[0], vb.val[0]);
+      va.val[1] = veorq_u8(va.val[1], vb.val[1]);
+      vst2q_u8((uint8_t*)dst, va);
 
       src += 16;
       dst += 16;
+    }
+  } else {
+    while (dst < d_end) {
+      va = vld2q_u8((uint8_t*)src);
+
+      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
+      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
+
+      va.val[0] = vshrq_n_u8(va.val[0], 4);
+      va.val[1] = vshrq_n_u8(va.val[1], 4);
+
+      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
+      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
+      va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+      va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
+
+      vst2q_u8((uint8_t*)dst, va);
+
+      src += 16;
+      dst += 16;
+    }
   }
 }
 
@@ -118,10 +145,21 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
   uint8x16_t vh, vl, rh, rl;
   uint8x16_t loset;
 
+#ifdef ARCH_AARCH64
   uint8x16_t tbl_h[4], tbl_l[4];
+#else
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+#endif
   for (i = 0; i < 4; i++) {
+#ifdef ARCH_AARCH64
       tbl_l[i] = vld1q_u8(tbl + i*16);
       tbl_h[i] = vld1q_u8(high + i*16);
+#else
+      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+      tbl_h[i].val[0] = vld1_u8(high + i*16);
+      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+#endif
   }
 
   loset = vdupq_n_u8(0xf);
@@ -157,125 +195,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
     }
 }
 
-#else /* ARCH_AARCH64 */
-static
-inline
-void
-neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
-                                 uint16_t *d_end, uint8_t *tbl,
-                                 gf_val_32_t val, int xor)
-{
-  unsigned i;
-  uint8_t *high = tbl + 4 * 16;
-  uint16x8_t va, r;
-  uint8x8_t loset, vb, vc, rl, rh;
-
-  uint8x8x2_t tbl_h[4], tbl_l[4];
-  for (i = 0; i < 4; i++) {
-      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
-      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
-      tbl_h[i].val[0] = vld1_u8(high + i*16);
-      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
-  }
-
-  loset = vdup_n_u8(0xf);
-
-  while (dst < d_end) {
-      va = vld1q_u16(src);
-
-      vb = vmovn_u16(va);
-      vc = vshrn_n_u16(va, 8);
-
-      rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
-      rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
-      vb = vshr_n_u8(vb, 4);
-      rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
-      rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
-      vc = vshr_n_u8(vc, 4);
-      rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
-      rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
-      rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
-      rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
-
-      r = vmovl_u8(rl);
-      r = vorrq_u16(r, vshll_n_u8(rh, 8));
-
-      if (xor) {
-          va = vld1q_u16(dst);
-          r = veorq_u16(r, va);
-      }
-      vst1q_u16(dst, r);
-
-      src += 8;
-      dst += 8;
-  }
-}
-
-static
-inline
-void
-neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
-                                        uint8_t *dst, uint8_t *d_end,
-                                        uint8_t *tbl, gf_val_32_t val,
-                                        int xor)
-{
-  unsigned i;
-  uint8_t *high = tbl + 4 * 16;
-  uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3;
-  uint8x8_t loset;
-
-  uint8x8x2_t tbl_h[4], tbl_l[4];
-  for (i = 0; i < 4; i++) {
-      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
-      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
-      tbl_h[i].val[0] = vld1_u8(high + i*16);
-      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
-  }
-
-  loset = vdup_n_u8(0xf);
-
-  while (dst < d_end) {
-      vh0 = vld1_u8(src);
-      vh1 = vld1_u8(src + 8);
-      vl0 = vld1_u8(src + 16);
-      vl1 = vld1_u8(src + 24);
-
-      r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));
-      r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));
-      r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));
-      r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset));
-
-      vh0 = vshr_n_u8(vh0, 4);
-      vh1 = vshr_n_u8(vh1, 4);
-      vl0 = vshr_n_u8(vl0, 4);
-      vl1 = vshr_n_u8(vl1, 4);
-
-      r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));
-      r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));
-      r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));
-      r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1));
-
-      if (xor) {
-          vh0 = vld1_u8(dst);
-          vh1 = vld1_u8(dst + 8);
-          vl0 = vld1_u8(dst + 16);
-          vl1 = vld1_u8(dst + 24);
-          r0 = veor_u8(r0, vh0);
-          r1 = veor_u8(r1, vh1);
-          r2 = veor_u8(r2, vl0);
-          r3 = veor_u8(r3, vl1);
-      }
-      vst1_u8(dst, r0);
-      vst1_u8(dst + 8, r1);
-      vst1_u8(dst + 16, r2);
-      vst1_u8(dst + 24, r3);
-
-      src += 32;
-      dst += 32;
-  }
-}
-#endif /* ARCH_AARCH64 */
 
 static
 inline
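
The gf.c hunk stops advertising carry-less multiply on NEON except for w=4 and
w=8, presumably the only widths with a NEON carry-free multiply path. The key
to unifying the two w=16 implementations is the vqtbl1q_u8 shim: AArch64 has a
native 16-byte table lookup across a full 128-bit register, while ARMv7 NEON
only offers vtbl2_u8, an 8-index lookup in a pair of 64-bit table registers.
A minimal sketch of what the macro computes, written as a plain function (the
name tbl16_lookup is illustrative, not from the patch):

    #include <arm_neon.h>

    /* ARMv7 stand-in for the AArch64 vqtbl1q_u8 intrinsic: look up all 16
     * byte indices of idx in a 16-byte table held as a uint8x8x2_t pair.
     * vtbl2_u8 handles 8 indices at a time, so process the two halves of
     * the 128-bit index vector separately and recombine the results.
     * Both VTBL and the AArch64 TBL return 0 for out-of-range indices,
     * and the indices in this file are 4-bit nibbles anyway, so the
     * emulation is exact. */
    static inline uint8x16_t
    tbl16_lookup(uint8x8x2_t tbl, uint8x16_t idx)
    {
        return vcombine_u8(vtbl2_u8(tbl, vget_low_u8(idx)),
                           vtbl2_u8(tbl, vget_high_u8(idx)));
    }

This is also why the table-loading code stays guarded by ARCH_AARCH64: the
same vqtbl1q_u8 spelling takes a uint8x16_t table natively on AArch64 but a
uint8x8x2_t table under the macro.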
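The other change in the main loop replaces the vld1q_u16 loads plus
vtrnq_u8/vreinterpretq byte transposes with vld2q_u8/vst2q_u8 structure loads
and stores, and hoists the xor test out of the inner loop into two specialized
copies. On a little-endian target, vld2q_u8 over an array of uint16_t
de-interleaves each word directly into val[0] (low bytes) and val[1] (high
bytes), which is exactly the layout the nibble-indexed lookups want. A
standalone, little-endian-only demo of that layout (hypothetical, not part of
the patch):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t words[16];
        unsigned i;

        /* word i has high byte 0xa0+i and low byte i */
        for (i = 0; i < 16; i++)
            words[i] = (uint16_t)(((0xa0 + i) << 8) | i);

        /* de-interleave: even-offset bytes -> val[0], odd -> val[1] */
        uint8x16x2_t v = vld2q_u8((const uint8_t *)words);

        uint8_t lo[16], hi[16];
        vst1q_u8(lo, v.val[0]);
        vst1q_u8(hi, v.val[1]);
        printf("lo[3]=%02x hi[3]=%02x\n", lo[3], hi[3]); /* 03 a3 */
        return 0;
    }

vst2q_u8 performs the inverse interleave on the way out, so the result never
has to be transposed back by hand.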