target/arm: Implement fp16 for Neon VMLA, VMLS operations

Convert the Neon floating-point VMLA and VMLS insns over to using a
gvec helper, and use this to implement the fp16 case.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200828183354.27913-31-peter.maydell@linaro.org
master
Peter Maydell 2020-08-28 19:33:39 +01:00
parent e22705bb94
commit e5adc70665
3 changed files with 50 additions and 31 deletions

View File

@ -659,6 +659,12 @@ DEF_HELPER_FLAGS_5(gvec_fmaxnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i3
DEF_HELPER_FLAGS_5(gvec_fminnum_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fminnum_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fmla_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fmla_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fmls_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fmls_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,

View File

@ -1119,37 +1119,8 @@ DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
/*
* For all the functions using this macro, size == 1 means fp16,
* which is an architecture extension we don't implement yet.
*/
#define DO_3S_FP(INSN,FUNC,READS_VD) \
static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
{ \
if (a->size != 0) { \
/* TODO fp16 support */ \
return false; \
} \
return do_3same_fp(s, a, FUNC, READS_VD); \
}
static void gen_VMLA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
TCGv_ptr fpstatus)
{
gen_helper_vfp_muls(vn, vn, vm, fpstatus);
gen_helper_vfp_adds(vd, vd, vn, fpstatus);
}
static void gen_VMLS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
TCGv_ptr fpstatus)
{
gen_helper_vfp_muls(vn, vn, vm, fpstatus);
gen_helper_vfp_subs(vd, vd, vn, fpstatus);
}
DO_3S_FP(VMLA, gen_VMLA_fp_3s, true)
DO_3S_FP(VMLS, gen_VMLS_fp_3s, true)
DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)

View File

@ -842,6 +842,48 @@ DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
#endif
#undef DO_3OP
/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
float_status *stat)
{
return float16_add(dest, float16_mul(op1, op2, stat), stat);
}
static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
return float32_add(dest, float32_mul(op1, op2, stat), stat);
}
static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
float_status *stat)
{
return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}
static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
float_status *stat)
{
return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}
#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
intptr_t i, oprsz = simd_oprsz(desc); \
TYPE *d = vd, *n = vn, *m = vm; \
for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
d[i] = FUNC(d[i], n[i], m[i], stat); \
} \
clear_tail(d, oprsz, simd_maxsz(desc)); \
}
DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
/* For the indexed ops, SVE applies the index per 128-bit vector segment.
* For AdvSIMD, there is of course only one such vector segment.
*/