From cb48f3654e290ee5d7cbf1fb31888463fa2a180c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 7 Nov 2022 19:48:14 +1100 Subject: [PATCH] tcg: Add guest load/store primitives for TCGv_i128 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These are not yet considering atomicity of the 16-byte value; this is a direct replacement for the current target code which uses a pair of 8-byte operations. Reviewed-by: Alex Bennée Signed-off-by: Richard Henderson --- accel/tcg/cputlb.c | 112 +++++++++++++++++++++++++++++++++ accel/tcg/user-exec.c | 66 ++++++++++++++++++++ include/exec/cpu_ldst.h | 10 +++ include/tcg/tcg-op.h | 2 + tcg/tcg-op.c | 134 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 324 insertions(+) diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index 04e270742e..4812d83961 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -2192,6 +2192,64 @@ uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr, return cpu_load_helper(env, addr, oi, ra, helper_le_ldq_mmu); } +Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + int mmu_idx = get_mmuidx(oi); + MemOpIdx new_oi; + unsigned a_bits; + uint64_t h, l; + + tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128)); + a_bits = get_alignment_bits(mop); + + /* Handle CPU specific unaligned behaviour */ + if (addr & ((1 << a_bits) - 1)) { + cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_LOAD, + mmu_idx, ra); + } + + /* Construct an unaligned 64-bit replacement MemOpIdx. */ + mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN; + new_oi = make_memop_idx(mop, mmu_idx); + + h = helper_be_ldq_mmu(env, addr, new_oi, ra); + l = helper_be_ldq_mmu(env, addr + 8, new_oi, ra); + + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + return int128_make128(l, h); +} + +Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + int mmu_idx = get_mmuidx(oi); + MemOpIdx new_oi; + unsigned a_bits; + uint64_t h, l; + + tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_LE|MO_128)); + a_bits = get_alignment_bits(mop); + + /* Handle CPU specific unaligned behaviour */ + if (addr & ((1 << a_bits) - 1)) { + cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_LOAD, + mmu_idx, ra); + } + + /* Construct an unaligned 64-bit replacement MemOpIdx. */ + mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN; + new_oi = make_memop_idx(mop, mmu_idx); + + l = helper_le_ldq_mmu(env, addr, new_oi, ra); + h = helper_le_ldq_mmu(env, addr + 8, new_oi, ra); + + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + return int128_make128(l, h); +} + /* * Store Helpers */ @@ -2546,6 +2604,60 @@ void cpu_stq_le_mmu(CPUArchState *env, target_ulong addr, uint64_t val, cpu_store_helper(env, addr, val, oi, retaddr, helper_le_stq_mmu); } +void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + int mmu_idx = get_mmuidx(oi); + MemOpIdx new_oi; + unsigned a_bits; + + tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128)); + a_bits = get_alignment_bits(mop); + + /* Handle CPU specific unaligned behaviour */ + if (addr & ((1 << a_bits) - 1)) { + cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_STORE, + mmu_idx, ra); + } + + /* Construct an unaligned 64-bit replacement MemOpIdx. */ + mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN; + new_oi = make_memop_idx(mop, mmu_idx); + + helper_be_stq_mmu(env, addr, int128_gethi(val), new_oi, ra); + helper_be_stq_mmu(env, addr + 8, int128_getlo(val), new_oi, ra); + + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); +} + +void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val, + MemOpIdx oi, uintptr_t ra) +{ + MemOp mop = get_memop(oi); + int mmu_idx = get_mmuidx(oi); + MemOpIdx new_oi; + unsigned a_bits; + + tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_LE|MO_128)); + a_bits = get_alignment_bits(mop); + + /* Handle CPU specific unaligned behaviour */ + if (addr & ((1 << a_bits) - 1)) { + cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_STORE, + mmu_idx, ra); + } + + /* Construct an unaligned 64-bit replacement MemOpIdx. */ + mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN; + new_oi = make_memop_idx(mop, mmu_idx); + + helper_le_stq_mmu(env, addr, int128_getlo(val), new_oi, ra); + helper_le_stq_mmu(env, addr + 8, int128_gethi(val), new_oi, ra); + + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); +} + #include "ldst_common.c.inc" /* diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index a8eb63ab96..ae67d84638 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -1031,6 +1031,42 @@ uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr, return ret; } +Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + void *haddr; + Int128 ret; + + validate_memop(oi, MO_128 | MO_BE); + haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); + memcpy(&ret, haddr, 16); + clear_helper_retaddr(); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + + if (!HOST_BIG_ENDIAN) { + ret = bswap128(ret); + } + return ret; +} + +Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra) +{ + void *haddr; + Int128 ret; + + validate_memop(oi, MO_128 | MO_LE); + haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD); + memcpy(&ret, haddr, 16); + clear_helper_retaddr(); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R); + + if (HOST_BIG_ENDIAN) { + ret = bswap128(ret); + } + return ret; +} + void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val, MemOpIdx oi, uintptr_t ra) { @@ -1115,6 +1151,36 @@ void cpu_stq_le_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); } +void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, + Int128 val, MemOpIdx oi, uintptr_t ra) +{ + void *haddr; + + validate_memop(oi, MO_128 | MO_BE); + haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); + if (!HOST_BIG_ENDIAN) { + val = bswap128(val); + } + memcpy(haddr, &val, 16); + clear_helper_retaddr(); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); +} + +void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, + Int128 val, MemOpIdx oi, uintptr_t ra) +{ + void *haddr; + + validate_memop(oi, MO_128 | MO_LE); + haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE); + if (HOST_BIG_ENDIAN) { + val = bswap128(val); + } + memcpy(haddr, &val, 16); + clear_helper_retaddr(); + qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W); +} + uint32_t cpu_ldub_code(CPUArchState *env, abi_ptr ptr) { uint32_t ret; diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h index d0c7c0d5fe..09b55cc0ee 100644 --- a/include/exec/cpu_ldst.h +++ b/include/exec/cpu_ldst.h @@ -220,6 +220,11 @@ uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr ptr, uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr ptr, MemOpIdx oi, uintptr_t ra); +Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra); +Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr, + MemOpIdx oi, uintptr_t ra); + void cpu_stb_mmu(CPUArchState *env, abi_ptr ptr, uint8_t val, MemOpIdx oi, uintptr_t ra); void cpu_stw_be_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val, @@ -235,6 +240,11 @@ void cpu_stl_le_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val, void cpu_stq_le_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val, MemOpIdx oi, uintptr_t ra); +void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val, + MemOpIdx oi, uintptr_t ra); +void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val, + MemOpIdx oi, uintptr_t ra); + uint32_t cpu_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr, uint32_t cmpv, uint32_t newv, MemOpIdx oi, uintptr_t retaddr); diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h index c4276767d1..e5f5b63c37 100644 --- a/include/tcg/tcg-op.h +++ b/include/tcg/tcg-op.h @@ -845,6 +845,8 @@ void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, MemOp); void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, MemOp); void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, MemOp); void tcg_gen_qemu_st_i64(TCGv_i64, TCGv, TCGArg, MemOp); +void tcg_gen_qemu_ld_i128(TCGv_i128, TCGv, TCGArg, MemOp); +void tcg_gen_qemu_st_i128(TCGv_i128, TCGv, TCGArg, MemOp); static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index) { diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index cb83d2375d..33ef325f6e 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -3109,6 +3109,140 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop) } } +static void canonicalize_memop_i128_as_i64(MemOp ret[2], MemOp orig) +{ + MemOp mop_1 = orig, mop_2; + + tcg_debug_assert((orig & MO_SIZE) == MO_128); + tcg_debug_assert((orig & MO_SIGN) == 0); + + /* Use a memory ordering implemented by the host. */ + if (!TCG_TARGET_HAS_MEMORY_BSWAP && (orig & MO_BSWAP)) { + mop_1 &= ~MO_BSWAP; + } + + /* Reduce the size to 64-bit. */ + mop_1 = (mop_1 & ~MO_SIZE) | MO_64; + + /* Retain the alignment constraints of the original. */ + switch (orig & MO_AMASK) { + case MO_UNALN: + case MO_ALIGN_2: + case MO_ALIGN_4: + mop_2 = mop_1; + break; + case MO_ALIGN_8: + /* Prefer MO_ALIGN+MO_64 to MO_ALIGN_8+MO_64. */ + mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN; + mop_2 = mop_1; + break; + case MO_ALIGN: + /* Second has 8-byte alignment; first has 16-byte alignment. */ + mop_2 = mop_1; + mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN_16; + break; + case MO_ALIGN_16: + case MO_ALIGN_32: + case MO_ALIGN_64: + /* Second has 8-byte alignment; first retains original. */ + mop_2 = (mop_1 & ~MO_AMASK) | MO_ALIGN; + break; + default: + g_assert_not_reached(); + } + ret[0] = mop_1; + ret[1] = mop_2; +} + +void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop) +{ + MemOp mop[2]; + TCGv addr_p8; + TCGv_i64 x, y; + + canonicalize_memop_i128_as_i64(mop, memop); + + tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD); + addr = plugin_prep_mem_callbacks(addr); + + /* TODO: respect atomicity of the operation. */ + /* TODO: allow the tcg backend to see the whole operation. */ + + /* + * Since there are no global TCGv_i128, there is no visible state + * changed if the second load faults. Load directly into the two + * subwords. + */ + if ((memop & MO_BSWAP) == MO_LE) { + x = TCGV128_LOW(val); + y = TCGV128_HIGH(val); + } else { + x = TCGV128_HIGH(val); + y = TCGV128_LOW(val); + } + + gen_ldst_i64(INDEX_op_qemu_ld_i64, x, addr, mop[0], idx); + + if ((mop[0] ^ memop) & MO_BSWAP) { + tcg_gen_bswap64_i64(x, x); + } + + addr_p8 = tcg_temp_new(); + tcg_gen_addi_tl(addr_p8, addr, 8); + gen_ldst_i64(INDEX_op_qemu_ld_i64, y, addr_p8, mop[1], idx); + tcg_temp_free(addr_p8); + + if ((mop[0] ^ memop) & MO_BSWAP) { + tcg_gen_bswap64_i64(y, y); + } + + plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx), + QEMU_PLUGIN_MEM_R); +} + +void tcg_gen_qemu_st_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop) +{ + MemOp mop[2]; + TCGv addr_p8; + TCGv_i64 x, y; + + canonicalize_memop_i128_as_i64(mop, memop); + + tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST); + addr = plugin_prep_mem_callbacks(addr); + + /* TODO: respect atomicity of the operation. */ + /* TODO: allow the tcg backend to see the whole operation. */ + + if ((memop & MO_BSWAP) == MO_LE) { + x = TCGV128_LOW(val); + y = TCGV128_HIGH(val); + } else { + x = TCGV128_HIGH(val); + y = TCGV128_LOW(val); + } + + addr_p8 = tcg_temp_new(); + if ((mop[0] ^ memop) & MO_BSWAP) { + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_bswap64_i64(t, x); + gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr, mop[0], idx); + tcg_gen_bswap64_i64(t, y); + tcg_gen_addi_tl(addr_p8, addr, 8); + gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr_p8, mop[1], idx); + tcg_temp_free_i64(t); + } else { + gen_ldst_i64(INDEX_op_qemu_st_i64, x, addr, mop[0], idx); + tcg_gen_addi_tl(addr_p8, addr, 8); + gen_ldst_i64(INDEX_op_qemu_st_i64, y, addr_p8, mop[1], idx); + } + tcg_temp_free(addr_p8); + + plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx), + QEMU_PLUGIN_MEM_W); +} + static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc) { switch (opc & MO_SSIZE) {