Merge tag 'pull-tcg-20221004' of https://gitlab.com/rth7680/qemu into staging

Cache CPUClass for use in hot code paths.
Add CPUTLBEntryFull, probe_access_full, tlb_set_page_full.
Add generic support for TARGET_TB_PCREL.
tcg/ppc: Optimize 26-bit jumps using STQ for POWER 2.07
target/sh4: Fix TB_FLAG_UNALIGN

# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmM8jXEdHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV/oEggArAHK8FtydfQ4ZwnF
# SjXfpdP50OC0SZn3uBN93FZOrxz9UYG9t1oDHs39J/+b/u2nwJYch//EH2k+NtOW
# hc3iIgS9bWgs/UWZESkViKQccw7gpYlc21Br38WWwFNEFyecX0p+e9pJgld5rSv1
# mRGvCs5J2svH2tcXl/Sb/JWgcumOJoG7qy2aLyJGolR6UOfwcfFMzQXzq8qjpRKH
# Jh84qusE/rLbzBsdN6snJY4+dyvUo03lT5IJ4d+FQg2tUip+Qqt7pnMbsqq6qF6H
# R6fWU1JTbsh7GxXJwQJ83jLBnUsi8cy6FKrZ3jyiBq76+DIpR0PqoEe+PN/weInU
# TN0z4g==
# =RfXJ
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 04 Oct 2022 15:45:53 EDT
# gpg:                using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg:                issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A  05C0 64DF 38E8 AF7E 215F

* tag 'pull-tcg-20221004' of https://gitlab.com/rth7680/qemu:
  target/sh4: Fix TB_FLAG_UNALIGN
  tcg/ppc: Optimize 26-bit jumps
  accel/tcg: Introduce TARGET_TB_PCREL
  accel/tcg: Introduce tb_pc and log_pc
  hw/core: Add CPUClass.get_pc
  include/hw/core: Create struct CPUJumpCache
  accel/tcg: Inline tb_flush_jmp_cache
  accel/tcg: Do not align tb->page_addr[0]
  accel/tcg: Use DisasContextBase in plugin_gen_tb_start
  accel/tcg: Use bool for page_find_alloc
  accel/tcg: Remove PageDesc code_bitmap
  include/exec: Introduce TARGET_PAGE_ENTRY_EXTRA
  accel/tcg: Introduce tlb_set_page_full
  accel/tcg: Introduce probe_access_full
  accel/tcg: Suppress auto-invalidate in probe_access_internal
  accel/tcg: Drop addr member from SavedIOTLB
  accel/tcg: Rename CPUIOTLBEntry to CPUTLBEntryFull
  cputlb: used cached CPUClass in our hot-paths
  hw/core/cpu-sysemu: used cached class in cpu_asidx_from_attrs
  cpu: cache CPUClass in CPUState for hot code paths

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Branch: master
Stefan Hajnoczi 2022-10-05 10:17:02 -04:00
Commit: 4a9c04672a
55 changed files with 915 additions and 462 deletions
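As an orientation aid before the diffs (not part of the pull itself), here is a minimal sketch of how a target's tlb_fill hook could drive the tlb_set_page_full()/CPUTLBEntryFull interface added by this series. The HypotheticalPTE type and the walk_page_table()/raise_mmu_exception() helpers are invented for illustration; only the CPUTLBEntryFull fields and the tlb_set_page_full() signature come from the declarations in this pull.

/*
 * Illustrative only: a hypothetical target tlb_fill using the new API.
 */
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"

bool hypothetical_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
                               MMUAccessType access_type, int mmu_idx,
                               bool probe, uintptr_t retaddr)
{
    HypotheticalPTE pte;                     /* invented page-table result */

    if (!walk_page_table(cs, addr, access_type, mmu_idx, &pte)) {
        if (probe) {
            return false;                    /* non-faulting probe: miss */
        }
        /* Raises the guest exception and longjmps out; does not return. */
        raise_mmu_exception(cs, addr, access_type, retaddr);
    }

    CPUTLBEntryFull full = {
        .phys_addr    = pte.paddr & TARGET_PAGE_MASK,
        .attrs        = MEMTXATTRS_UNSPECIFIED,
        .prot         = pte.prot,            /* PAGE_READ | PAGE_WRITE | ... */
        .lg_page_size = TARGET_PAGE_BITS,    /* log2 of the mapped page size */
    };
    /* xlat_section is filled in by the core code; see the cputlb.c hunks. */
    tlb_set_page_full(cs, mmu_idx, addr & TARGET_PAGE_MASK, &full);
    return true;
}

The generic tlb_set_page_with_attrs() wrapper in the cputlb.c hunks below now does essentially the same thing: it packs its arguments into a CPUTLBEntryFull and forwards to tlb_set_page_full().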


@ -21,6 +21,10 @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
{
}
void tcg_flush_jmp_cache(CPUState *cpu)
{
}
int probe_access_flags(CPUArchState *env, target_ulong addr,
MMUAccessType access_type, int mmu_idx,
bool nonfault, void **phost, uintptr_t retaddr)


@ -42,6 +42,7 @@
#include "sysemu/replay.h"
#include "sysemu/tcg.h"
#include "exec/helper-proto.h"
#include "tb-jmp-cache.h"
#include "tb-hash.h"
#include "tb-context.h"
#include "internal.h"
@ -174,7 +175,7 @@ struct tb_desc {
target_ulong pc;
target_ulong cs_base;
CPUArchState *env;
tb_page_addr_t phys_page1;
tb_page_addr_t page_addr0;
uint32_t flags;
uint32_t cflags;
uint32_t trace_vcpu_dstate;
@ -185,8 +186,8 @@ static bool tb_lookup_cmp(const void *p, const void *d)
const TranslationBlock *tb = p;
const struct tb_desc *desc = d;
if (tb->pc == desc->pc &&
tb->page_addr[0] == desc->phys_page1 &&
if ((TARGET_TB_PCREL || tb_pc(tb) == desc->pc) &&
tb->page_addr[0] == desc->page_addr0 &&
tb->cs_base == desc->cs_base &&
tb->flags == desc->flags &&
tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
@ -195,8 +196,8 @@ static bool tb_lookup_cmp(const void *p, const void *d)
if (tb->page_addr[1] == -1) {
return true;
} else {
tb_page_addr_t phys_page2;
target_ulong virt_page2;
tb_page_addr_t phys_page1;
target_ulong virt_page1;
/*
* We know that the first page matched, and an otherwise valid TB
@ -207,9 +208,9 @@ static bool tb_lookup_cmp(const void *p, const void *d)
* is different for the new TB. Therefore any exception raised
* here by the faulting lookup is not premature.
*/
virt_page2 = TARGET_PAGE_ALIGN(desc->pc);
phys_page2 = get_page_addr_code(desc->env, virt_page2);
if (tb->page_addr[1] == phys_page2) {
virt_page1 = TARGET_PAGE_ALIGN(desc->pc);
phys_page1 = get_page_addr_code(desc->env, virt_page1);
if (tb->page_addr[1] == phys_page1) {
return true;
}
}
@ -235,8 +236,9 @@ static TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
if (phys_pc == -1) {
return NULL;
}
desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
h = tb_hash_func(phys_pc, pc, flags, cflags, *cpu->trace_dstate);
desc.page_addr0 = phys_pc;
h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : pc),
flags, cflags, *cpu->trace_dstate);
return qht_lookup_custom(&tb_ctx.htable, &desc, h, tb_lookup_cmp);
}
@ -246,16 +248,18 @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
uint32_t flags, uint32_t cflags)
{
TranslationBlock *tb;
CPUJumpCache *jc;
uint32_t hash;
/* we should never be trying to look up an INVALID tb */
tcg_debug_assert(!(cflags & CF_INVALID));
hash = tb_jmp_cache_hash_func(pc);
tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
jc = cpu->tb_jmp_cache;
tb = tb_jmp_cache_get_tb(jc, hash);
if (likely(tb &&
tb->pc == pc &&
tb_jmp_cache_get_pc(jc, hash, tb) == pc &&
tb->cs_base == cs_base &&
tb->flags == flags &&
tb->trace_vcpu_dstate == *cpu->trace_dstate &&
@ -266,16 +270,14 @@ static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
if (tb == NULL) {
return NULL;
}
qatomic_set(&cpu->tb_jmp_cache[hash], tb);
tb_jmp_cache_set(jc, hash, tb, pc);
return tb;
}
static inline void log_cpu_exec(target_ulong pc, CPUState *cpu,
const TranslationBlock *tb)
static void log_cpu_exec(target_ulong pc, CPUState *cpu,
const TranslationBlock *tb)
{
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_CPU | CPU_LOG_EXEC))
&& qemu_log_in_addr_range(pc)) {
if (qemu_log_in_addr_range(pc)) {
qemu_log_mask(CPU_LOG_EXEC,
"Trace %d: %p [" TARGET_FMT_lx
"/" TARGET_FMT_lx "/%08x/%08x] %s\n",
@ -399,7 +401,9 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
return tcg_code_gen_epilogue;
}
log_cpu_exec(pc, cpu, tb);
if (qemu_loglevel_mask(CPU_LOG_TB_CPU | CPU_LOG_EXEC)) {
log_cpu_exec(pc, cpu, tb);
}
return tb->tc.ptr;
}
@ -422,7 +426,9 @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
TranslationBlock *last_tb;
const void *tb_ptr = itb->tc.ptr;
log_cpu_exec(itb->pc, cpu, itb);
if (qemu_loglevel_mask(CPU_LOG_TB_CPU | CPU_LOG_EXEC)) {
log_cpu_exec(log_pc(cpu, itb), cpu, itb);
}
qemu_thread_jit_execute();
ret = tcg_qemu_tb_exec(env, tb_ptr);
@ -446,16 +452,21 @@ cpu_tb_exec(CPUState *cpu, TranslationBlock *itb, int *tb_exit)
* of the start of the TB.
*/
CPUClass *cc = CPU_GET_CLASS(cpu);
qemu_log_mask_and_addr(CPU_LOG_EXEC, last_tb->pc,
"Stopped execution of TB chain before %p ["
TARGET_FMT_lx "] %s\n",
last_tb->tc.ptr, last_tb->pc,
lookup_symbol(last_tb->pc));
if (cc->tcg_ops->synchronize_from_tb) {
cc->tcg_ops->synchronize_from_tb(cpu, last_tb);
} else {
assert(!TARGET_TB_PCREL);
assert(cc->set_pc);
cc->set_pc(cpu, last_tb->pc);
cc->set_pc(cpu, tb_pc(last_tb));
}
if (qemu_loglevel_mask(CPU_LOG_EXEC)) {
target_ulong pc = log_pc(cpu, last_tb);
if (qemu_log_in_addr_range(pc)) {
qemu_log("Stopped execution of TB chain before %p ["
TARGET_FMT_lx "] %s\n",
last_tb->tc.ptr, pc, lookup_symbol(pc));
}
}
}
@ -597,11 +608,8 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
qemu_spin_unlock(&tb_next->jmp_lock);
qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
"Linking TBs %p [" TARGET_FMT_lx
"] index %d -> %p [" TARGET_FMT_lx "]\n",
tb->tc.ptr, tb->pc, n,
tb_next->tc.ptr, tb_next->pc);
qemu_log_mask(CPU_LOG_EXEC, "Linking TBs %p index %d -> %p\n",
tb->tc.ptr, n, tb_next->tc.ptr);
return;
out_unlock_next:
@ -847,11 +855,12 @@ static inline bool cpu_handle_interrupt(CPUState *cpu,
}
static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
target_ulong pc,
TranslationBlock **last_tb, int *tb_exit)
{
int32_t insns_left;
trace_exec_tb(tb, tb->pc);
trace_exec_tb(tb, pc);
tb = cpu_tb_exec(cpu, tb, tb_exit);
if (*tb_exit != TB_EXIT_REQUESTED) {
*last_tb = tb;
@ -987,6 +996,8 @@ int cpu_exec(CPUState *cpu)
tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
if (tb == NULL) {
uint32_t h;
mmap_lock();
tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
mmap_unlock();
@ -994,7 +1005,8 @@ int cpu_exec(CPUState *cpu)
* We add the TB in the virtual pc hash table
* for the fast lookup
*/
qatomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
h = tb_jmp_cache_hash_func(pc);
tb_jmp_cache_set(cpu->tb_jmp_cache, h, tb, pc);
}
#ifndef CONFIG_USER_ONLY
@ -1013,7 +1025,7 @@ int cpu_exec(CPUState *cpu)
tb_add_jump(last_tb, tb_exit, tb);
}
cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit);
cpu_loop_exec_tb(cpu, tb, pc, &last_tb, &tb_exit);
/* Try to align the host and virtual clocks
if the guest is in advance */


@ -100,21 +100,14 @@ static void tlb_window_reset(CPUTLBDesc *desc, int64_t ns,
static void tb_jmp_cache_clear_page(CPUState *cpu, target_ulong page_addr)
{
unsigned int i, i0 = tb_jmp_cache_hash_page(page_addr);
int i, i0 = tb_jmp_cache_hash_page(page_addr);
CPUJumpCache *jc = cpu->tb_jmp_cache;
for (i = 0; i < TB_JMP_PAGE_SIZE; i++) {
qatomic_set(&cpu->tb_jmp_cache[i0 + i], NULL);
qatomic_set(&jc->array[i0 + i].tb, NULL);
}
}
static void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr)
{
/* Discard jump cache entries for any tb which might potentially
overlap the flushed page. */
tb_jmp_cache_clear_page(cpu, addr - TARGET_PAGE_SIZE);
tb_jmp_cache_clear_page(cpu, addr);
}
/**
* tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
* @desc: The CPUTLBDesc portion of the TLB
@ -200,13 +193,13 @@ static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast,
}
g_free(fast->table);
g_free(desc->iotlb);
g_free(desc->fulltlb);
tlb_window_reset(desc, now, 0);
/* desc->n_used_entries is cleared by the caller */
fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
fast->table = g_try_new(CPUTLBEntry, new_size);
desc->iotlb = g_try_new(CPUIOTLBEntry, new_size);
desc->fulltlb = g_try_new(CPUTLBEntryFull, new_size);
/*
* If the allocations fail, try smaller sizes. We just freed some
@ -215,7 +208,7 @@ static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast,
* allocations to fail though, so we progressively reduce the allocation
* size, aborting if we cannot even allocate the smallest TLB we support.
*/
while (fast->table == NULL || desc->iotlb == NULL) {
while (fast->table == NULL || desc->fulltlb == NULL) {
if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
error_report("%s: %s", __func__, strerror(errno));
abort();
@ -224,9 +217,9 @@ static void tlb_mmu_resize_locked(CPUTLBDesc *desc, CPUTLBDescFast *fast,
fast->mask = (new_size - 1) << CPU_TLB_ENTRY_BITS;
g_free(fast->table);
g_free(desc->iotlb);
g_free(desc->fulltlb);
fast->table = g_try_new(CPUTLBEntry, new_size);
desc->iotlb = g_try_new(CPUIOTLBEntry, new_size);
desc->fulltlb = g_try_new(CPUTLBEntryFull, new_size);
}
}
@ -258,7 +251,7 @@ static void tlb_mmu_init(CPUTLBDesc *desc, CPUTLBDescFast *fast, int64_t now)
desc->n_used_entries = 0;
fast->mask = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
fast->table = g_new(CPUTLBEntry, n_entries);
desc->iotlb = g_new(CPUIOTLBEntry, n_entries);
desc->fulltlb = g_new(CPUTLBEntryFull, n_entries);
tlb_mmu_flush_locked(desc, fast);
}
@ -299,7 +292,7 @@ void tlb_destroy(CPUState *cpu)
CPUTLBDescFast *fast = &env_tlb(env)->f[i];
g_free(fast->table);
g_free(desc->iotlb);
g_free(desc->fulltlb);
}
}
@ -364,7 +357,7 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
qemu_spin_unlock(&env_tlb(env)->c.lock);
cpu_tb_jmp_cache_clear(cpu);
tcg_flush_jmp_cache(cpu);
if (to_clean == ALL_MMUIDX_BITS) {
qatomic_set(&env_tlb(env)->c.full_flush_count,
@ -541,7 +534,12 @@ static void tlb_flush_page_by_mmuidx_async_0(CPUState *cpu,
}
qemu_spin_unlock(&env_tlb(env)->c.lock);
tb_flush_jmp_cache(cpu, addr);
/*
* Discard jump cache entries for any tb which might potentially
* overlap the flushed page, which includes the previous.
*/
tb_jmp_cache_clear_page(cpu, addr - TARGET_PAGE_SIZE);
tb_jmp_cache_clear_page(cpu, addr);
}
/**
@ -788,12 +786,18 @@ static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu,
* longer to clear each entry individually than it will to clear it all.
*/
if (d.len >= (TARGET_PAGE_SIZE * TB_JMP_CACHE_SIZE)) {
cpu_tb_jmp_cache_clear(cpu);
tcg_flush_jmp_cache(cpu);
return;
}
for (target_ulong i = 0; i < d.len; i += TARGET_PAGE_SIZE) {
tb_flush_jmp_cache(cpu, d.addr + i);
/*
* Discard jump cache entries for any tb which might potentially
* overlap the flushed pages, which includes the previous.
*/
d.addr -= TARGET_PAGE_SIZE;
for (target_ulong i = 0, n = d.len / TARGET_PAGE_SIZE + 1; i < n; i++) {
tb_jmp_cache_clear_page(cpu, d.addr);
d.addr += TARGET_PAGE_SIZE;
}
}
@ -951,7 +955,8 @@ void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
can be detected */
void tlb_protect_code(ram_addr_t ram_addr)
{
cpu_physical_memory_test_and_clear_dirty(ram_addr, TARGET_PAGE_SIZE,
cpu_physical_memory_test_and_clear_dirty(ram_addr & TARGET_PAGE_MASK,
TARGET_PAGE_SIZE,
DIRTY_MEMORY_CODE);
}
@ -1095,16 +1100,16 @@ static void tlb_add_large_page(CPUArchState *env, int mmu_idx,
env_tlb(env)->d[mmu_idx].large_page_mask = lp_mask;
}
/* Add a new TLB entry. At most one entry for a given virtual address
/*
* Add a new TLB entry. At most one entry for a given virtual address
* is permitted. Only a single TARGET_PAGE_SIZE region is mapped, the
* supplied size is only used by tlb_flush_page.
*
* Called from TCG-generated code, which is under an RCU read-side
* critical section.
*/
void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
hwaddr paddr, MemTxAttrs attrs, int prot,
int mmu_idx, target_ulong size)
void tlb_set_page_full(CPUState *cpu, int mmu_idx,
target_ulong vaddr, CPUTLBEntryFull *full)
{
CPUArchState *env = cpu->env_ptr;
CPUTLB *tlb = env_tlb(env);
@ -1117,35 +1122,36 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
CPUTLBEntry *te, tn;
hwaddr iotlb, xlat, sz, paddr_page;
target_ulong vaddr_page;
int asidx = cpu_asidx_from_attrs(cpu, attrs);
int wp_flags;
int asidx, wp_flags, prot;
bool is_ram, is_romd;
assert_cpu_is_self(cpu);
if (size <= TARGET_PAGE_SIZE) {
if (full->lg_page_size <= TARGET_PAGE_BITS) {
sz = TARGET_PAGE_SIZE;
} else {
tlb_add_large_page(env, mmu_idx, vaddr, size);
sz = size;
sz = (hwaddr)1 << full->lg_page_size;
tlb_add_large_page(env, mmu_idx, vaddr, sz);
}
vaddr_page = vaddr & TARGET_PAGE_MASK;
paddr_page = paddr & TARGET_PAGE_MASK;
paddr_page = full->phys_addr & TARGET_PAGE_MASK;
prot = full->prot;
asidx = cpu_asidx_from_attrs(cpu, full->attrs);
section = address_space_translate_for_iotlb(cpu, asidx, paddr_page,
&xlat, &sz, attrs, &prot);
&xlat, &sz, full->attrs, &prot);
assert(sz >= TARGET_PAGE_SIZE);
tlb_debug("vaddr=" TARGET_FMT_lx " paddr=0x" TARGET_FMT_plx
" prot=%x idx=%d\n",
vaddr, paddr, prot, mmu_idx);
vaddr, full->phys_addr, prot, mmu_idx);
address = vaddr_page;
if (size < TARGET_PAGE_SIZE) {
if (full->lg_page_size < TARGET_PAGE_BITS) {
/* Repeat the MMU check and TLB fill on every access. */
address |= TLB_INVALID_MASK;
}
if (attrs.byte_swap) {
if (full->attrs.byte_swap) {
address |= TLB_BSWAP;
}
@ -1219,7 +1225,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
/* Evict the old entry into the victim tlb. */
copy_tlb_helper_locked(tv, te);
desc->viotlb[vidx] = desc->iotlb[index];
desc->vfulltlb[vidx] = desc->fulltlb[index];
tlb_n_used_entries_dec(env, mmu_idx);
}
@ -1236,8 +1242,10 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
* subtract here is that of the page base, and not the same as the
* vaddr we add back in io_readx()/io_writex()/get_page_addr_code().
*/
desc->iotlb[index].addr = iotlb - vaddr_page;
desc->iotlb[index].attrs = attrs;
desc->fulltlb[index] = *full;
desc->fulltlb[index].xlat_section = iotlb - vaddr_page;
desc->fulltlb[index].phys_addr = paddr_page;
desc->fulltlb[index].prot = prot;
/* Now calculate the new entry */
tn.addend = addend - vaddr_page;
@ -1272,9 +1280,21 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
qemu_spin_unlock(&tlb->c.lock);
}
/* Add a new TLB entry, but without specifying the memory
* transaction attributes to be used.
*/
void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
hwaddr paddr, MemTxAttrs attrs, int prot,
int mmu_idx, target_ulong size)
{
CPUTLBEntryFull full = {
.phys_addr = paddr,
.attrs = attrs,
.prot = prot,
.lg_page_size = ctz64(size)
};
assert(is_power_of_2(size));
tlb_set_page_full(cpu, mmu_idx, vaddr, &full);
}
void tlb_set_page(CPUState *cpu, target_ulong vaddr,
hwaddr paddr, int prot,
int mmu_idx, target_ulong size)
@ -1291,15 +1311,14 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
bool ok;
/*
* This is not a probe, so only valid return is success; failure
* should result in exception + longjmp to the cpu loop.
*/
ok = cc->tcg_ops->tlb_fill(cpu, addr, size,
access_type, mmu_idx, false, retaddr);
ok = cpu->cc->tcg_ops->tlb_fill(cpu, addr, size,
access_type, mmu_idx, false, retaddr);
assert(ok);
}
@ -1307,9 +1326,8 @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
MMUAccessType access_type,
int mmu_idx, uintptr_t retaddr)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
cc->tcg_ops->do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
cpu->cc->tcg_ops->do_unaligned_access(cpu, addr, access_type,
mmu_idx, retaddr);
}
static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
@ -1329,7 +1347,7 @@ static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
}
}
static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
static uint64_t io_readx(CPUArchState *env, CPUTLBEntryFull *full,
int mmu_idx, target_ulong addr, uintptr_t retaddr,
MMUAccessType access_type, MemOp op)
{
@ -1341,9 +1359,9 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
bool locked = false;
MemTxResult r;
section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
mr = section->mr;
mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
cpu->mem_io_pc = retaddr;
if (!cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
@ -1353,14 +1371,14 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
qemu_mutex_lock_iothread();
locked = true;
}
r = memory_region_dispatch_read(mr, mr_offset, &val, op, iotlbentry->attrs);
r = memory_region_dispatch_read(mr, mr_offset, &val, op, full->attrs);
if (r != MEMTX_OK) {
hwaddr physaddr = mr_offset +
section->offset_within_address_space -
section->offset_within_region;
cpu_transaction_failed(cpu, physaddr, addr, memop_size(op), access_type,
mmu_idx, iotlbentry->attrs, r, retaddr);
mmu_idx, full->attrs, r, retaddr);
}
if (locked) {
qemu_mutex_unlock_iothread();
@ -1370,22 +1388,21 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
}
/*
* Save a potentially trashed IOTLB entry for later lookup by plugin.
* This is read by tlb_plugin_lookup if the iotlb entry doesn't match
* Save a potentially trashed CPUTLBEntryFull for later lookup by plugin.
* This is read by tlb_plugin_lookup if the fulltlb entry doesn't match
* because of the side effect of io_writex changing memory layout.
*/
static void save_iotlb_data(CPUState *cs, hwaddr addr,
MemoryRegionSection *section, hwaddr mr_offset)
static void save_iotlb_data(CPUState *cs, MemoryRegionSection *section,
hwaddr mr_offset)
{
#ifdef CONFIG_PLUGIN
SavedIOTLB *saved = &cs->saved_iotlb;
saved->addr = addr;
saved->section = section;
saved->mr_offset = mr_offset;
#endif
}
static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
static void io_writex(CPUArchState *env, CPUTLBEntryFull *full,
int mmu_idx, uint64_t val, target_ulong addr,
uintptr_t retaddr, MemOp op)
{
@ -1396,9 +1413,9 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
bool locked = false;
MemTxResult r;
section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
mr = section->mr;
mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
if (!cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
@ -1408,20 +1425,20 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
* The memory_region_dispatch may trigger a flush/resize
* so for plugins we save the iotlb_data just in case.
*/
save_iotlb_data(cpu, iotlbentry->addr, section, mr_offset);
save_iotlb_data(cpu, section, mr_offset);
if (!qemu_mutex_iothread_locked()) {
qemu_mutex_lock_iothread();
locked = true;
}
r = memory_region_dispatch_write(mr, mr_offset, val, op, iotlbentry->attrs);
r = memory_region_dispatch_write(mr, mr_offset, val, op, full->attrs);
if (r != MEMTX_OK) {
hwaddr physaddr = mr_offset +
section->offset_within_address_space -
section->offset_within_region;
cpu_transaction_failed(cpu, physaddr, addr, memop_size(op),
MMU_DATA_STORE, mmu_idx, iotlbentry->attrs, r,
MMU_DATA_STORE, mmu_idx, full->attrs, r,
retaddr);
}
if (locked) {
@ -1468,9 +1485,10 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
copy_tlb_helper_locked(vtlb, &tmptlb);
qemu_spin_unlock(&env_tlb(env)->c.lock);
CPUIOTLBEntry tmpio, *io = &env_tlb(env)->d[mmu_idx].iotlb[index];
CPUIOTLBEntry *vio = &env_tlb(env)->d[mmu_idx].viotlb[vidx];
tmpio = *io; *io = *vio; *vio = tmpio;
CPUTLBEntryFull *f1 = &env_tlb(env)->d[mmu_idx].fulltlb[index];
CPUTLBEntryFull *f2 = &env_tlb(env)->d[mmu_idx].vfulltlb[vidx];
CPUTLBEntryFull tmpf;
tmpf = *f1; *f1 = *f2; *f2 = tmpf;
return true;
}
}
@ -1483,9 +1501,9 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
(ADDR) & TARGET_PAGE_MASK)
static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
CPUIOTLBEntry *iotlbentry, uintptr_t retaddr)
CPUTLBEntryFull *full, uintptr_t retaddr)
{
ram_addr_t ram_addr = mem_vaddr + iotlbentry->addr;
ram_addr_t ram_addr = mem_vaddr + full->xlat_section;
trace_memory_notdirty_write_access(mem_vaddr, ram_addr, size);
@ -1512,7 +1530,8 @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
static int probe_access_internal(CPUArchState *env, target_ulong addr,
int fault_size, MMUAccessType access_type,
int mmu_idx, bool nonfault,
void **phost, uintptr_t retaddr)
void **phost, CPUTLBEntryFull **pfull,
uintptr_t retaddr)
{
uintptr_t index = tlb_index(env, mmu_idx, addr);
CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
@ -1535,25 +1554,36 @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
}
tlb_addr = tlb_read_ofs(entry, elt_ofs);
flags = TLB_FLAGS_MASK;
page_addr = addr & TARGET_PAGE_MASK;
if (!tlb_hit_page(tlb_addr, page_addr)) {
if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) {
CPUState *cs = env_cpu(env);
CPUClass *cc = CPU_GET_CLASS(cs);
if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size, access_type,
mmu_idx, nonfault, retaddr)) {
if (!cs->cc->tcg_ops->tlb_fill(cs, addr, fault_size, access_type,
mmu_idx, nonfault, retaddr)) {
/* Non-faulting page table read failed. */
*phost = NULL;
*pfull = NULL;
return TLB_INVALID_MASK;
}
/* TLB resize via tlb_fill may have moved the entry. */
index = tlb_index(env, mmu_idx, addr);
entry = tlb_entry(env, mmu_idx, addr);
/*
* With PAGE_WRITE_INV, we set TLB_INVALID_MASK immediately,
* to force the next access through tlb_fill. We've just
* called tlb_fill, so we know that this entry *is* valid.
*/
flags &= ~TLB_INVALID_MASK;
}
tlb_addr = tlb_read_ofs(entry, elt_ofs);
}
flags = tlb_addr & TLB_FLAGS_MASK;
flags &= tlb_addr;
*pfull = &env_tlb(env)->d[mmu_idx].fulltlb[index];
/* Fold all "mmio-like" bits into TLB_MMIO. This is not RAM. */
if (unlikely(flags & ~(TLB_WATCHPOINT | TLB_NOTDIRTY))) {
@ -1566,37 +1596,44 @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
return flags;
}
int probe_access_flags(CPUArchState *env, target_ulong addr,
MMUAccessType access_type, int mmu_idx,
bool nonfault, void **phost, uintptr_t retaddr)
int probe_access_full(CPUArchState *env, target_ulong addr,
MMUAccessType access_type, int mmu_idx,
bool nonfault, void **phost, CPUTLBEntryFull **pfull,
uintptr_t retaddr)
{
int flags;
flags = probe_access_internal(env, addr, 0, access_type, mmu_idx,
nonfault, phost, retaddr);
int flags = probe_access_internal(env, addr, 0, access_type, mmu_idx,
nonfault, phost, pfull, retaddr);
/* Handle clean RAM pages. */
if (unlikely(flags & TLB_NOTDIRTY)) {
uintptr_t index = tlb_index(env, mmu_idx, addr);
CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
notdirty_write(env_cpu(env), addr, 1, iotlbentry, retaddr);
notdirty_write(env_cpu(env), addr, 1, *pfull, retaddr);
flags &= ~TLB_NOTDIRTY;
}
return flags;
}
int probe_access_flags(CPUArchState *env, target_ulong addr,
MMUAccessType access_type, int mmu_idx,
bool nonfault, void **phost, uintptr_t retaddr)
{
CPUTLBEntryFull *full;
return probe_access_full(env, addr, access_type, mmu_idx,
nonfault, phost, &full, retaddr);
}
void *probe_access(CPUArchState *env, target_ulong addr, int size,
MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
{
CPUTLBEntryFull *full;
void *host;
int flags;
g_assert(-(addr | TARGET_PAGE_MASK) >= size);
flags = probe_access_internal(env, addr, size, access_type, mmu_idx,
false, &host, retaddr);
false, &host, &full, retaddr);
/* Per the interface, size == 0 merely faults the access. */
if (size == 0) {
@ -1604,20 +1641,17 @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
}
if (unlikely(flags & (TLB_NOTDIRTY | TLB_WATCHPOINT))) {
uintptr_t index = tlb_index(env, mmu_idx, addr);
CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
/* Handle watchpoints. */
if (flags & TLB_WATCHPOINT) {
int wp_access = (access_type == MMU_DATA_STORE
? BP_MEM_WRITE : BP_MEM_READ);
cpu_check_watchpoint(env_cpu(env), addr, size,
iotlbentry->attrs, wp_access, retaddr);
full->attrs, wp_access, retaddr);
}
/* Handle clean RAM pages. */
if (flags & TLB_NOTDIRTY) {
notdirty_write(env_cpu(env), addr, 1, iotlbentry, retaddr);
notdirty_write(env_cpu(env), addr, 1, full, retaddr);
}
}
@ -1627,11 +1661,12 @@ void *probe_access(CPUArchState *env, target_ulong addr, int size,
void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
MMUAccessType access_type, int mmu_idx)
{
CPUTLBEntryFull *full;
void *host;
int flags;
flags = probe_access_internal(env, addr, 0, access_type,
mmu_idx, true, &host, 0);
mmu_idx, true, &host, &full, 0);
/* No combination of flags are expected by the caller. */
return flags ? NULL : host;
@ -1650,10 +1685,11 @@ void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
void **hostp)
{
CPUTLBEntryFull *full;
void *p;
(void)probe_access_internal(env, addr, 1, MMU_INST_FETCH,
cpu_mmu_index(env, true), false, &p, 0);
cpu_mmu_index(env, true), false, &p, &full, 0);
if (p == NULL) {
return -1;
}
@ -1674,7 +1710,7 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
* should have just filled the TLB. The one corner case is io_writex
* which can cause TLB flushes and potential resizing of the TLBs
* losing the information we need. In those cases we need to recover
* data from a copy of the iotlbentry. As long as this always occurs
* data from a copy of the CPUTLBEntryFull. As long as this always occurs
* from the same thread (which a mem callback will be) this is safe.
*/
@ -1689,11 +1725,12 @@ bool tlb_plugin_lookup(CPUState *cpu, target_ulong addr, int mmu_idx,
if (likely(tlb_hit(tlb_addr, addr))) {
/* We must have an iotlb entry for MMIO */
if (tlb_addr & TLB_MMIO) {
CPUIOTLBEntry *iotlbentry;
iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
CPUTLBEntryFull *full;
full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
data->is_io = true;
data->v.io.section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
data->v.io.offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
data->v.io.section =
iotlb_to_section(cpu, full->xlat_section, full->attrs);
data->v.io.offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
} else {
data->is_io = false;
data->v.ram.hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
@ -1801,7 +1838,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
if (unlikely(tlb_addr & TLB_NOTDIRTY)) {
notdirty_write(env_cpu(env), addr, size,
&env_tlb(env)->d[mmu_idx].iotlb[index], retaddr);
&env_tlb(env)->d[mmu_idx].fulltlb[index], retaddr);
}
return hostaddr;
@ -1909,7 +1946,7 @@ load_helper(CPUArchState *env, target_ulong addr, MemOpIdx oi,
/* Handle anything that isn't just a straight memory access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
CPUTLBEntryFull *full;
bool need_swap;
/* For anything that is unaligned, recurse through full_load. */
@ -1917,20 +1954,20 @@ load_helper(CPUArchState *env, target_ulong addr, MemOpIdx oi,
goto do_unaligned_access;
}
iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
/* Handle watchpoints. */
if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
/* On watchpoint hit, this will longjmp out. */
cpu_check_watchpoint(env_cpu(env), addr, size,
iotlbentry->attrs, BP_MEM_READ, retaddr);
full->attrs, BP_MEM_READ, retaddr);
}
need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
/* Handle I/O access. */
if (likely(tlb_addr & TLB_MMIO)) {
return io_readx(env, iotlbentry, mmu_idx, addr, retaddr,
return io_readx(env, full, mmu_idx, addr, retaddr,
access_type, op ^ (need_swap * MO_BSWAP));
}
@ -2245,12 +2282,12 @@ store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
*/
if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
cpu_check_watchpoint(env_cpu(env), addr, size - size2,
env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
env_tlb(env)->d[mmu_idx].fulltlb[index].attrs,
BP_MEM_WRITE, retaddr);
}
if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
cpu_check_watchpoint(env_cpu(env), page2, size2,
env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
env_tlb(env)->d[mmu_idx].fulltlb[index2].attrs,
BP_MEM_WRITE, retaddr);
}
@ -2314,7 +2351,7 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
/* Handle anything that isn't just a straight memory access. */
if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
CPUIOTLBEntry *iotlbentry;
CPUTLBEntryFull *full;
bool need_swap;
/* For anything that is unaligned, recurse through byte stores. */
@ -2322,20 +2359,20 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
goto do_unaligned_access;
}
iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
/* Handle watchpoints. */
if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
/* On watchpoint hit, this will longjmp out. */
cpu_check_watchpoint(env_cpu(env), addr, size,
iotlbentry->attrs, BP_MEM_WRITE, retaddr);
full->attrs, BP_MEM_WRITE, retaddr);
}
need_swap = size > 1 && (tlb_addr & TLB_BSWAP);
/* Handle I/O access. */
if (tlb_addr & TLB_MMIO) {
io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
io_writex(env, full, mmu_idx, val, addr, retaddr,
op ^ (need_swap * MO_BSWAP));
return;
}
@ -2347,7 +2384,7 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
/* Handle clean RAM pages. */
if (tlb_addr & TLB_NOTDIRTY) {
notdirty_write(env_cpu(env), addr, size, iotlbentry, retaddr);
notdirty_write(env_cpu(env), addr, size, full, retaddr);
}
haddr = (void *)((uintptr_t)addr + entry->addend);


@ -18,4 +18,14 @@ G_NORETURN void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
void page_init(void);
void tb_htable_init(void);
/* Return the current PC from CPU, which may be cached in TB. */
static inline target_ulong log_pc(CPUState *cpu, const TranslationBlock *tb)
{
#if TARGET_TB_PCREL
return cpu->cc->get_pc(cpu);
#else
return tb_pc(tb);
#endif
}
#endif /* ACCEL_TCG_INTERNAL_H */


@ -852,7 +852,8 @@ static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb)
pr_ops();
}
bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool mem_only)
bool plugin_gen_tb_start(CPUState *cpu, const DisasContextBase *db,
bool mem_only)
{
bool ret = false;
@ -870,9 +871,9 @@ bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool mem_onl
ret = true;
ptb->vaddr = tb->pc;
ptb->vaddr = db->pc_first;
ptb->vaddr2 = -1;
get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1);
ptb->haddr1 = db->host_addr[0];
ptb->haddr2 = NULL;
ptb->mem_only = mem_only;
@ -898,16 +899,15 @@ void plugin_gen_insn_start(CPUState *cpu, const DisasContextBase *db)
* Note that we skip this when haddr1 == NULL, e.g. when we're
* fetching instructions from a region not backed by RAM.
*/
if (likely(ptb->haddr1 != NULL && ptb->vaddr2 == -1) &&
unlikely((db->pc_next & TARGET_PAGE_MASK) !=
(db->pc_first & TARGET_PAGE_MASK))) {
get_page_addr_code_hostp(cpu->env_ptr, db->pc_next,
&ptb->haddr2);
ptb->vaddr2 = db->pc_next;
}
if (likely(ptb->vaddr2 == -1)) {
if (ptb->haddr1 == NULL) {
pinsn->haddr = NULL;
} else if (is_same_page(db, db->pc_next)) {
pinsn->haddr = ptb->haddr1 + pinsn->vaddr - ptb->vaddr;
} else {
if (ptb->vaddr2 == -1) {
ptb->vaddr2 = TARGET_PAGE_ALIGN(db->pc_first);
get_page_addr_code_hostp(cpu->env_ptr, ptb->vaddr2, &ptb->haddr2);
}
pinsn->haddr = ptb->haddr2 + pinsn->vaddr - ptb->vaddr2;
}
}


@ -23,6 +23,7 @@
#include "exec/cpu-defs.h"
#include "exec/exec-all.h"
#include "qemu/xxhash.h"
#include "tb-jmp-cache.h"
#ifdef CONFIG_SOFTMMU

accel/tcg/tb-jmp-cache.h (new file, 65 lines)

@ -0,0 +1,65 @@
/*
* The per-CPU TranslationBlock jump cache.
*
* Copyright (c) 2003 Fabrice Bellard
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
#ifndef ACCEL_TCG_TB_JMP_CACHE_H
#define ACCEL_TCG_TB_JMP_CACHE_H
#define TB_JMP_CACHE_BITS 12
#define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
/*
* Accessed in parallel; all accesses to 'tb' must be atomic.
* For TARGET_TB_PCREL, accesses to 'pc' must be protected by
* a load_acquire/store_release to 'tb'.
*/
struct CPUJumpCache {
struct {
TranslationBlock *tb;
#if TARGET_TB_PCREL
target_ulong pc;
#endif
} array[TB_JMP_CACHE_SIZE];
};
static inline TranslationBlock *
tb_jmp_cache_get_tb(CPUJumpCache *jc, uint32_t hash)
{
#if TARGET_TB_PCREL
/* Use acquire to ensure current load of pc from jc. */
return qatomic_load_acquire(&jc->array[hash].tb);
#else
/* Use rcu_read to ensure current load of pc from *tb. */
return qatomic_rcu_read(&jc->array[hash].tb);
#endif
}
static inline target_ulong
tb_jmp_cache_get_pc(CPUJumpCache *jc, uint32_t hash, TranslationBlock *tb)
{
#if TARGET_TB_PCREL
return jc->array[hash].pc;
#else
return tb_pc(tb);
#endif
}
static inline void
tb_jmp_cache_set(CPUJumpCache *jc, uint32_t hash,
TranslationBlock *tb, target_ulong pc)
{
#if TARGET_TB_PCREL
jc->array[hash].pc = pc;
/* Use store_release on tb to ensure pc is written first. */
qatomic_store_release(&jc->array[hash].tb, tb);
#else
/* Use the pc value already stored in tb->pc. */
qatomic_set(&jc->array[hash].tb, tb);
#endif
}
#endif /* ACCEL_TCG_TB_JMP_CACHE_H */


@ -58,6 +58,7 @@
#include "sysemu/tcg.h"
#include "qapi/error.h"
#include "hw/core/tcg-cpu-ops.h"
#include "tb-jmp-cache.h"
#include "tb-hash.h"
#include "tb-context.h"
#include "internal.h"
@ -102,21 +103,14 @@
#define assert_memory_lock() tcg_debug_assert(have_mmap_lock())
#endif
#define SMC_BITMAP_USE_THRESHOLD 10
typedef struct PageDesc {
/* list of TBs intersecting this ram page */
uintptr_t first_tb;
#ifdef CONFIG_SOFTMMU
/* in order to optimize self modifying code, we count the number
of lookups we do to a given page to use a bitmap */
unsigned long *code_bitmap;
unsigned int code_write_count;
#else
#ifdef CONFIG_USER_ONLY
unsigned long flags;
void *target_data;
#endif
#ifndef CONFIG_USER_ONLY
#ifdef CONFIG_SOFTMMU
QemuSpin lock;
#endif
} PageDesc;
@ -305,7 +299,7 @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
if (i == 0) {
prev = (j == 0 ? tb->pc : 0);
prev = (!TARGET_TB_PCREL && j == 0 ? tb_pc(tb) : 0);
} else {
prev = tcg_ctx->gen_insn_data[i - 1][j];
}
@ -333,7 +327,7 @@ static int encode_search(TranslationBlock *tb, uint8_t *block)
static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
uintptr_t searched_pc, bool reset_icount)
{
target_ulong data[TARGET_INSN_START_WORDS] = { tb->pc };
target_ulong data[TARGET_INSN_START_WORDS];
uintptr_t host_pc = (uintptr_t)tb->tc.ptr;
CPUArchState *env = cpu->env_ptr;
const uint8_t *p = tb->tc.ptr + tb->tc.size;
@ -349,6 +343,11 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
return -1;
}
memset(data, 0, sizeof(data));
if (!TARGET_TB_PCREL) {
data[0] = tb_pc(tb);
}
/* Reconstruct the stored insn data while looking for the point at
which the end of the insn exceeds the searched_pc. */
for (i = 0; i < num_insns; ++i) {
@ -472,7 +471,7 @@ void page_init(void)
#endif
}
static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
static PageDesc *page_find_alloc(tb_page_addr_t index, bool alloc)
{
PageDesc *pd;
void **lp;
@ -540,11 +539,11 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
static inline PageDesc *page_find(tb_page_addr_t index)
{
return page_find_alloc(index, 0);
return page_find_alloc(index, false);
}
static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
PageDesc **ret_p2, tb_page_addr_t phys2, int alloc);
PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc);
/* In user-mode page locks aren't used; mmap_lock is enough */
#ifdef CONFIG_USER_ONLY
@ -658,7 +657,7 @@ static inline void page_unlock(PageDesc *pd)
/* lock the page(s) of a TB in the correct acquisition order */
static inline void page_lock_tb(const TranslationBlock *tb)
{
page_lock_pair(NULL, tb->page_addr[0], NULL, tb->page_addr[1], 0);
page_lock_pair(NULL, tb->page_addr[0], NULL, tb->page_addr[1], false);
}
static inline void page_unlock_tb(const TranslationBlock *tb)
@ -847,7 +846,7 @@ void page_collection_unlock(struct page_collection *set)
#endif /* !CONFIG_USER_ONLY */
static void page_lock_pair(PageDesc **ret_p1, tb_page_addr_t phys1,
PageDesc **ret_p2, tb_page_addr_t phys2, int alloc)
PageDesc **ret_p2, tb_page_addr_t phys2, bool alloc)
{
PageDesc *p1, *p2;
tb_page_addr_t page1;
@ -891,13 +890,13 @@ static bool tb_cmp(const void *ap, const void *bp)
const TranslationBlock *a = ap;
const TranslationBlock *b = bp;
return a->pc == b->pc &&
a->cs_base == b->cs_base &&
a->flags == b->flags &&
(tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
a->trace_vcpu_dstate == b->trace_vcpu_dstate &&
a->page_addr[0] == b->page_addr[0] &&
a->page_addr[1] == b->page_addr[1];
return ((TARGET_TB_PCREL || tb_pc(a) == tb_pc(b)) &&
a->cs_base == b->cs_base &&
a->flags == b->flags &&
(tb_cflags(a) & ~CF_INVALID) == (tb_cflags(b) & ~CF_INVALID) &&
a->trace_vcpu_dstate == b->trace_vcpu_dstate &&
a->page_addr[0] == b->page_addr[0] &&
a->page_addr[1] == b->page_addr[1]);
}
void tb_htable_init(void)
@ -907,17 +906,6 @@ void tb_htable_init(void)
qht_init(&tb_ctx.htable, tb_cmp, CODE_GEN_HTABLE_SIZE, mode);
}
/* call with @p->lock held */
static inline void invalidate_page_bitmap(PageDesc *p)
{
assert_page_locked(p);
#ifdef CONFIG_SOFTMMU
g_free(p->code_bitmap);
p->code_bitmap = NULL;
p->code_write_count = 0;
#endif
}
/* Set to NULL all the 'first_tb' fields in all PageDescs. */
static void page_flush_tb_1(int level, void **lp)
{
@ -932,7 +920,6 @@ static void page_flush_tb_1(int level, void **lp)
for (i = 0; i < V_L2_SIZE; ++i) {
page_lock(&pd[i]);
pd[i].first_tb = (uintptr_t)NULL;
invalidate_page_bitmap(pd + i);
page_unlock(&pd[i]);
}
} else {
@ -986,7 +973,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
}
CPU_FOREACH(cpu) {
cpu_tb_jmp_cache_clear(cpu);
tcg_flush_jmp_cache(cpu);
}
qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
@ -1031,9 +1018,10 @@ static void do_tb_invalidate_check(void *p, uint32_t hash, void *userp)
TranslationBlock *tb = p;
target_ulong addr = *(target_ulong *)userp;
if (!(addr + TARGET_PAGE_SIZE <= tb->pc || addr >= tb->pc + tb->size)) {
if (!(addr + TARGET_PAGE_SIZE <= tb_pc(tb) ||
addr >= tb_pc(tb) + tb->size)) {
printf("ERROR invalidate: address=" TARGET_FMT_lx
" PC=%08lx size=%04x\n", addr, (long)tb->pc, tb->size);
" PC=%08lx size=%04x\n", addr, (long)tb_pc(tb), tb->size);
}
}
@ -1052,11 +1040,11 @@ static void do_tb_page_check(void *p, uint32_t hash, void *userp)
TranslationBlock *tb = p;
int flags1, flags2;
flags1 = page_get_flags(tb->pc);
flags2 = page_get_flags(tb->pc + tb->size - 1);
flags1 = page_get_flags(tb_pc(tb));
flags2 = page_get_flags(tb_pc(tb) + tb->size - 1);
if ((flags1 & PAGE_WRITE) || (flags2 & PAGE_WRITE)) {
printf("ERROR page flags: PC=%08lx size=%04x f1=%x f2=%x\n",
(long)tb->pc, tb->size, flags1, flags2);
(long)tb_pc(tb), tb->size, flags1, flags2);
}
}
@ -1165,6 +1153,28 @@ static inline void tb_jmp_unlink(TranslationBlock *dest)
qemu_spin_unlock(&dest->jmp_lock);
}
static void tb_jmp_cache_inval_tb(TranslationBlock *tb)
{
CPUState *cpu;
if (TARGET_TB_PCREL) {
/* A TB may be at any virtual address */
CPU_FOREACH(cpu) {
tcg_flush_jmp_cache(cpu);
}
} else {
uint32_t h = tb_jmp_cache_hash_func(tb_pc(tb));
CPU_FOREACH(cpu) {
CPUJumpCache *jc = cpu->tb_jmp_cache;
if (qatomic_read(&jc->array[h].tb) == tb) {
qatomic_set(&jc->array[h].tb, NULL);
}
}
}
}
/*
* In user-mode, call with mmap_lock held.
* In !user-mode, if @rm_from_page_list is set, call with the TB's pages'
@ -1172,7 +1182,6 @@ static inline void tb_jmp_unlink(TranslationBlock *dest)
*/
static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
{
CPUState *cpu;
PageDesc *p;
uint32_t h;
tb_page_addr_t phys_pc;
@ -1186,9 +1195,9 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
qemu_spin_unlock(&tb->jmp_lock);
/* remove the TB from the hash list */
phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
h = tb_hash_func(phys_pc, tb->pc, tb->flags, orig_cflags,
tb->trace_vcpu_dstate);
phys_pc = tb->page_addr[0];
h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
tb->flags, orig_cflags, tb->trace_vcpu_dstate);
if (!qht_remove(&tb_ctx.htable, tb, h)) {
return;
}
@ -1197,21 +1206,14 @@ static void do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
if (rm_from_page_list) {
p = page_find(tb->page_addr[0] >> TARGET_PAGE_BITS);
tb_page_remove(p, tb);
invalidate_page_bitmap(p);
if (tb->page_addr[1] != -1) {
p = page_find(tb->page_addr[1] >> TARGET_PAGE_BITS);
tb_page_remove(p, tb);
invalidate_page_bitmap(p);
}
}
/* remove the TB from the hash list */
h = tb_jmp_cache_hash_func(tb->pc);
CPU_FOREACH(cpu) {
if (qatomic_read(&cpu->tb_jmp_cache[h]) == tb) {
qatomic_set(&cpu->tb_jmp_cache[h], NULL);
}
}
tb_jmp_cache_inval_tb(tb);
/* suppress this TB from the two jump lists */
tb_remove_from_jmp_list(tb, 0);
@ -1246,35 +1248,6 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
}
}
#ifdef CONFIG_SOFTMMU
/* call with @p->lock held */
static void build_page_bitmap(PageDesc *p)
{
int n, tb_start, tb_end;
TranslationBlock *tb;
assert_page_locked(p);
p->code_bitmap = bitmap_new(TARGET_PAGE_SIZE);
PAGE_FOR_EACH_TB(p, tb, n) {
/* NOTE: this is subtle as a TB may span two physical pages */
if (n == 0) {
/* NOTE: tb_end may be after the end of the page, but
it is not a problem */
tb_start = tb->pc & ~TARGET_PAGE_MASK;
tb_end = tb_start + tb->size;
if (tb_end > TARGET_PAGE_SIZE) {
tb_end = TARGET_PAGE_SIZE;
}
} else {
tb_start = 0;
tb_end = ((tb->pc + tb->size) & ~TARGET_PAGE_MASK);
}
bitmap_set(p->code_bitmap, tb_start, tb_end - tb_start);
}
}
#endif
/* add the tb in the target page and protect it if necessary
*
* Called with mmap_lock held for user-mode emulation.
@ -1295,7 +1268,6 @@ static inline void tb_page_add(PageDesc *p, TranslationBlock *tb,
page_already_protected = p->first_tb != (uintptr_t)NULL;
#endif
p->first_tb = (uintptr_t)tb | n;
invalidate_page_bitmap(p);
#if defined(CONFIG_USER_ONLY)
/* translator_loop() must have made all TB pages non-writable */
@ -1341,8 +1313,8 @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
* Note that inserting into the hash table first isn't an option, since
* we can only insert TBs that are fully initialized.
*/
page_lock_pair(&p, phys_pc, &p2, phys_page2, 1);
tb_page_add(p, tb, 0, phys_pc & TARGET_PAGE_MASK);
page_lock_pair(&p, phys_pc, &p2, phys_page2, true);
tb_page_add(p, tb, 0, phys_pc);
if (p2) {
tb_page_add(p2, tb, 1, phys_page2);
} else {
@ -1350,17 +1322,15 @@ tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
}
/* add in the hash table */
h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags,
tb->trace_vcpu_dstate);
h = tb_hash_func(phys_pc, (TARGET_TB_PCREL ? 0 : tb_pc(tb)),
tb->flags, tb->cflags, tb->trace_vcpu_dstate);
qht_insert(&tb_ctx.htable, tb, h, &existing_tb);
/* remove TB from the page(s) if we couldn't insert it */
if (unlikely(existing_tb)) {
tb_page_remove(p, tb);
invalidate_page_bitmap(p);
if (p2) {
tb_page_remove(p2, tb);
invalidate_page_bitmap(p2);
}
tb = existing_tb;
}
@ -1423,7 +1393,9 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
gen_code_buf = tcg_ctx->code_gen_ptr;
tb->tc.ptr = tcg_splitwx_to_rx(gen_code_buf);
#if !TARGET_TB_PCREL
tb->pc = pc;
#endif
tb->cs_base = cs_base;
tb->flags = flags;
tb->cflags = cflags;
@ -1452,7 +1424,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
tcg_ctx->cpu = NULL;
max_insns = tb->icount;
trace_translate_block(tb, tb->pc, tb->tc.ptr);
trace_translate_block(tb, pc, tb->tc.ptr);
/* generate machine code */
tb->jmp_reset_offset[0] = TB_JMP_RESET_OFFSET_INVALID;
@ -1473,7 +1445,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
ti = profile_getclock();
#endif
gen_code_size = tcg_gen_code(tcg_ctx, tb);
gen_code_size = tcg_gen_code(tcg_ctx, tb, pc);
if (unlikely(gen_code_size < 0)) {
error_return:
switch (gen_code_size) {
@ -1529,7 +1501,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
#ifdef DEBUG_DISAS
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
qemu_log_in_addr_range(tb->pc)) {
qemu_log_in_addr_range(pc)) {
FILE *logfile = qemu_log_trylock();
if (logfile) {
int code_size, data_size;
@ -1697,11 +1669,12 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
if (n == 0) {
/* NOTE: tb_end may be after the end of the page, but
it is not a problem */
tb_start = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
tb_start = tb->page_addr[0];
tb_end = tb_start + tb->size;
} else {
tb_start = tb->page_addr[1];
tb_end = tb_start + ((tb->pc + tb->size) & ~TARGET_PAGE_MASK);
tb_end = tb_start + ((tb->page_addr[0] + tb->size)
& ~TARGET_PAGE_MASK);
}
if (!(tb_end <= start || tb_start >= end)) {
#ifdef TARGET_HAS_PRECISE_SMC
@ -1731,7 +1704,6 @@ tb_invalidate_phys_page_range__locked(struct page_collection *pages,
#if !defined(CONFIG_USER_ONLY)
/* if no code remaining, no need to continue to use slow writes */
if (!p->first_tb) {
invalidate_page_bitmap(p);
tlb_unprotect_code(start);
}
#endif
@ -1827,24 +1799,8 @@ void tb_invalidate_phys_page_fast(struct page_collection *pages,
}
assert_page_locked(p);
if (!p->code_bitmap &&
++p->code_write_count >= SMC_BITMAP_USE_THRESHOLD) {
build_page_bitmap(p);
}
if (p->code_bitmap) {
unsigned int nr;
unsigned long b;
nr = start & ~TARGET_PAGE_MASK;
b = p->code_bitmap[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG - 1));
if (b & ((1 << len) - 1)) {
goto do_invalidate;
}
} else {
do_invalidate:
tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
retaddr);
}
tb_invalidate_phys_page_range__locked(pages, p, start, start + len,
retaddr);
}
#else
/* Called with mmap_lock held. If pc is not 0 then it indicates the
@ -1985,9 +1941,13 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
*/
cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_LAST_IO | n;
qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
"cpu_io_recompile: rewound execution of TB to "
TARGET_FMT_lx "\n", tb->pc);
if (qemu_loglevel_mask(CPU_LOG_EXEC)) {
target_ulong pc = log_pc(cpu, tb);
if (qemu_log_in_addr_range(pc)) {
qemu_log("cpu_io_recompile: rewound execution of TB to "
TARGET_FMT_lx "\n", pc);
}
}
cpu_loop_exit_noexc(cpu);
}
@ -2289,7 +2249,7 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
for (addr = start, len = end - start;
len != 0;
len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, 1);
PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true);
/* If the write protection bit is set, then we invalidate
the code inside. */
@ -2512,6 +2472,26 @@ int page_unprotect(target_ulong address, uintptr_t pc)
}
#endif /* CONFIG_USER_ONLY */
/*
* Called by generic code at e.g. cpu reset after cpu creation,
* therefore we must be prepared to allocate the jump cache.
*/
void tcg_flush_jmp_cache(CPUState *cpu)
{
CPUJumpCache *jc = cpu->tb_jmp_cache;
if (likely(jc)) {
for (int i = 0; i < TB_JMP_CACHE_SIZE; i++) {
qatomic_set(&jc->array[i].tb, NULL);
}
} else {
/* This should happen once during realize, and thus never race. */
jc = g_new0(CPUJumpCache, 1);
jc = qatomic_xchg(&cpu->tb_jmp_cache, jc);
assert(jc == NULL);
}
}
/* This is a wrapper for common code that can not use CONFIG_SOFTMMU */
void tcg_flush_softmmu_tlb(CPUState *cs)
{


@ -75,7 +75,7 @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int max_insns,
ops->tb_start(db, cpu);
tcg_debug_assert(db->is_jmp == DISAS_NEXT); /* no early exit */
plugin_enabled = plugin_gen_tb_start(cpu, tb, cflags & CF_MEMI_ONLY);
plugin_enabled = plugin_gen_tb_start(cpu, db, cflags & CF_MEMI_ONLY);
while (true) {
db->num_insns++;

cpu.c (9 changed lines)

@ -131,9 +131,8 @@ const VMStateDescription vmstate_cpu_common = {
void cpu_exec_realizefn(CPUState *cpu, Error **errp)
{
#ifndef CONFIG_USER_ONLY
CPUClass *cc = CPU_GET_CLASS(cpu);
#endif
/* cache the cpu class for the hotpath */
cpu->cc = CPU_GET_CLASS(cpu);
cpu_list_add(cpu);
if (!accel_cpu_realizefn(cpu, errp)) {
@ -151,8 +150,8 @@ void cpu_exec_realizefn(CPUState *cpu, Error **errp)
if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu);
}
if (cc->sysemu_ops->legacy_vmsd != NULL) {
vmstate_register(NULL, cpu->cpu_index, cc->sysemu_ops->legacy_vmsd, cpu);
if (cpu->cc->sysemu_ops->legacy_vmsd != NULL) {
vmstate_register(NULL, cpu->cpu_index, cpu->cc->sysemu_ops->legacy_vmsd, cpu);
}
#endif /* CONFIG_USER_ONLY */
}


@ -137,8 +137,7 @@ static void cpu_common_reset(DeviceState *dev)
cpu->cflags_next_tb = -1;
if (tcg_enabled()) {
cpu_tb_jmp_cache_clear(cpu);
tcg_flush_jmp_cache(cpu);
tcg_flush_softmmu_tlb(cpu);
}
}


@ -69,11 +69,10 @@ hwaddr cpu_get_phys_page_debug(CPUState *cpu, vaddr addr)
int cpu_asidx_from_attrs(CPUState *cpu, MemTxAttrs attrs)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
int ret = 0;
if (cc->sysemu_ops->asidx_from_attrs) {
ret = cc->sysemu_ops->asidx_from_attrs(cpu, attrs);
if (cpu->cc->sysemu_ops->asidx_from_attrs) {
ret = cpu->cc->sysemu_ops->asidx_from_attrs(cpu, attrs);
assert(ret < cpu->num_ases && ret >= 0);
}
return ret;


@ -38,6 +38,7 @@ void cpu_list_unlock(void);
unsigned int cpu_list_generation_id_get(void);
void tcg_flush_softmmu_tlb(CPUState *cs);
void tcg_flush_jmp_cache(CPUState *cs);
void tcg_iommu_init_notifier_list(CPUState *cpu);
void tcg_iommu_free_notifier_list(CPUState *cpu);


@ -54,6 +54,9 @@
# error TARGET_PAGE_BITS must be defined in cpu-param.h
# endif
#endif
#ifndef TARGET_TB_PCREL
# define TARGET_TB_PCREL 0
#endif
#define TARGET_LONG_SIZE (TARGET_LONG_BITS / 8)
@ -108,6 +111,7 @@ typedef uint64_t target_ulong;
# endif
# endif
/* Minimalized TLB entry for use by TCG fast path. */
typedef struct CPUTLBEntry {
/* bit TARGET_LONG_BITS to TARGET_PAGE_BITS : virtual address
bit TARGET_PAGE_BITS-1..4 : Nonzero for accesses that should not
@ -131,14 +135,14 @@ typedef struct CPUTLBEntry {
QEMU_BUILD_BUG_ON(sizeof(CPUTLBEntry) != (1 << CPU_TLB_ENTRY_BITS));
/* The IOTLB is not accessed directly inline by generated TCG code,
* so the CPUIOTLBEntry layout is not as critical as that of the
* CPUTLBEntry. (This is also why we don't want to combine the two
* structs into one.)
/*
* The full TLB entry, which is not accessed by generated TCG code,
* so the layout is not as critical as that of CPUTLBEntry. This is
* also why we don't want to combine the two structs.
*/
typedef struct CPUIOTLBEntry {
typedef struct CPUTLBEntryFull {
/*
* @addr contains:
* @xlat_section contains:
* - in the lower TARGET_PAGE_BITS, a physical section number
* - with the lower TARGET_PAGE_BITS masked off, an offset which
* must be added to the virtual address to obtain:
@ -146,9 +150,32 @@ typedef struct CPUIOTLBEntry {
* number is PHYS_SECTION_NOTDIRTY or PHYS_SECTION_ROM)
* + the offset within the target MemoryRegion (otherwise)
*/
hwaddr addr;
hwaddr xlat_section;
/*
* @phys_addr contains the physical address in the address space
* given by cpu_asidx_from_attrs(cpu, @attrs).
*/
hwaddr phys_addr;
/* @attrs contains the memory transaction attributes for the page. */
MemTxAttrs attrs;
} CPUIOTLBEntry;
/* @prot contains the complete protections for the page. */
uint8_t prot;
/* @lg_page_size contains the log2 of the page size. */
uint8_t lg_page_size;
/*
* Allow target-specific additions to this structure.
* This may be used to cache items from the guest cpu
* page tables for later use by the implementation.
*/
#ifdef TARGET_PAGE_ENTRY_EXTRA
TARGET_PAGE_ENTRY_EXTRA
#endif
} CPUTLBEntryFull;
/*
* Data elements that are per MMU mode, minus the bits accessed by
@ -172,9 +199,8 @@ typedef struct CPUTLBDesc {
size_t vindex;
/* The tlb victim table, in two parts. */
CPUTLBEntry vtable[CPU_VTLB_SIZE];
CPUIOTLBEntry viotlb[CPU_VTLB_SIZE];
/* The iotlb. */
CPUIOTLBEntry *iotlb;
CPUTLBEntryFull vfulltlb[CPU_VTLB_SIZE];
CPUTLBEntryFull *fulltlb;
} CPUTLBDesc;
/*

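The TARGET_PAGE_ENTRY_EXTRA hook in the CPUTLBEntryFull definition above lets a target append its own fields to the full TLB entry, typically to cache guest page-table attributes for later slow-path use. A hypothetical cpu-param.h fragment (the macro body is invented, not taken from any real target) could look like this:

/*
 * Hypothetical target/foo/cpu-param.h: append two cached page-table
 * attributes to every CPUTLBEntryFull.  The macro body is spliced
 * verbatim into the struct by the #ifdef above.
 */
#define TARGET_PAGE_ENTRY_EXTRA  \
    uint8_t pte_cacheability;    \
    bool pte_guarded;

A target's tlb_fill would set these fields before calling tlb_set_page_full(), and slow-path helpers could later read them back through probe_access_full() without re-walking the guest page tables.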

@ -257,6 +257,28 @@ void tlb_flush_range_by_mmuidx_all_cpus_synced(CPUState *cpu,
uint16_t idxmap,
unsigned bits);
/**
* tlb_set_page_full:
* @cpu: CPU context
* @mmu_idx: mmu index of the tlb to modify
* @vaddr: virtual address of the entry to add
* @full: the details of the tlb entry
*
* Add an entry to @cpu tlb index @mmu_idx. All of the fields of
* @full must be filled, except for xlat_section, and constitute
* the complete description of the translated page.
*
* This is generally called by the target tlb_fill function after
* having performed a successful page table walk to find the physical
* address and attributes for the translation.
*
* At most one entry for a given virtual address is permitted. Only a
* single TARGET_PAGE_SIZE region is mapped; @full->lg_page_size is only
* used by tlb_flush_page.
*/
void tlb_set_page_full(CPUState *cpu, int mmu_idx, target_ulong vaddr,
CPUTLBEntryFull *full);
/**
* tlb_set_page_with_attrs:
* @cpu: CPU to add this TLB entry for
@ -434,6 +456,21 @@ int probe_access_flags(CPUArchState *env, target_ulong addr,
MMUAccessType access_type, int mmu_idx,
bool nonfault, void **phost, uintptr_t retaddr);
#ifndef CONFIG_USER_ONLY
/**
* probe_access_full:
* Like probe_access_flags, except also return into @pfull.
*
* The CPUTLBEntryFull structure returned via @pfull is transient
* and must be consumed or copied immediately, before any further
* access or changes to TLB @mmu_idx.
*/
int probe_access_full(CPUArchState *env, target_ulong addr,
MMUAccessType access_type, int mmu_idx,
bool nonfault, void **phost,
CPUTLBEntryFull **pfull, uintptr_t retaddr);
#endif
#define CODE_GEN_ALIGN 16 /* must be >= of the size of a icache line */
/* Estimated block size for TB allocation. */
@ -459,8 +496,32 @@ struct tb_tc {
};
struct TranslationBlock {
target_ulong pc; /* simulated PC corresponding to this block (EIP + CS base) */
target_ulong cs_base; /* CS base for this block */
#if !TARGET_TB_PCREL
/*
* Guest PC corresponding to this block. This must be the true
* virtual address. Therefore e.g. x86 stores EIP + CS_BASE, and
* targets like Arm, MIPS, HP-PA, which reuse low bits for ISA or
* privilege, must store those bits elsewhere.
*
* If TARGET_TB_PCREL, the opcodes for the TranslationBlock are
* written such that the TB is associated only with the physical
* page and may be run in any virtual address context. In this case,
* PC must always be taken from ENV in a target-specific manner.
* Unwind information is taken as offsets from the page, to be
* deposited into the "current" PC.
*/
target_ulong pc;
#endif
/*
* Target-specific data associated with the TranslationBlock, e.g.:
* x86: the original user, the Code Segment virtual base,
* arm: an extension of tb->flags,
* s390x: instruction data for EXECUTE,
* sparc: the next pc of the instruction queue (for delay slots).
*/
target_ulong cs_base;
uint32_t flags; /* flags defining in which context the code was generated */
uint32_t cflags; /* compile flags */
@ -533,6 +594,16 @@ struct TranslationBlock {
uintptr_t jmp_dest[2];
};
/* Hide the read to avoid ifdefs for TARGET_TB_PCREL. */
static inline target_ulong tb_pc(const TranslationBlock *tb)
{
#if TARGET_TB_PCREL
qemu_build_not_reached();
#else
return tb->pc;
#endif
}
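Code that wants a guest PC for diagnostics therefore has to distinguish the two configurations; roughly (a sketch of the idea, not the exact helper this series adds to accel/tcg):

/* Sketch: obtain a PC suitable for logging. */
static target_ulong example_log_pc(CPUState *cpu, const TranslationBlock *tb)
{
    if (TARGET_TB_PCREL) {
        /* The TB carries no virtual PC; ask the CPU model itself. */
        return cpu->cc->get_pc(cpu);
    } else {
        return tb_pc(tb);
    }
}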
/* Hide the qatomic_read to make code a little easier on the eyes */
static inline uint32_t tb_cflags(const TranslationBlock *tb)
{


@ -19,7 +19,8 @@ struct DisasContextBase;
#ifdef CONFIG_PLUGIN
bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress);
bool plugin_gen_tb_start(CPUState *cpu, const struct DisasContextBase *db,
bool supress);
void plugin_gen_tb_end(CPUState *cpu);
void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db);
void plugin_gen_insn_end(void);
@ -48,8 +49,8 @@ static inline void plugin_insn_append(abi_ptr pc, const void *from, size_t size)
#else /* !CONFIG_PLUGIN */
static inline
bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress)
static inline bool
plugin_gen_tb_start(CPUState *cpu, const struct DisasContextBase *db, bool sup)
{
return false;
}


@ -51,6 +51,13 @@ typedef int (*WriteCoreDumpFunction)(const void *buf, size_t size,
*/
#define CPU(obj) ((CPUState *)(obj))
/*
* The class checkers bring in CPU_GET_CLASS() which is potentially
* expensive given the eventual call to
* object_class_dynamic_cast_assert(). Because of this the CPUState
* has a cached value for the class in cs->cc which is set up in
* cpu_exec_realizefn() for use in hot code paths.
*/
typedef struct CPUClass CPUClass;
DECLARE_CLASS_CHECKERS(CPUClass, CPU,
TYPE_CPU)
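The practical effect in a hot path is one pointer load instead of a dynamic cast; a minimal sketch:

/* Illustrative only: before and after the cached class. */
static inline bool example_cpu_has_work(CPUState *cpu)
{
    /* Before: CPUClass *cc = CPU_GET_CLASS(cpu);  (cast + assert per call) */
    /* After: a plain load of the pointer cached at realize time.           */
    return cpu->cc->has_work(cpu);
}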
@ -108,6 +115,8 @@ struct SysemuCPUOps;
* If the target behaviour here is anything other than "set
* the PC register to the value passed in" then the target must
* also implement the synchronize_from_tb hook.
* @get_pc: Callback for getting the Program Counter register.
* As above, with the semantics of the target architecture.
* @gdb_read_register: Callback for letting GDB read a register.
* @gdb_write_register: Callback for letting GDB write a register.
* @gdb_adjust_breakpoint: Callback for adjusting the address of a
@ -144,6 +153,7 @@ struct CPUClass {
void (*dump_state)(CPUState *cpu, FILE *, int flags);
int64_t (*get_arch_id)(CPUState *cpu);
void (*set_pc)(CPUState *cpu, vaddr value);
vaddr (*get_pc)(CPUState *cpu);
int (*gdb_read_register)(CPUState *cpu, GByteArray *buf, int reg);
int (*gdb_write_register)(CPUState *cpu, uint8_t *buf, int reg);
vaddr (*gdb_adjust_breakpoint)(CPUState *cpu, vaddr addr);
@ -218,7 +228,6 @@ struct CPUWatchpoint {
* the memory regions get moved around by io_writex.
*/
typedef struct SavedIOTLB {
hwaddr addr;
MemoryRegionSection *section;
hwaddr mr_offset;
} SavedIOTLB;
@ -230,9 +239,6 @@ struct kvm_run;
struct hax_vcpu_state;
struct hvf_vcpu_state;
#define TB_JMP_CACHE_BITS 12
#define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
/* work queue */
/* The union type allows passing of 64 bit target pointers on 32 bit
@ -317,6 +323,8 @@ struct qemu_work_item;
struct CPUState {
/*< private >*/
DeviceState parent_obj;
/* cache to avoid expensive CPU_GET_CLASS */
CPUClass *cc;
/*< public >*/
int nr_cores;
@ -361,8 +369,7 @@ struct CPUState {
CPUArchState *env_ptr;
IcountDecr *icount_decr_ptr;
/* Accessed in parallel; all accesses must be atomic */
TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];
CPUJumpCache *tb_jmp_cache;
struct GDBRegisterState *gdb_regs;
int gdb_num_regs;
@ -448,15 +455,6 @@ extern CPUTailQ cpus;
extern __thread CPUState *current_cpu;
static inline void cpu_tb_jmp_cache_clear(CPUState *cpu)
{
unsigned int i;
for (i = 0; i < TB_JMP_CACHE_SIZE; i++) {
qatomic_set(&cpu->tb_jmp_cache[i], NULL);
}
}
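The fixed-size array and its open-coded clearing loop above move behind the new CPUJumpCache type; callers now go through tcg_flush_jmp_cache(cpu). The structure itself lives in the new tb-jmp-cache.h header and looks roughly like this (paraphrased sketch, not a verbatim quote):

/* Approximate shape of the replacement, now allocated per vCPU. */
struct CPUJumpCache {
    struct {
        TranslationBlock *tb;
#if TARGET_TB_PCREL
        target_ulong pc;    /* kept here because the TB no longer stores it */
#endif
    } array[TB_JMP_CACHE_SIZE];   /* TB_JMP_CACHE_SIZE also moves there */
};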
/**
* qemu_tcg_mttcg_enabled:
* Check whether we are running MultiThread TCG or not.


@ -41,7 +41,9 @@ typedef struct CoMutex CoMutex;
typedef struct ConfidentialGuestSupport ConfidentialGuestSupport;
typedef struct CPUAddressSpace CPUAddressSpace;
typedef struct CPUArchState CPUArchState;
typedef struct CPUJumpCache CPUJumpCache;
typedef struct CPUState CPUState;
typedef struct CPUTLBEntryFull CPUTLBEntryFull;
typedef struct DeviceListener DeviceListener;
typedef struct DeviceState DeviceState;
typedef struct DirtyBitmapSnapshot DirtyBitmapSnapshot;


@ -840,7 +840,7 @@ void tcg_register_thread(void);
void tcg_prologue_init(TCGContext *s);
void tcg_func_start(TCGContext *s);
int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start);
void tcg_set_frame(TCGContext *s, TCGReg reg, intptr_t start, intptr_t size);


@ -161,7 +161,7 @@ static void restore_sigcontext(CPUSH4State *regs, struct target_sigcontext *sc)
__get_user(regs->fpul, &sc->sc_fpul);
regs->tra = -1; /* disable syscall checks */
regs->flags &= ~(DELAY_SLOT_MASK | GUSA_MASK);
regs->flags = 0;
}
void setup_frame(int sig, struct target_sigaction *ka,
@ -199,7 +199,7 @@ void setup_frame(int sig, struct target_sigaction *ka,
regs->gregs[5] = 0;
regs->gregs[6] = frame_addr += offsetof(typeof(*frame), sc);
regs->pc = (unsigned long) ka->_sa_handler;
regs->flags &= ~(DELAY_SLOT_MASK | GUSA_MASK);
regs->flags &= ~(TB_FLAG_DELAY_SLOT_MASK | TB_FLAG_GUSA_MASK);
unlock_user_struct(frame, frame_addr, 1);
return;
@ -251,7 +251,7 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
regs->gregs[5] = frame_addr + offsetof(typeof(*frame), info);
regs->gregs[6] = frame_addr + offsetof(typeof(*frame), uc);
regs->pc = (unsigned long) ka->_sa_handler;
regs->flags &= ~(DELAY_SLOT_MASK | GUSA_MASK);
regs->flags &= ~(TB_FLAG_DELAY_SLOT_MASK | TB_FLAG_GUSA_MASK);
unlock_user_struct(frame, frame_addr, 1);
return;


@ -56,7 +56,7 @@ struct qemu_plugin_ctx *plugin_id_to_ctx_locked(qemu_plugin_id_t id)
static void plugin_cpu_update__async(CPUState *cpu, run_on_cpu_data data)
{
bitmap_copy(cpu->plugin_mask, &data.host_ulong, QEMU_PLUGIN_EV_MAX);
cpu_tb_jmp_cache_clear(cpu);
tcg_flush_jmp_cache(cpu);
}
static void plugin_cpu_update__locked(gpointer k, gpointer v, gpointer udata)


@ -33,6 +33,14 @@ static void alpha_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.pc = value;
}
static vaddr alpha_cpu_get_pc(CPUState *cs)
{
AlphaCPU *cpu = ALPHA_CPU(cs);
return cpu->env.pc;
}
static bool alpha_cpu_has_work(CPUState *cs)
{
/* Here we are checking to see if the CPU should wake up from HALT.
@ -244,6 +252,7 @@ static void alpha_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = alpha_cpu_has_work;
cc->dump_state = alpha_cpu_dump_state;
cc->set_pc = alpha_cpu_set_pc;
cc->get_pc = alpha_cpu_get_pc;
cc->gdb_read_register = alpha_cpu_gdb_read_register;
cc->gdb_write_register = alpha_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -60,6 +60,18 @@ static void arm_cpu_set_pc(CPUState *cs, vaddr value)
}
}
static vaddr arm_cpu_get_pc(CPUState *cs)
{
ARMCPU *cpu = ARM_CPU(cs);
CPUARMState *env = &cpu->env;
if (is_a64(env)) {
return env->pc;
} else {
return env->regs[15];
}
}
#ifdef CONFIG_TCG
void arm_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
@ -72,9 +84,9 @@ void arm_cpu_synchronize_from_tb(CPUState *cs,
* never possible for an AArch64 TB to chain to an AArch32 TB.
*/
if (is_a64(env)) {
env->pc = tb->pc;
env->pc = tb_pc(tb);
} else {
env->regs[15] = tb->pc;
env->regs[15] = tb_pc(tb);
}
}
#endif /* CONFIG_TCG */
@ -2172,6 +2184,7 @@ static void arm_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = arm_cpu_has_work;
cc->dump_state = arm_cpu_dump_state;
cc->set_pc = arm_cpu_set_pc;
cc->get_pc = arm_cpu_get_pc;
cc->gdb_read_register = arm_cpu_gdb_read_register;
cc->gdb_write_register = arm_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -106,7 +106,7 @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
return tags + index;
#else
uintptr_t index;
CPUIOTLBEntry *iotlbentry;
CPUTLBEntryFull *full;
int in_page, flags;
ram_addr_t ptr_ra;
hwaddr ptr_paddr, tag_paddr, xlat;
@ -129,7 +129,7 @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
assert(!(flags & TLB_INVALID_MASK));
/*
* Find the iotlbentry for ptr. This *must* be present in the TLB
* Find the CPUTLBEntryFull for ptr. This *must* be present in the TLB
* because we just found the mapping.
* TODO: Perhaps there should be a cputlb helper that returns a
* matching tlb entry + iotlb entry.
@ -144,10 +144,10 @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
g_assert(tlb_hit(comparator, ptr));
}
# endif
iotlbentry = &env_tlb(env)->d[ptr_mmu_idx].iotlb[index];
full = &env_tlb(env)->d[ptr_mmu_idx].fulltlb[index];
/* If the virtual page MemAttr != Tagged, access unchecked. */
if (!arm_tlb_mte_tagged(&iotlbentry->attrs)) {
if (!arm_tlb_mte_tagged(&full->attrs)) {
return NULL;
}
@ -181,7 +181,7 @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
int wp = ptr_access == MMU_DATA_LOAD ? BP_MEM_READ : BP_MEM_WRITE;
assert(ra != 0);
cpu_check_watchpoint(env_cpu(env), ptr, ptr_size,
iotlbentry->attrs, wp, ra);
full->attrs, wp, ra);
}
/*
@ -202,11 +202,11 @@ static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
tag_paddr = ptr_paddr >> (LOG2_TAG_GRANULE + 1);
/* Look up the address in tag space. */
tag_asi = iotlbentry->attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS;
tag_asi = full->attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS;
tag_as = cpu_get_address_space(env_cpu(env), tag_asi);
mr = address_space_translate(tag_as, tag_paddr, &xlat, NULL,
tag_access == MMU_DATA_STORE,
iotlbentry->attrs);
full->attrs);
/*
* Note that @mr will never be NULL. If there is nothing in the address


@ -5384,8 +5384,8 @@ bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
g_assert(tlb_hit(comparator, addr));
# endif
CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
info->attrs = iotlbentry->attrs;
CPUTLBEntryFull *full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
info->attrs = full->attrs;
}
#endif


@ -14624,7 +14624,7 @@ static bool is_guarded_page(CPUARMState *env, DisasContext *s)
* table entry even for that case.
*/
return (tlb_hit(entry->addr_code, addr) &&
arm_tlb_bti_gp(&env_tlb(env)->d[mmu_idx].iotlb[index].attrs));
arm_tlb_bti_gp(&env_tlb(env)->d[mmu_idx].fulltlb[index].attrs));
#endif
}


@ -32,6 +32,13 @@ static void avr_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.pc_w = value / 2; /* internally PC points to words */
}
static vaddr avr_cpu_get_pc(CPUState *cs)
{
AVRCPU *cpu = AVR_CPU(cs);
return cpu->env.pc_w * 2;
}
static bool avr_cpu_has_work(CPUState *cs)
{
AVRCPU *cpu = AVR_CPU(cs);
@ -47,7 +54,7 @@ static void avr_cpu_synchronize_from_tb(CPUState *cs,
AVRCPU *cpu = AVR_CPU(cs);
CPUAVRState *env = &cpu->env;
env->pc_w = tb->pc / 2; /* internally PC points to words */
env->pc_w = tb_pc(tb) / 2; /* internally PC points to words */
}
static void avr_cpu_reset(DeviceState *ds)
@ -214,6 +221,7 @@ static void avr_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = avr_cpu_has_work;
cc->dump_state = avr_cpu_dump_state;
cc->set_pc = avr_cpu_set_pc;
cc->get_pc = avr_cpu_get_pc;
dc->vmsd = &vms_avr_cpu;
cc->sysemu_ops = &avr_sysemu_ops;
cc->disas_set_info = avr_cpu_disas_set_info;


@ -35,6 +35,13 @@ static void cris_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.pc = value;
}
static vaddr cris_cpu_get_pc(CPUState *cs)
{
CRISCPU *cpu = CRIS_CPU(cs);
return cpu->env.pc;
}
static bool cris_cpu_has_work(CPUState *cs)
{
return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI);
@ -297,6 +304,7 @@ static void cris_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = cris_cpu_has_work;
cc->dump_state = cris_cpu_dump_state;
cc->set_pc = cris_cpu_set_pc;
cc->get_pc = cris_cpu_get_pc;
cc->gdb_read_register = cris_cpu_gdb_read_register;
cc->gdb_write_register = cris_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -251,12 +251,19 @@ static void hexagon_cpu_set_pc(CPUState *cs, vaddr value)
env->gpr[HEX_REG_PC] = value;
}
static vaddr hexagon_cpu_get_pc(CPUState *cs)
{
HexagonCPU *cpu = HEXAGON_CPU(cs);
CPUHexagonState *env = &cpu->env;
return env->gpr[HEX_REG_PC];
}
static void hexagon_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
{
HexagonCPU *cpu = HEXAGON_CPU(cs);
CPUHexagonState *env = &cpu->env;
env->gpr[HEX_REG_PC] = tb->pc;
env->gpr[HEX_REG_PC] = tb_pc(tb);
}
static bool hexagon_cpu_has_work(CPUState *cs)
@ -337,6 +344,7 @@ static void hexagon_cpu_class_init(ObjectClass *c, void *data)
cc->has_work = hexagon_cpu_has_work;
cc->dump_state = hexagon_dump_state;
cc->set_pc = hexagon_cpu_set_pc;
cc->get_pc = hexagon_cpu_get_pc;
cc->gdb_read_register = hexagon_gdb_read_register;
cc->gdb_write_register = hexagon_gdb_write_register;
cc->gdb_num_core_regs = TOTAL_PER_THREAD_REGS + NUM_VREGS + NUM_QREGS;


@ -36,13 +36,20 @@ static void hppa_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.iaoq_b = value + 4;
}
static vaddr hppa_cpu_get_pc(CPUState *cs)
{
HPPACPU *cpu = HPPA_CPU(cs);
return cpu->env.iaoq_f;
}
static void hppa_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
{
HPPACPU *cpu = HPPA_CPU(cs);
#ifdef CONFIG_USER_ONLY
cpu->env.iaoq_f = tb->pc;
cpu->env.iaoq_f = tb_pc(tb);
cpu->env.iaoq_b = tb->cs_base;
#else
/* Recover the IAOQ values from the GVA + PRIV. */
@ -52,7 +59,7 @@ static void hppa_cpu_synchronize_from_tb(CPUState *cs,
int32_t diff = cs_base;
cpu->env.iasq_f = iasq_f;
cpu->env.iaoq_f = (tb->pc & ~iasq_f) + priv;
cpu->env.iaoq_f = (tb_pc(tb) & ~iasq_f) + priv;
if (diff) {
cpu->env.iaoq_b = cpu->env.iaoq_f + diff;
}
@ -168,6 +175,7 @@ static void hppa_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = hppa_cpu_has_work;
cc->dump_state = hppa_cpu_dump_state;
cc->set_pc = hppa_cpu_set_pc;
cc->get_pc = hppa_cpu_get_pc;
cc->gdb_read_register = hppa_cpu_gdb_read_register;
cc->gdb_write_register = hppa_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -6824,6 +6824,14 @@ static void x86_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.eip = value;
}
static vaddr x86_cpu_get_pc(CPUState *cs)
{
X86CPU *cpu = X86_CPU(cs);
/* Match cpu_get_tb_cpu_state. */
return cpu->env.eip + cpu->env.segs[R_CS].base;
}
int x86_cpu_pending_interrupt(CPUState *cs, int interrupt_request)
{
X86CPU *cpu = X86_CPU(cs);
@ -7106,6 +7114,7 @@ static void x86_cpu_common_class_init(ObjectClass *oc, void *data)
cc->has_work = x86_cpu_has_work;
cc->dump_state = x86_cpu_dump_state;
cc->set_pc = x86_cpu_set_pc;
cc->get_pc = x86_cpu_get_pc;
cc->gdb_read_register = x86_cpu_gdb_read_register;
cc->gdb_write_register = x86_cpu_gdb_write_register;
cc->get_arch_id = x86_cpu_get_arch_id;


@ -51,7 +51,7 @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
{
X86CPU *cpu = X86_CPU(cs);
cpu->env.eip = tb->pc - tb->cs_base;
cpu->env.eip = tb_pc(tb) - tb->cs_base;
}
#ifndef CONFIG_USER_ONLY


@ -82,6 +82,14 @@ static void loongarch_cpu_set_pc(CPUState *cs, vaddr value)
env->pc = value;
}
static vaddr loongarch_cpu_get_pc(CPUState *cs)
{
LoongArchCPU *cpu = LOONGARCH_CPU(cs);
CPULoongArchState *env = &cpu->env;
return env->pc;
}
#ifndef CONFIG_USER_ONLY
#include "hw/loongarch/virt.h"
@ -309,7 +317,7 @@ static void loongarch_cpu_synchronize_from_tb(CPUState *cs,
LoongArchCPU *cpu = LOONGARCH_CPU(cs);
CPULoongArchState *env = &cpu->env;
env->pc = tb->pc;
env->pc = tb_pc(tb);
}
#endif /* CONFIG_TCG */
@ -680,6 +688,7 @@ static void loongarch_cpu_class_init(ObjectClass *c, void *data)
cc->has_work = loongarch_cpu_has_work;
cc->dump_state = loongarch_cpu_dump_state;
cc->set_pc = loongarch_cpu_set_pc;
cc->get_pc = loongarch_cpu_get_pc;
#ifndef CONFIG_USER_ONLY
dc->vmsd = &vmstate_loongarch_cpu;
cc->sysemu_ops = &loongarch_sysemu_ops;


@ -31,6 +31,13 @@ static void m68k_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.pc = value;
}
static vaddr m68k_cpu_get_pc(CPUState *cs)
{
M68kCPU *cpu = M68K_CPU(cs);
return cpu->env.pc;
}
static bool m68k_cpu_has_work(CPUState *cs)
{
return cs->interrupt_request & CPU_INTERRUPT_HARD;
@ -540,6 +547,7 @@ static void m68k_cpu_class_init(ObjectClass *c, void *data)
cc->has_work = m68k_cpu_has_work;
cc->dump_state = m68k_cpu_dump_state;
cc->set_pc = m68k_cpu_set_pc;
cc->get_pc = m68k_cpu_get_pc;
cc->gdb_read_register = m68k_cpu_gdb_read_register;
cc->gdb_write_register = m68k_cpu_gdb_write_register;
#if defined(CONFIG_SOFTMMU)


@ -84,12 +84,19 @@ static void mb_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.iflags = 0;
}
static vaddr mb_cpu_get_pc(CPUState *cs)
{
MicroBlazeCPU *cpu = MICROBLAZE_CPU(cs);
return cpu->env.pc;
}
static void mb_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
{
MicroBlazeCPU *cpu = MICROBLAZE_CPU(cs);
cpu->env.pc = tb->pc;
cpu->env.pc = tb_pc(tb);
cpu->env.iflags = tb->flags & IFLAGS_TB_MASK;
}
@ -391,6 +398,7 @@ static void mb_cpu_class_init(ObjectClass *oc, void *data)
cc->dump_state = mb_cpu_dump_state;
cc->set_pc = mb_cpu_set_pc;
cc->get_pc = mb_cpu_get_pc;
cc->gdb_read_register = mb_cpu_gdb_read_register;
cc->gdb_write_register = mb_cpu_gdb_write_register;


@ -128,6 +128,13 @@ static void mips_cpu_set_pc(CPUState *cs, vaddr value)
mips_env_set_pc(&cpu->env, value);
}
static vaddr mips_cpu_get_pc(CPUState *cs)
{
MIPSCPU *cpu = MIPS_CPU(cs);
return cpu->env.active_tc.PC;
}
static bool mips_cpu_has_work(CPUState *cs)
{
MIPSCPU *cpu = MIPS_CPU(cs);
@ -557,6 +564,7 @@ static void mips_cpu_class_init(ObjectClass *c, void *data)
cc->has_work = mips_cpu_has_work;
cc->dump_state = mips_cpu_dump_state;
cc->set_pc = mips_cpu_set_pc;
cc->get_pc = mips_cpu_get_pc;
cc->gdb_read_register = mips_cpu_gdb_read_register;
cc->gdb_write_register = mips_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -82,7 +82,7 @@ void mips_cpu_synchronize_from_tb(CPUState *cs, const TranslationBlock *tb)
MIPSCPU *cpu = MIPS_CPU(cs);
CPUMIPSState *env = &cpu->env;
env->active_tc.PC = tb->pc;
env->active_tc.PC = tb_pc(tb);
env->hflags &= ~MIPS_HFLAG_BMASK;
env->hflags |= tb->flags & MIPS_HFLAG_BMASK;
}


@ -94,7 +94,7 @@ bool mips_io_recompile_replay_branch(CPUState *cs, const TranslationBlock *tb)
CPUMIPSState *env = &cpu->env;
if ((env->hflags & MIPS_HFLAG_BMASK) != 0
&& env->active_tc.PC != tb->pc) {
&& env->active_tc.PC != tb_pc(tb)) {
env->active_tc.PC -= (env->hflags & MIPS_HFLAG_B16 ? 2 : 4);
env->hflags &= ~MIPS_HFLAG_BMASK;
return true;


@ -34,6 +34,14 @@ static void nios2_cpu_set_pc(CPUState *cs, vaddr value)
env->pc = value;
}
static vaddr nios2_cpu_get_pc(CPUState *cs)
{
Nios2CPU *cpu = NIOS2_CPU(cs);
CPUNios2State *env = &cpu->env;
return env->pc;
}
static bool nios2_cpu_has_work(CPUState *cs)
{
return cs->interrupt_request & CPU_INTERRUPT_HARD;
@ -362,6 +370,7 @@ static void nios2_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = nios2_cpu_has_work;
cc->dump_state = nios2_cpu_dump_state;
cc->set_pc = nios2_cpu_set_pc;
cc->get_pc = nios2_cpu_get_pc;
cc->disas_set_info = nios2_cpu_disas_set_info;
#ifndef CONFIG_USER_ONLY
cc->sysemu_ops = &nios2_sysemu_ops;


@ -31,12 +31,19 @@ static void openrisc_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.dflag = 0;
}
static vaddr openrisc_cpu_get_pc(CPUState *cs)
{
OpenRISCCPU *cpu = OPENRISC_CPU(cs);
return cpu->env.pc;
}
static void openrisc_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
{
OpenRISCCPU *cpu = OPENRISC_CPU(cs);
cpu->env.pc = tb->pc;
cpu->env.pc = tb_pc(tb);
}
@ -218,6 +225,7 @@ static void openrisc_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = openrisc_cpu_has_work;
cc->dump_state = openrisc_cpu_dump_state;
cc->set_pc = openrisc_cpu_set_pc;
cc->get_pc = openrisc_cpu_get_pc;
cc->gdb_read_register = openrisc_cpu_gdb_read_register;
cc->gdb_write_register = openrisc_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -7214,6 +7214,13 @@ static void ppc_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.nip = value;
}
static vaddr ppc_cpu_get_pc(CPUState *cs)
{
PowerPCCPU *cpu = POWERPC_CPU(cs);
return cpu->env.nip;
}
static bool ppc_cpu_has_work(CPUState *cs)
{
PowerPCCPU *cpu = POWERPC_CPU(cs);
@ -7472,6 +7479,7 @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = ppc_cpu_has_work;
cc->dump_state = ppc_cpu_dump_state;
cc->set_pc = ppc_cpu_set_pc;
cc->get_pc = ppc_cpu_get_pc;
cc->gdb_read_register = ppc_cpu_gdb_read_register;
cc->gdb_write_register = ppc_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -462,6 +462,18 @@ static void riscv_cpu_set_pc(CPUState *cs, vaddr value)
}
}
static vaddr riscv_cpu_get_pc(CPUState *cs)
{
RISCVCPU *cpu = RISCV_CPU(cs);
CPURISCVState *env = &cpu->env;
/* Match cpu_get_tb_cpu_state. */
if (env->xl == MXL_RV32) {
return env->pc & UINT32_MAX;
}
return env->pc;
}
static void riscv_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
{
@ -470,9 +482,9 @@ static void riscv_cpu_synchronize_from_tb(CPUState *cs,
RISCVMXL xl = FIELD_EX32(tb->flags, TB_FLAGS, XL);
if (xl == MXL_RV32) {
env->pc = (int32_t)tb->pc;
env->pc = (int32_t)tb_pc(tb);
} else {
env->pc = tb->pc;
env->pc = tb_pc(tb);
}
}
@ -1154,6 +1166,7 @@ static void riscv_cpu_class_init(ObjectClass *c, void *data)
cc->has_work = riscv_cpu_has_work;
cc->dump_state = riscv_cpu_dump_state;
cc->set_pc = riscv_cpu_set_pc;
cc->get_pc = riscv_cpu_get_pc;
cc->gdb_read_register = riscv_cpu_gdb_read_register;
cc->gdb_write_register = riscv_cpu_gdb_write_register;
cc->gdb_num_core_regs = 33;


@ -32,12 +32,19 @@ static void rx_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.pc = value;
}
static vaddr rx_cpu_get_pc(CPUState *cs)
{
RXCPU *cpu = RX_CPU(cs);
return cpu->env.pc;
}
static void rx_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
{
RXCPU *cpu = RX_CPU(cs);
cpu->env.pc = tb->pc;
cpu->env.pc = tb_pc(tb);
}
static bool rx_cpu_has_work(CPUState *cs)
@ -208,6 +215,7 @@ static void rx_cpu_class_init(ObjectClass *klass, void *data)
cc->has_work = rx_cpu_has_work;
cc->dump_state = rx_cpu_dump_state;
cc->set_pc = rx_cpu_set_pc;
cc->get_pc = rx_cpu_get_pc;
#ifndef CONFIG_USER_ONLY
cc->sysemu_ops = &rx_sysemu_ops;


@ -88,6 +88,13 @@ static void s390_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.psw.addr = value;
}
static vaddr s390_cpu_get_pc(CPUState *cs)
{
S390CPU *cpu = S390_CPU(cs);
return cpu->env.psw.addr;
}
static bool s390_cpu_has_work(CPUState *cs)
{
S390CPU *cpu = S390_CPU(cs);
@ -297,6 +304,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = s390_cpu_has_work;
cc->dump_state = s390_cpu_dump_state;
cc->set_pc = s390_cpu_set_pc;
cc->get_pc = s390_cpu_get_pc;
cc->gdb_read_register = s390_cpu_gdb_read_register;
cc->gdb_write_register = s390_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -148,10 +148,6 @@ static int s390_probe_access(CPUArchState *env, target_ulong addr, int size,
#else
int flags;
/*
* For !CONFIG_USER_ONLY, we cannot rely on TLB_INVALID_MASK or haddr==NULL
* to detect if there was an exception during tlb_fill().
*/
env->tlb_fill_exc = 0;
flags = probe_access_flags(env, addr, access_type, mmu_idx, nonfault, phost,
ra);


@ -34,13 +34,20 @@ static void superh_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.pc = value;
}
static vaddr superh_cpu_get_pc(CPUState *cs)
{
SuperHCPU *cpu = SUPERH_CPU(cs);
return cpu->env.pc;
}
static void superh_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
{
SuperHCPU *cpu = SUPERH_CPU(cs);
cpu->env.pc = tb->pc;
cpu->env.flags = tb->flags & TB_FLAG_ENVFLAGS_MASK;
cpu->env.pc = tb_pc(tb);
cpu->env.flags = tb->flags;
}
#ifndef CONFIG_USER_ONLY
@ -50,10 +57,10 @@ static bool superh_io_recompile_replay_branch(CPUState *cs,
SuperHCPU *cpu = SUPERH_CPU(cs);
CPUSH4State *env = &cpu->env;
if ((env->flags & ((DELAY_SLOT | DELAY_SLOT_CONDITIONAL))) != 0
&& env->pc != tb->pc) {
if ((env->flags & (TB_FLAG_DELAY_SLOT | TB_FLAG_DELAY_SLOT_COND))
&& env->pc != tb_pc(tb)) {
env->pc -= 2;
env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
env->flags &= ~(TB_FLAG_DELAY_SLOT | TB_FLAG_DELAY_SLOT_COND);
return true;
}
return false;
@ -261,6 +268,7 @@ static void superh_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = superh_cpu_has_work;
cc->dump_state = superh_cpu_dump_state;
cc->set_pc = superh_cpu_set_pc;
cc->get_pc = superh_cpu_get_pc;
cc->gdb_read_register = superh_cpu_gdb_read_register;
cc->gdb_write_register = superh_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -78,26 +78,33 @@
#define FPSCR_RM_NEAREST (0 << 0)
#define FPSCR_RM_ZERO (1 << 0)
#define DELAY_SLOT_MASK 0x7
#define DELAY_SLOT (1 << 0)
#define DELAY_SLOT_CONDITIONAL (1 << 1)
#define DELAY_SLOT_RTE (1 << 2)
#define TB_FLAG_DELAY_SLOT (1 << 0)
#define TB_FLAG_DELAY_SLOT_COND (1 << 1)
#define TB_FLAG_DELAY_SLOT_RTE (1 << 2)
#define TB_FLAG_PENDING_MOVCA (1 << 3)
#define TB_FLAG_GUSA_SHIFT 4 /* [11:4] */
#define TB_FLAG_GUSA_EXCLUSIVE (1 << 12)
#define TB_FLAG_UNALIGN (1 << 13)
#define TB_FLAG_SR_FD (1 << SR_FD) /* 15 */
#define TB_FLAG_FPSCR_PR FPSCR_PR /* 19 */
#define TB_FLAG_FPSCR_SZ FPSCR_SZ /* 20 */
#define TB_FLAG_FPSCR_FR FPSCR_FR /* 21 */
#define TB_FLAG_SR_RB (1 << SR_RB) /* 29 */
#define TB_FLAG_SR_MD (1 << SR_MD) /* 30 */
#define TB_FLAG_PENDING_MOVCA (1 << 3)
#define TB_FLAG_UNALIGN (1 << 4)
#define GUSA_SHIFT 4
#ifdef CONFIG_USER_ONLY
#define GUSA_EXCLUSIVE (1 << 12)
#define GUSA_MASK ((0xff << GUSA_SHIFT) | GUSA_EXCLUSIVE)
#else
/* Provide dummy versions of the above to allow tests against tbflags
to be elided while avoiding ifdefs. */
#define GUSA_EXCLUSIVE 0
#define GUSA_MASK 0
#endif
#define TB_FLAG_ENVFLAGS_MASK (DELAY_SLOT_MASK | GUSA_MASK)
#define TB_FLAG_DELAY_SLOT_MASK (TB_FLAG_DELAY_SLOT | \
TB_FLAG_DELAY_SLOT_COND | \
TB_FLAG_DELAY_SLOT_RTE)
#define TB_FLAG_GUSA_MASK ((0xff << TB_FLAG_GUSA_SHIFT) | \
TB_FLAG_GUSA_EXCLUSIVE)
#define TB_FLAG_FPSCR_MASK (TB_FLAG_FPSCR_PR | \
TB_FLAG_FPSCR_SZ | \
TB_FLAG_FPSCR_FR)
#define TB_FLAG_SR_MASK (TB_FLAG_SR_FD | \
TB_FLAG_SR_RB | \
TB_FLAG_SR_MD)
#define TB_FLAG_ENVFLAGS_MASK (TB_FLAG_DELAY_SLOT_MASK | \
TB_FLAG_GUSA_MASK)
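Taken together, the translator flags now occupy disjoint, documented bit ranges, as assembled by cpu_get_tb_cpu_state() below:

/*
 *  bits  0-2   TB_FLAG_DELAY_SLOT / _COND / _RTE
 *  bit   3     TB_FLAG_PENDING_MOVCA
 *  bits  4-11  gUSA displacement (TB_FLAG_GUSA_SHIFT)
 *  bit  12     TB_FLAG_GUSA_EXCLUSIVE
 *  bit  13     TB_FLAG_UNALIGN
 *  bit  15     TB_FLAG_SR_FD
 *  bits 19-21  TB_FLAG_FPSCR_PR / _SZ / _FR
 *  bits 29-30  TB_FLAG_SR_RB / _MD
 */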
typedef struct tlb_t {
uint32_t vpn; /* virtual page number */
@ -258,7 +265,7 @@ static inline int cpu_mmu_index (CPUSH4State *env, bool ifetch)
{
/* The instruction in a RTE delay slot is fetched in privileged
mode, but executed in user mode. */
if (ifetch && (env->flags & DELAY_SLOT_RTE)) {
if (ifetch && (env->flags & TB_FLAG_DELAY_SLOT_RTE)) {
return 0;
} else {
return (env->sr & (1u << SR_MD)) == 0 ? 1 : 0;
@ -366,11 +373,10 @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, target_ulong *pc,
{
*pc = env->pc;
/* For a gUSA region, notice the end of the region. */
*cs_base = env->flags & GUSA_MASK ? env->gregs[0] : 0;
*flags = env->flags /* TB_FLAG_ENVFLAGS_MASK: bits 0-2, 4-12 */
| (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR)) /* Bits 19-21 */
| (env->sr & ((1u << SR_MD) | (1u << SR_RB))) /* Bits 29-30 */
| (env->sr & (1u << SR_FD)) /* Bit 15 */
*cs_base = env->flags & TB_FLAG_GUSA_MASK ? env->gregs[0] : 0;
*flags = env->flags
| (env->fpscr & TB_FLAG_FPSCR_MASK)
| (env->sr & TB_FLAG_SR_MASK)
| (env->movcal_backup ? TB_FLAG_PENDING_MOVCA : 0); /* Bit 3 */
#ifdef CONFIG_USER_ONLY
*flags |= TB_FLAG_UNALIGN * !env_cpu(env)->prctl_unalign_sigbus;


@ -147,11 +147,11 @@ void superh_cpu_do_interrupt(CPUState *cs)
env->sr |= (1u << SR_BL) | (1u << SR_MD) | (1u << SR_RB);
env->lock_addr = -1;
if (env->flags & DELAY_SLOT_MASK) {
if (env->flags & TB_FLAG_DELAY_SLOT_MASK) {
/* Branch instruction should be executed again before delay slot. */
env->spc -= 2;
/* Clear flags for exception/interrupt routine. */
env->flags &= ~DELAY_SLOT_MASK;
env->flags &= ~TB_FLAG_DELAY_SLOT_MASK;
}
if (do_exp) {
@ -786,7 +786,7 @@ bool superh_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
CPUSH4State *env = &cpu->env;
/* Delay slots are indivisible, ignore interrupts */
if (env->flags & DELAY_SLOT_MASK) {
if (env->flags & TB_FLAG_DELAY_SLOT_MASK) {
return false;
} else {
superh_cpu_do_interrupt(cs);


@ -175,13 +175,13 @@ void superh_cpu_dump_state(CPUState *cs, FILE *f, int flags)
i, env->gregs[i], i + 1, env->gregs[i + 1],
i + 2, env->gregs[i + 2], i + 3, env->gregs[i + 3]);
}
if (env->flags & DELAY_SLOT) {
if (env->flags & TB_FLAG_DELAY_SLOT) {
qemu_printf("in delay slot (delayed_pc=0x%08x)\n",
env->delayed_pc);
} else if (env->flags & DELAY_SLOT_CONDITIONAL) {
} else if (env->flags & TB_FLAG_DELAY_SLOT_COND) {
qemu_printf("in conditional delay slot (delayed_pc=0x%08x)\n",
env->delayed_pc);
} else if (env->flags & DELAY_SLOT_RTE) {
} else if (env->flags & TB_FLAG_DELAY_SLOT_RTE) {
qemu_fprintf(f, "in rte delay slot (delayed_pc=0x%08x)\n",
env->delayed_pc);
}
@ -223,7 +223,7 @@ static inline void gen_save_cpu_state(DisasContext *ctx, bool save_pc)
static inline bool use_exit_tb(DisasContext *ctx)
{
return (ctx->tbflags & GUSA_EXCLUSIVE) != 0;
return (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE) != 0;
}
static bool use_goto_tb(DisasContext *ctx, target_ulong dest)
@ -276,12 +276,12 @@ static void gen_conditional_jump(DisasContext *ctx, target_ulong dest,
TCGLabel *l1 = gen_new_label();
TCGCond cond_not_taken = jump_if_true ? TCG_COND_EQ : TCG_COND_NE;
if (ctx->tbflags & GUSA_EXCLUSIVE) {
if (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE) {
/* When in an exclusive region, we must continue to the end.
Therefore, exit the region on a taken branch, but otherwise
fall through to the next instruction. */
tcg_gen_brcondi_i32(cond_not_taken, cpu_sr_t, 0, l1);
tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~GUSA_MASK);
tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~TB_FLAG_GUSA_MASK);
/* Note that this won't actually use a goto_tb opcode because we
disallow it in use_goto_tb, but it handles exit + singlestep. */
gen_goto_tb(ctx, 0, dest);
@ -307,14 +307,14 @@ static void gen_delayed_conditional_jump(DisasContext * ctx)
tcg_gen_mov_i32(ds, cpu_delayed_cond);
tcg_gen_discard_i32(cpu_delayed_cond);
if (ctx->tbflags & GUSA_EXCLUSIVE) {
if (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE) {
/* When in an exclusive region, we must continue to the end.
Therefore, exit the region on a taken branch, but otherwise
fall through to the next instruction. */
tcg_gen_brcondi_i32(TCG_COND_EQ, ds, 0, l1);
/* Leave the gUSA region. */
tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~GUSA_MASK);
tcg_gen_movi_i32(cpu_flags, ctx->envflags & ~TB_FLAG_GUSA_MASK);
gen_jump(ctx);
gen_set_label(l1);
@ -361,8 +361,8 @@ static inline void gen_store_fpr64(DisasContext *ctx, TCGv_i64 t, int reg)
#define XHACK(x) ((((x) & 1 ) << 4) | ((x) & 0xe))
#define CHECK_NOT_DELAY_SLOT \
if (ctx->envflags & DELAY_SLOT_MASK) { \
goto do_illegal_slot; \
if (ctx->envflags & TB_FLAG_DELAY_SLOT_MASK) { \
goto do_illegal_slot; \
}
#define CHECK_PRIVILEGED \
@ -436,7 +436,7 @@ static void _decode_opc(DisasContext * ctx)
case 0x000b: /* rts */
CHECK_NOT_DELAY_SLOT
tcg_gen_mov_i32(cpu_delayed_pc, cpu_pr);
ctx->envflags |= DELAY_SLOT;
ctx->envflags |= TB_FLAG_DELAY_SLOT;
ctx->delayed_pc = (uint32_t) - 1;
return;
case 0x0028: /* clrmac */
@ -458,7 +458,7 @@ static void _decode_opc(DisasContext * ctx)
CHECK_NOT_DELAY_SLOT
gen_write_sr(cpu_ssr);
tcg_gen_mov_i32(cpu_delayed_pc, cpu_spc);
ctx->envflags |= DELAY_SLOT_RTE;
ctx->envflags |= TB_FLAG_DELAY_SLOT_RTE;
ctx->delayed_pc = (uint32_t) - 1;
ctx->base.is_jmp = DISAS_STOP;
return;
@ -513,12 +513,15 @@ static void _decode_opc(DisasContext * ctx)
return;
case 0xe000: /* mov #imm,Rn */
#ifdef CONFIG_USER_ONLY
/* Detect the start of a gUSA region. If so, update envflags
and end the TB. This will allow us to see the end of the
region (stored in R0) in the next TB. */
/*
* Detect the start of a gUSA region (mov #-n, r15).
* If so, update envflags and end the TB. This will allow us
* to see the end of the region (stored in R0) in the next TB.
*/
if (B11_8 == 15 && B7_0s < 0 &&
(tb_cflags(ctx->base.tb) & CF_PARALLEL)) {
ctx->envflags = deposit32(ctx->envflags, GUSA_SHIFT, 8, B7_0s);
ctx->envflags =
deposit32(ctx->envflags, TB_FLAG_GUSA_SHIFT, 8, B7_0s);
ctx->base.is_jmp = DISAS_STOP;
}
#endif
@ -544,13 +547,13 @@ static void _decode_opc(DisasContext * ctx)
case 0xa000: /* bra disp */
CHECK_NOT_DELAY_SLOT
ctx->delayed_pc = ctx->base.pc_next + 4 + B11_0s * 2;
ctx->envflags |= DELAY_SLOT;
ctx->envflags |= TB_FLAG_DELAY_SLOT;
return;
case 0xb000: /* bsr disp */
CHECK_NOT_DELAY_SLOT
tcg_gen_movi_i32(cpu_pr, ctx->base.pc_next + 4);
ctx->delayed_pc = ctx->base.pc_next + 4 + B11_0s * 2;
ctx->envflags |= DELAY_SLOT;
ctx->envflags |= TB_FLAG_DELAY_SLOT;
return;
}
@ -1194,7 +1197,7 @@ static void _decode_opc(DisasContext * ctx)
CHECK_NOT_DELAY_SLOT
tcg_gen_xori_i32(cpu_delayed_cond, cpu_sr_t, 1);
ctx->delayed_pc = ctx->base.pc_next + 4 + B7_0s * 2;
ctx->envflags |= DELAY_SLOT_CONDITIONAL;
ctx->envflags |= TB_FLAG_DELAY_SLOT_COND;
return;
case 0x8900: /* bt label */
CHECK_NOT_DELAY_SLOT
@ -1204,7 +1207,7 @@ static void _decode_opc(DisasContext * ctx)
CHECK_NOT_DELAY_SLOT
tcg_gen_mov_i32(cpu_delayed_cond, cpu_sr_t);
ctx->delayed_pc = ctx->base.pc_next + 4 + B7_0s * 2;
ctx->envflags |= DELAY_SLOT_CONDITIONAL;
ctx->envflags |= TB_FLAG_DELAY_SLOT_COND;
return;
case 0x8800: /* cmp/eq #imm,R0 */
tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, REG(0), B7_0s);
@ -1388,14 +1391,14 @@ static void _decode_opc(DisasContext * ctx)
case 0x0023: /* braf Rn */
CHECK_NOT_DELAY_SLOT
tcg_gen_addi_i32(cpu_delayed_pc, REG(B11_8), ctx->base.pc_next + 4);
ctx->envflags |= DELAY_SLOT;
ctx->envflags |= TB_FLAG_DELAY_SLOT;
ctx->delayed_pc = (uint32_t) - 1;
return;
case 0x0003: /* bsrf Rn */
CHECK_NOT_DELAY_SLOT
tcg_gen_movi_i32(cpu_pr, ctx->base.pc_next + 4);
tcg_gen_add_i32(cpu_delayed_pc, REG(B11_8), cpu_pr);
ctx->envflags |= DELAY_SLOT;
ctx->envflags |= TB_FLAG_DELAY_SLOT;
ctx->delayed_pc = (uint32_t) - 1;
return;
case 0x4015: /* cmp/pl Rn */
@ -1411,14 +1414,14 @@ static void _decode_opc(DisasContext * ctx)
case 0x402b: /* jmp @Rn */
CHECK_NOT_DELAY_SLOT
tcg_gen_mov_i32(cpu_delayed_pc, REG(B11_8));
ctx->envflags |= DELAY_SLOT;
ctx->envflags |= TB_FLAG_DELAY_SLOT;
ctx->delayed_pc = (uint32_t) - 1;
return;
case 0x400b: /* jsr @Rn */
CHECK_NOT_DELAY_SLOT
tcg_gen_movi_i32(cpu_pr, ctx->base.pc_next + 4);
tcg_gen_mov_i32(cpu_delayed_pc, REG(B11_8));
ctx->envflags |= DELAY_SLOT;
ctx->envflags |= TB_FLAG_DELAY_SLOT;
ctx->delayed_pc = (uint32_t) - 1;
return;
case 0x400e: /* ldc Rm,SR */
@ -1839,7 +1842,7 @@ static void _decode_opc(DisasContext * ctx)
fflush(stderr);
#endif
do_illegal:
if (ctx->envflags & DELAY_SLOT_MASK) {
if (ctx->envflags & TB_FLAG_DELAY_SLOT_MASK) {
do_illegal_slot:
gen_save_cpu_state(ctx, true);
gen_helper_raise_slot_illegal_instruction(cpu_env);
@ -1852,7 +1855,7 @@ static void _decode_opc(DisasContext * ctx)
do_fpu_disabled:
gen_save_cpu_state(ctx, true);
if (ctx->envflags & DELAY_SLOT_MASK) {
if (ctx->envflags & TB_FLAG_DELAY_SLOT_MASK) {
gen_helper_raise_slot_fpu_disable(cpu_env);
} else {
gen_helper_raise_fpu_disable(cpu_env);
@ -1867,23 +1870,23 @@ static void decode_opc(DisasContext * ctx)
_decode_opc(ctx);
if (old_flags & DELAY_SLOT_MASK) {
if (old_flags & TB_FLAG_DELAY_SLOT_MASK) {
/* go out of the delay slot */
ctx->envflags &= ~DELAY_SLOT_MASK;
ctx->envflags &= ~TB_FLAG_DELAY_SLOT_MASK;
/* When in an exclusive region, we must continue to the end
for conditional branches. */
if (ctx->tbflags & GUSA_EXCLUSIVE
&& old_flags & DELAY_SLOT_CONDITIONAL) {
if (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE
&& old_flags & TB_FLAG_DELAY_SLOT_COND) {
gen_delayed_conditional_jump(ctx);
return;
}
/* Otherwise this is probably an invalid gUSA region.
Drop the GUSA bits so the next TB doesn't see them. */
ctx->envflags &= ~GUSA_MASK;
ctx->envflags &= ~TB_FLAG_GUSA_MASK;
tcg_gen_movi_i32(cpu_flags, ctx->envflags);
if (old_flags & DELAY_SLOT_CONDITIONAL) {
if (old_flags & TB_FLAG_DELAY_SLOT_COND) {
gen_delayed_conditional_jump(ctx);
} else {
gen_jump(ctx);
@ -2223,7 +2226,7 @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env)
}
/* The entire region has been translated. */
ctx->envflags &= ~GUSA_MASK;
ctx->envflags &= ~TB_FLAG_GUSA_MASK;
ctx->base.pc_next = pc_end;
ctx->base.num_insns += max_insns - 1;
return;
@ -2234,7 +2237,7 @@ static void decode_gusa(DisasContext *ctx, CPUSH4State *env)
/* Restart with the EXCLUSIVE bit set, within a TB run via
cpu_exec_step_atomic holding the exclusive lock. */
ctx->envflags |= GUSA_EXCLUSIVE;
ctx->envflags |= TB_FLAG_GUSA_EXCLUSIVE;
gen_save_cpu_state(ctx, false);
gen_helper_exclusive(cpu_env);
ctx->base.is_jmp = DISAS_NORETURN;
@ -2267,17 +2270,19 @@ static void sh4_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
(tbflags & (1 << SR_RB))) * 0x10;
ctx->fbank = tbflags & FPSCR_FR ? 0x10 : 0;
if (tbflags & GUSA_MASK) {
#ifdef CONFIG_USER_ONLY
if (tbflags & TB_FLAG_GUSA_MASK) {
/* In gUSA exclusive region. */
uint32_t pc = ctx->base.pc_next;
uint32_t pc_end = ctx->base.tb->cs_base;
int backup = sextract32(ctx->tbflags, GUSA_SHIFT, 8);
int backup = sextract32(ctx->tbflags, TB_FLAG_GUSA_SHIFT, 8);
int max_insns = (pc_end - pc) / 2;
if (pc != pc_end + backup || max_insns < 2) {
/* This is a malformed gUSA region. Don't do anything special,
since the interpreter is likely to get confused. */
ctx->envflags &= ~GUSA_MASK;
} else if (tbflags & GUSA_EXCLUSIVE) {
ctx->envflags &= ~TB_FLAG_GUSA_MASK;
} else if (tbflags & TB_FLAG_GUSA_EXCLUSIVE) {
/* Regardless of single-stepping or the end of the page,
we must complete execution of the gUSA region while
holding the exclusive lock. */
@ -2285,6 +2290,7 @@ static void sh4_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
return;
}
}
#endif
/* Since the ISA is fixed-width, we can bound by the number
of instructions remaining on the page. */
@ -2309,8 +2315,8 @@ static void sh4_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
DisasContext *ctx = container_of(dcbase, DisasContext, base);
#ifdef CONFIG_USER_ONLY
if (unlikely(ctx->envflags & GUSA_MASK)
&& !(ctx->envflags & GUSA_EXCLUSIVE)) {
if (unlikely(ctx->envflags & TB_FLAG_GUSA_MASK)
&& !(ctx->envflags & TB_FLAG_GUSA_EXCLUSIVE)) {
/* We're in an gUSA region, and we have not already fallen
back on using an exclusive region. Attempt to parse the
region into a single supported atomic operation. Failure
@ -2330,9 +2336,9 @@ static void sh4_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
{
DisasContext *ctx = container_of(dcbase, DisasContext, base);
if (ctx->tbflags & GUSA_EXCLUSIVE) {
if (ctx->tbflags & TB_FLAG_GUSA_EXCLUSIVE) {
/* Ending the region of exclusivity. Clear the bits. */
ctx->envflags &= ~GUSA_MASK;
ctx->envflags &= ~TB_FLAG_GUSA_MASK;
}
switch (ctx->base.is_jmp) {


@ -693,12 +693,19 @@ static void sparc_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.npc = value + 4;
}
static vaddr sparc_cpu_get_pc(CPUState *cs)
{
SPARCCPU *cpu = SPARC_CPU(cs);
return cpu->env.pc;
}
static void sparc_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
{
SPARCCPU *cpu = SPARC_CPU(cs);
cpu->env.pc = tb->pc;
cpu->env.pc = tb_pc(tb);
cpu->env.npc = tb->cs_base;
}
@ -896,6 +903,7 @@ static void sparc_cpu_class_init(ObjectClass *oc, void *data)
cc->memory_rw_debug = sparc_cpu_memory_rw_debug;
#endif
cc->set_pc = sparc_cpu_set_pc;
cc->get_pc = sparc_cpu_get_pc;
cc->gdb_read_register = sparc_cpu_gdb_read_register;
cc->gdb_write_register = sparc_cpu_gdb_write_register;
#ifndef CONFIG_USER_ONLY


@ -41,13 +41,21 @@ static void tricore_cpu_set_pc(CPUState *cs, vaddr value)
env->PC = value & ~(target_ulong)1;
}
static vaddr tricore_cpu_get_pc(CPUState *cs)
{
TriCoreCPU *cpu = TRICORE_CPU(cs);
CPUTriCoreState *env = &cpu->env;
return env->PC;
}
static void tricore_cpu_synchronize_from_tb(CPUState *cs,
const TranslationBlock *tb)
{
TriCoreCPU *cpu = TRICORE_CPU(cs);
CPUTriCoreState *env = &cpu->env;
env->PC = tb->pc;
env->PC = tb_pc(tb);
}
static void tricore_cpu_reset(DeviceState *dev)
@ -176,6 +184,7 @@ static void tricore_cpu_class_init(ObjectClass *c, void *data)
cc->dump_state = tricore_cpu_dump_state;
cc->set_pc = tricore_cpu_set_pc;
cc->get_pc = tricore_cpu_get_pc;
cc->sysemu_ops = &tricore_sysemu_ops;
cc->tcg_ops = &tricore_tcg_ops;
}


@ -44,6 +44,13 @@ static void xtensa_cpu_set_pc(CPUState *cs, vaddr value)
cpu->env.pc = value;
}
static vaddr xtensa_cpu_get_pc(CPUState *cs)
{
XtensaCPU *cpu = XTENSA_CPU(cs);
return cpu->env.pc;
}
static bool xtensa_cpu_has_work(CPUState *cs)
{
#ifndef CONFIG_USER_ONLY
@ -233,6 +240,7 @@ static void xtensa_cpu_class_init(ObjectClass *oc, void *data)
cc->has_work = xtensa_cpu_has_work;
cc->dump_state = xtensa_cpu_dump_state;
cc->set_pc = xtensa_cpu_set_pc;
cc->get_pc = xtensa_cpu_get_pc;
cc->gdb_read_register = xtensa_cpu_gdb_read_register;
cc->gdb_write_register = xtensa_cpu_gdb_write_register;
cc->gdb_stop_before_watchpoint = true;


@ -1847,44 +1847,101 @@ static void tcg_out_mb(TCGContext *s, TCGArg a0)
tcg_out32(s, insn);
}
static inline uint64_t make_pair(tcg_insn_unit i1, tcg_insn_unit i2)
{
if (HOST_BIG_ENDIAN) {
return (uint64_t)i1 << 32 | i2;
}
return (uint64_t)i2 << 32 | i1;
}
static inline void ppc64_replace2(uintptr_t rx, uintptr_t rw,
tcg_insn_unit i0, tcg_insn_unit i1)
{
#if TCG_TARGET_REG_BITS == 64
qatomic_set((uint64_t *)rw, make_pair(i0, i1));
flush_idcache_range(rx, rw, 8);
#else
qemu_build_not_reached();
#endif
}
static inline void ppc64_replace4(uintptr_t rx, uintptr_t rw,
tcg_insn_unit i0, tcg_insn_unit i1,
tcg_insn_unit i2, tcg_insn_unit i3)
{
uint64_t p[2];
p[!HOST_BIG_ENDIAN] = make_pair(i0, i1);
p[HOST_BIG_ENDIAN] = make_pair(i2, i3);
/*
* There's no convenient way to get the compiler to allocate a pair
* of registers at an even index, so copy into r6/r7 and clobber.
*/
asm("mr %%r6, %1\n\t"
"mr %%r7, %2\n\t"
"stq %%r6, %0"
: "=Q"(*(__int128 *)rw) : "r"(p[0]), "r"(p[1]) : "r6", "r7");
flush_idcache_range(rx, rw, 16);
}
void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
uintptr_t jmp_rw, uintptr_t addr)
{
if (TCG_TARGET_REG_BITS == 64) {
tcg_insn_unit i1, i2;
intptr_t tb_diff = addr - tc_ptr;
intptr_t br_diff = addr - (jmp_rx + 4);
uint64_t pair;
tcg_insn_unit i0, i1, i2, i3;
intptr_t tb_diff = addr - tc_ptr;
intptr_t br_diff = addr - (jmp_rx + 4);
intptr_t lo, hi;
/* This does not exercise the range of the branch, but we do
still need to be able to load the new value of TCG_REG_TB.
But this does still happen quite often. */
if (tb_diff == (int16_t)tb_diff) {
i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
i2 = B | (br_diff & 0x3fffffc);
} else {
intptr_t lo = (int16_t)tb_diff;
intptr_t hi = (int32_t)(tb_diff - lo);
assert(tb_diff == hi + lo);
i1 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
i2 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
}
#if HOST_BIG_ENDIAN
pair = (uint64_t)i1 << 32 | i2;
#else
pair = (uint64_t)i2 << 32 | i1;
#endif
/* As per the enclosing if, this is ppc64. Avoid the _Static_assert
within qatomic_set that would fail to build a ppc32 host. */
qatomic_set__nocheck((uint64_t *)jmp_rw, pair);
flush_idcache_range(jmp_rx, jmp_rw, 8);
} else {
if (TCG_TARGET_REG_BITS == 32) {
intptr_t diff = addr - jmp_rx;
tcg_debug_assert(in_range_b(diff));
qatomic_set((uint32_t *)jmp_rw, B | (diff & 0x3fffffc));
flush_idcache_range(jmp_rx, jmp_rw, 4);
return;
}
/*
* For 16-bit displacements, we can use a single add + branch.
* This happens quite often.
*/
if (tb_diff == (int16_t)tb_diff) {
i0 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, tb_diff);
i1 = B | (br_diff & 0x3fffffc);
ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
return;
}
lo = (int16_t)tb_diff;
hi = (int32_t)(tb_diff - lo);
assert(tb_diff == hi + lo);
i0 = ADDIS | TAI(TCG_REG_TB, TCG_REG_TB, hi >> 16);
i1 = ADDI | TAI(TCG_REG_TB, TCG_REG_TB, lo);
/*
* Without stq from 2.07, we can only update two insns,
* and those must be the ones that load the target address.
*/
if (!have_isa_2_07) {
ppc64_replace2(jmp_rx, jmp_rw, i0, i1);
return;
}
/*
* For 26-bit displacements, we can use a direct branch.
* Otherwise we still need the indirect branch, which we
* must restore after a potential direct branch write.
*/
br_diff -= 4;
if (in_range_b(br_diff)) {
i2 = B | (br_diff & 0x3fffffc);
i3 = NOP;
} else {
i2 = MTSPR | RS(TCG_REG_TB) | CTR;
i3 = BCCTR | BO_ALWAYS;
}
ppc64_replace4(jmp_rx, jmp_rw, i0, i1, i2, i3);
}
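Concretely, when the displacement to the new TB does not fit in 16 bits, the 16-byte-aligned jump slot on an ISA 2.07 host ends up holding one of two four-instruction sequences, both written atomically by the single stq in ppc64_replace4() above (a sketch of the patched code, not a literal disassembly):

/*
 *   addis TCG_REG_TB, TCG_REG_TB, hi16(tb_diff)    ; i0
 *   addi  TCG_REG_TB, TCG_REG_TB, lo16(tb_diff)    ; i1
 *   b     <new tb>                                 ; i2, 26-bit range
 *   nop                                            ; i3
 *
 * or, when the branch displacement does not fit in 26 bits:
 *
 *   addis TCG_REG_TB, TCG_REG_TB, hi16(tb_diff)
 *   addi  TCG_REG_TB, TCG_REG_TB, lo16(tb_diff)
 *   mtctr TCG_REG_TB
 *   bctr
 */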
static void tcg_out_call_int(TCGContext *s, int lk,
@ -2574,8 +2631,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
if (s->tb_jmp_insn_offset) {
/* Direct jump. */
if (TCG_TARGET_REG_BITS == 64) {
/* Ensure the next insns are 8-byte aligned. */
if ((uintptr_t)s->code_ptr & 7) {
/* Ensure the next insns are 8 or 16-byte aligned. */
while ((uintptr_t)s->code_ptr & (have_isa_2_07 ? 15 : 7)) {
tcg_out32(s, NOP);
}
s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);


@ -4188,7 +4188,7 @@ int64_t tcg_cpu_exec_time(void)
#endif
int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
int tcg_gen_code(TCGContext *s, TranslationBlock *tb, target_ulong pc_start)
{
#ifdef CONFIG_PROFILER
TCGProfile *prof = &s->prof;
@ -4218,7 +4218,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP)
&& qemu_log_in_addr_range(tb->pc))) {
&& qemu_log_in_addr_range(pc_start))) {
FILE *logfile = qemu_log_trylock();
if (logfile) {
fprintf(logfile, "OP:\n");
@ -4265,7 +4265,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
if (s->nb_indirects > 0) {
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND)
&& qemu_log_in_addr_range(tb->pc))) {
&& qemu_log_in_addr_range(pc_start))) {
FILE *logfile = qemu_log_trylock();
if (logfile) {
fprintf(logfile, "OP before indirect lowering:\n");
@ -4288,7 +4288,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_OPT)
&& qemu_log_in_addr_range(tb->pc))) {
&& qemu_log_in_addr_range(pc_start))) {
FILE *logfile = qemu_log_trylock();
if (logfile) {
fprintf(logfile, "OP after optimization and liveness analysis:\n");
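The new pc_start argument exists because, with TARGET_TB_PCREL, tb->pc may not exist at all, yet the -d op address filtering above still needs the entry PC; the caller must now pass it in. The call site is roughly (identifier names approximate, sketch only):

/*
 *     gen_code_size = tcg_gen_code(tcg_ctx, tb, pc);
 *
 * where pc is the virtual PC the TB is being generated for.
 */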


@ -65,7 +65,7 @@ static void trace_event_synchronize_vcpu_state_dynamic(
{
bitmap_copy(vcpu->trace_dstate, vcpu->trace_dstate_delayed,
CPU_TRACE_DSTATE_MAX_EVENTS);
cpu_tb_jmp_cache_clear(vcpu);
tcg_flush_jmp_cache(vcpu);
}
void trace_event_set_vcpu_state_dynamic(CPUState *vcpu,