*: Delete checks for old host definitions

tcg/loongarch64: Generate LSX instructions
 fpu: Add conversions between bfloat16 and [u]int8
 fpu: Handle m68k extended precision denormals properly
 accel/tcg: Improve cputlb i/o organization
 accel/tcg: Simplify tlb_plugin_lookup
 accel/tcg: Remove false-negative halted assertion
 tcg: Add gvec compare with immediate and scalar operand
 tcg/aarch64: Emit BTI insns at jump landing pads
 -----BEGIN PGP SIGNATURE-----
 
 iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmUF4VIdHHJpY2hhcmQu
 aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV8wOwf+I9qNus2kV3yQxpuU
 2hqYuLXvH96l9vbqaoyx7hyyJTtrqytLGCMPmQKUdtBGtO6z7PnLNDiooGcbO+gw
 2gdfw3Q//JZUTdx+ZSujUksV0F96Tqu0zi4TdJUPNIwhCrh0K8VjiftfPfbynRtz
 KhQ1lNeO/QzcAgzKiun2NyqdPiYDmNuEIS/jYedQwQweRp/xQJ4/x8DmhGf/OiD4
 rGAcdslN+RenqgFACcJ2A1vxUGMeQv5g/Cn82FgTk0cmgcfAODMnC+WnOm8ruQdT
 snluvnh/2/r8jIhx3frKDKGtaKHCPhoCS7GNK48qejxaybvv3CJQ4qsjRIBKVrVM
 cIrsSw==
 =cTgD
 -----END PGP SIGNATURE-----

Merge tag 'pull-tcg-20230915-2' of https://gitlab.com/rth7680/qemu into staging

*: Delete checks for old host definitions
tcg/loongarch64: Generate LSX instructions
fpu: Add conversions between bfloat16 and [u]int8
fpu: Handle m68k extended precision denormals properly
accel/tcg: Improve cputlb i/o organization
accel/tcg: Simplify tlb_plugin_lookup
accel/tcg: Remove false-negative halted assertion
tcg: Add gvec compare with immediate and scalar operand
tcg/aarch64: Emit BTI insns at jump landing pads

[Resolved conflict between CPUINFO_PMULL and CPUINFO_BTI.
--Stefan]

* tag 'pull-tcg-20230915-2' of https://gitlab.com/rth7680/qemu: (39 commits)
  tcg: Map code_gen_buffer with PROT_BTI
  tcg/aarch64: Emit BTI insns at jump landing pads
  util/cpuinfo-aarch64: Add CPUINFO_BTI
  tcg: Add tcg_out_tb_start backend hook
  fpu: Handle m68k extended precision denormals properly
  fpu: Add conversions between bfloat16 and [u]int8
  accel/tcg: Introduce do_st16_mmio_leN
  accel/tcg: Introduce do_ld16_mmio_beN
  accel/tcg: Merge io_writex into do_st_mmio_leN
  accel/tcg: Merge io_readx into do_ld_mmio_beN
  accel/tcg: Replace direct use of io_readx/io_writex in do_{ld,st}_1
  accel/tcg: Merge cpu_transaction_failed into io_failed
  plugin: Simplify struct qemu_plugin_hwaddr
  accel/tcg: Use CPUTLBEntryFull.phys_addr in io_failed
  accel/tcg: Split out io_prepare and io_failed
  accel/tcg: Simplify tlb_plugin_lookup
  target/arm: Use tcg_gen_gvec_cmpi for compare vs 0
  tcg: Add gvec compare with immediate and scalar operand
  tcg/loongarch64: Implement 128-bit load & store
  tcg/loongarch64: Lower rotli_vec to vrotri
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Branch: master
Stefan Hajnoczi 2023-09-19 13:20:54 -04:00
commit d7754940d7
39 changed files with 7419 additions and 393 deletions


@ -1193,6 +1193,7 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
write_flags = read_flags;
if (is_ram) {
iotlb = memory_region_get_ram_addr(section->mr) + xlat;
assert(!(iotlb & ~TARGET_PAGE_MASK));
/*
* Computing is_clean is expensive; avoid all that unless
* the page is actually writable.
@ -1255,16 +1256,18 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
/* refill the tlb */
/*
* At this point iotlb contains a physical section number in the lower
* TARGET_PAGE_BITS, and either
* + the ram_addr_t of the page base of the target RAM (RAM)
* + the offset within section->mr of the page base (I/O, ROMD)
* When memory region is ram, iotlb contains a TARGET_PAGE_BITS
* aligned ram_addr_t of the page base of the target RAM.
* Otherwise, iotlb contains
* - a physical section number in the lower TARGET_PAGE_BITS
* - the offset within section->mr of the page base (I/O, ROMD) with the
* TARGET_PAGE_BITS masked off.
* We subtract addr_page (which is page aligned and thus won't
* disturb the low bits) to give an offset which can be added to the
* (non-page-aligned) vaddr of the eventual memory access to get
* the MemoryRegion offset for the access. Note that the vaddr we
* subtract here is that of the page base, and not the same as the
* vaddr we add back in io_readx()/io_writex()/get_page_addr_code().
* vaddr we add back in io_prepare()/get_page_addr_code().
*/
desc->fulltlb[index] = *full;
full = &desc->fulltlb[index];
@ -1347,116 +1350,41 @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
mmu_idx, retaddr);
}
static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
vaddr addr, unsigned size,
MMUAccessType access_type,
int mmu_idx, MemTxAttrs attrs,
MemTxResult response,
uintptr_t retaddr)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
if (!cpu->ignore_memory_transaction_failures &&
cc->tcg_ops->do_transaction_failed) {
cc->tcg_ops->do_transaction_failed(cpu, physaddr, addr, size,
access_type, mmu_idx, attrs,
response, retaddr);
}
}
/*
* Save a potentially trashed CPUTLBEntryFull for later lookup by plugin.
* This is read by tlb_plugin_lookup if the fulltlb entry doesn't match
* because of the side effect of io_writex changing memory layout.
*/
static void save_iotlb_data(CPUState *cs, MemoryRegionSection *section,
hwaddr mr_offset)
{
#ifdef CONFIG_PLUGIN
SavedIOTLB *saved = &cs->saved_iotlb;
saved->section = section;
saved->mr_offset = mr_offset;
#endif
}
static uint64_t io_readx(CPUArchState *env, CPUTLBEntryFull *full,
int mmu_idx, vaddr addr, uintptr_t retaddr,
MMUAccessType access_type, MemOp op)
static MemoryRegionSection *
io_prepare(hwaddr *out_offset, CPUArchState *env, hwaddr xlat,
MemTxAttrs attrs, vaddr addr, uintptr_t retaddr)
{
CPUState *cpu = env_cpu(env);
hwaddr mr_offset;
MemoryRegionSection *section;
MemoryRegion *mr;
uint64_t val;
MemTxResult r;
hwaddr mr_offset;
section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
mr = section->mr;
mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
section = iotlb_to_section(cpu, xlat, attrs);
mr_offset = (xlat & TARGET_PAGE_MASK) + addr;
cpu->mem_io_pc = retaddr;
if (!cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
/*
* The memory_region_dispatch may trigger a flush/resize
* so for plugins we save the iotlb_data just in case.
*/
save_iotlb_data(cpu, section, mr_offset);
{
QEMU_IOTHREAD_LOCK_GUARD();
r = memory_region_dispatch_read(mr, mr_offset, &val, op, full->attrs);
}
if (r != MEMTX_OK) {
hwaddr physaddr = mr_offset +
section->offset_within_address_space -
section->offset_within_region;
cpu_transaction_failed(cpu, physaddr, addr, memop_size(op), access_type,
mmu_idx, full->attrs, r, retaddr);
}
return val;
*out_offset = mr_offset;
return section;
}
static void io_writex(CPUArchState *env, CPUTLBEntryFull *full,
int mmu_idx, uint64_t val, vaddr addr,
uintptr_t retaddr, MemOp op)
static void io_failed(CPUArchState *env, CPUTLBEntryFull *full, vaddr addr,
unsigned size, MMUAccessType access_type, int mmu_idx,
MemTxResult response, uintptr_t retaddr)
{
CPUState *cpu = env_cpu(env);
hwaddr mr_offset;
MemoryRegionSection *section;
MemoryRegion *mr;
MemTxResult r;
section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
mr = section->mr;
mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
if (!cpu->can_do_io) {
cpu_io_recompile(cpu, retaddr);
}
cpu->mem_io_pc = retaddr;
if (!cpu->ignore_memory_transaction_failures) {
CPUClass *cc = CPU_GET_CLASS(cpu);
/*
* The memory_region_dispatch may trigger a flush/resize
* so for plugins we save the iotlb_data just in case.
*/
save_iotlb_data(cpu, section, mr_offset);
if (cc->tcg_ops->do_transaction_failed) {
hwaddr physaddr = full->phys_addr | (addr & ~TARGET_PAGE_MASK);
{
QEMU_IOTHREAD_LOCK_GUARD();
r = memory_region_dispatch_write(mr, mr_offset, val, op, full->attrs);
}
if (r != MEMTX_OK) {
hwaddr physaddr = mr_offset +
section->offset_within_address_space -
section->offset_within_region;
cpu_transaction_failed(cpu, physaddr, addr, memop_size(op),
MMU_DATA_STORE, mmu_idx, full->attrs, r,
retaddr);
cc->tcg_ops->do_transaction_failed(cpu, physaddr, addr, size,
access_type, mmu_idx,
full->attrs, response, retaddr);
}
}
}
@ -1726,45 +1654,41 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, vaddr addr,
* in the softmmu lookup code (or helper). We don't handle re-fills or
* checking the victim table. This is purely informational.
*
* This almost never fails as the memory access being instrumented
* should have just filled the TLB. The one corner case is io_writex
* which can cause TLB flushes and potential resizing of the TLBs
* losing the information we need. In those cases we need to recover
* data from a copy of the CPUTLBEntryFull. As long as this always occurs
* from the same thread (which a mem callback will be) this is safe.
* The one corner case is i/o write, which can cause changes to the
* address space. Those changes, and the corresponding tlb flush,
* should be delayed until the next TB, so even then this ought not fail.
* But check, Just in Case.
*/
bool tlb_plugin_lookup(CPUState *cpu, vaddr addr, int mmu_idx,
bool is_store, struct qemu_plugin_hwaddr *data)
{
CPUArchState *env = cpu->env_ptr;
CPUTLBEntry *tlbe = tlb_entry(env, mmu_idx, addr);
uintptr_t index = tlb_index(env, mmu_idx, addr);
uint64_t tlb_addr = is_store ? tlb_addr_write(tlbe) : tlbe->addr_read;
MMUAccessType access_type = is_store ? MMU_DATA_STORE : MMU_DATA_LOAD;
uint64_t tlb_addr = tlb_read_idx(tlbe, access_type);
CPUTLBEntryFull *full;
if (likely(tlb_hit(tlb_addr, addr))) {
/* We must have an iotlb entry for MMIO */
if (tlb_addr & TLB_MMIO) {
CPUTLBEntryFull *full;
full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
data->is_io = true;
data->v.io.section =
iotlb_to_section(cpu, full->xlat_section, full->attrs);
data->v.io.offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
} else {
data->is_io = false;
data->v.ram.hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
}
return true;
} else {
SavedIOTLB *saved = &cpu->saved_iotlb;
data->is_io = true;
data->v.io.section = saved->section;
data->v.io.offset = saved->mr_offset;
return true;
if (unlikely(!tlb_hit(tlb_addr, addr))) {
return false;
}
}
full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
data->phys_addr = full->phys_addr | (addr & ~TARGET_PAGE_MASK);
/* We must have an iotlb entry for MMIO */
if (tlb_addr & TLB_MMIO) {
MemoryRegionSection *section =
iotlb_to_section(cpu, full->xlat_section & ~TARGET_PAGE_MASK,
full->attrs);
data->is_io = true;
data->mr = section->mr;
} else {
data->is_io = false;
data->mr = NULL;
}
return true;
}
#endif
/*
@ -2084,45 +2008,88 @@ static void *atomic_mmu_lookup(CPUArchState *env, vaddr addr, MemOpIdx oi,
* Load @size bytes from @addr, which is memory-mapped i/o.
* The bytes are concatenated in big-endian order with @ret_be.
*/
static uint64_t int_ld_mmio_beN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t ret_be, vaddr addr, int size,
int mmu_idx, MMUAccessType type, uintptr_t ra,
MemoryRegion *mr, hwaddr mr_offset)
{
do {
MemOp this_mop;
unsigned this_size;
uint64_t val;
MemTxResult r;
/* Read aligned pieces up to 8 bytes. */
this_mop = ctz32(size | (int)addr | 8);
this_size = 1 << this_mop;
this_mop |= MO_BE;
r = memory_region_dispatch_read(mr, mr_offset, &val,
this_mop, full->attrs);
if (unlikely(r != MEMTX_OK)) {
io_failed(env, full, addr, this_size, type, mmu_idx, r, ra);
}
if (this_size == 8) {
return val;
}
ret_be = (ret_be << (this_size * 8)) | val;
addr += this_size;
mr_offset += this_size;
size -= this_size;
} while (size);
return ret_be;
}
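The ctz32(size | addr | 8) selection above always picks the largest naturally aligned piece, capped at 8 bytes, so every MMIO dispatch stays aligned. An illustrative standalone check of that selection (plain C using __builtin_ctz in place of QEMU's ctz32; not part of the patch):

#include <assert.h>

/* Mirror of the piece-size computation used by int_ld_mmio_beN above. */
static int piece_size(int size, int addr)
{
    return 1 << __builtin_ctz(size | addr | 8);
}

int main(void)
{
    assert(piece_size(5, 3) == 1);   /* unaligned start: a single byte first */
    assert(piece_size(4, 4) == 4);   /* then a 4-byte aligned access finishes it */
    assert(piece_size(16, 0) == 8);  /* never more than 8 bytes per dispatch */
    return 0;
}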
static uint64_t do_ld_mmio_beN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t ret_be, vaddr addr, int size,
int mmu_idx, MMUAccessType type, uintptr_t ra)
{
uint64_t t;
MemoryRegionSection *section;
MemoryRegion *mr;
hwaddr mr_offset;
MemTxAttrs attrs;
uint64_t ret;
tcg_debug_assert(size > 0 && size <= 8);
do {
/* Read aligned pieces up to 8 bytes. */
switch ((size | (int)addr) & 7) {
case 1:
case 3:
case 5:
case 7:
t = io_readx(env, full, mmu_idx, addr, ra, type, MO_UB);
ret_be = (ret_be << 8) | t;
size -= 1;
addr += 1;
break;
case 2:
case 6:
t = io_readx(env, full, mmu_idx, addr, ra, type, MO_BEUW);
ret_be = (ret_be << 16) | t;
size -= 2;
addr += 2;
break;
case 4:
t = io_readx(env, full, mmu_idx, addr, ra, type, MO_BEUL);
ret_be = (ret_be << 32) | t;
size -= 4;
addr += 4;
break;
case 0:
return io_readx(env, full, mmu_idx, addr, ra, type, MO_BEUQ);
default:
qemu_build_not_reached();
}
} while (size);
return ret_be;
attrs = full->attrs;
section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
mr = section->mr;
qemu_mutex_lock_iothread();
ret = int_ld_mmio_beN(env, full, ret_be, addr, size, mmu_idx,
type, ra, mr, mr_offset);
qemu_mutex_unlock_iothread();
return ret;
}
static Int128 do_ld16_mmio_beN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t ret_be, vaddr addr, int size,
int mmu_idx, uintptr_t ra)
{
MemoryRegionSection *section;
MemoryRegion *mr;
hwaddr mr_offset;
MemTxAttrs attrs;
uint64_t a, b;
tcg_debug_assert(size > 8 && size <= 16);
attrs = full->attrs;
section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
mr = section->mr;
qemu_mutex_lock_iothread();
a = int_ld_mmio_beN(env, full, ret_be, addr, size - 8, mmu_idx,
MMU_DATA_LOAD, ra, mr, mr_offset);
b = int_ld_mmio_beN(env, full, ret_be, addr + size - 8, 8, mmu_idx,
MMU_DATA_LOAD, ra, mr, mr_offset + size - 8);
qemu_mutex_unlock_iothread();
return int128_make128(b, a);
}
/**
@ -2267,7 +2234,6 @@ static uint64_t do_ld_beN(CPUArchState *env, MMULookupPageData *p,
unsigned tmp, half_size;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
return do_ld_mmio_beN(env, p->full, ret_be, p->addr, p->size,
mmu_idx, type, ra);
}
@ -2318,12 +2284,7 @@ static Int128 do_ld16_beN(CPUArchState *env, MMULookupPageData *p,
MemOp atom;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
a = do_ld_mmio_beN(env, p->full, a, p->addr, size - 8,
mmu_idx, MMU_DATA_LOAD, ra);
b = do_ld_mmio_beN(env, p->full, 0, p->addr + 8, 8,
mmu_idx, MMU_DATA_LOAD, ra);
return int128_make128(b, a);
return do_ld16_mmio_beN(env, p->full, a, p->addr, size, mmu_idx, ra);
}
/*
@ -2368,7 +2329,7 @@ static uint8_t do_ld_1(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
MMUAccessType type, uintptr_t ra)
{
if (unlikely(p->flags & TLB_MMIO)) {
return io_readx(env, p->full, mmu_idx, p->addr, ra, type, MO_UB);
return do_ld_mmio_beN(env, p->full, 0, p->addr, 1, mmu_idx, type, ra);
} else {
return *(uint8_t *)p->haddr;
}
@ -2380,7 +2341,6 @@ static uint16_t do_ld_2(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
uint16_t ret;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
ret = do_ld_mmio_beN(env, p->full, 0, p->addr, 2, mmu_idx, type, ra);
if ((memop & MO_BSWAP) == MO_LE) {
ret = bswap16(ret);
@ -2401,7 +2361,6 @@ static uint32_t do_ld_4(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
uint32_t ret;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
ret = do_ld_mmio_beN(env, p->full, 0, p->addr, 4, mmu_idx, type, ra);
if ((memop & MO_BSWAP) == MO_LE) {
ret = bswap32(ret);
@ -2422,7 +2381,6 @@ static uint64_t do_ld_8(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
uint64_t ret;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
ret = do_ld_mmio_beN(env, p->full, 0, p->addr, 8, mmu_idx, type, ra);
if ((memop & MO_BSWAP) == MO_LE) {
ret = bswap64(ret);
@ -2581,12 +2539,8 @@ static Int128 do_ld16_mmu(CPUArchState *env, vaddr addr,
crosspage = mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD, &l);
if (likely(!crosspage)) {
if (unlikely(l.page[0].flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
a = do_ld_mmio_beN(env, l.page[0].full, 0, addr, 8,
l.mmu_idx, MMU_DATA_LOAD, ra);
b = do_ld_mmio_beN(env, l.page[0].full, 0, addr + 8, 8,
l.mmu_idx, MMU_DATA_LOAD, ra);
ret = int128_make128(b, a);
ret = do_ld16_mmio_beN(env, l.page[0].full, 0, addr, 16,
l.mmu_idx, ra);
if ((l.memop & MO_BSWAP) == MO_LE) {
ret = bswap128(ret);
}
@ -2727,46 +2681,88 @@ Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
* The bytes to store are extracted in little-endian order from @val_le;
* return the bytes of @val_le beyond @p->size that have not been stored.
*/
static uint64_t int_st_mmio_leN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t val_le, vaddr addr, int size,
int mmu_idx, uintptr_t ra,
MemoryRegion *mr, hwaddr mr_offset)
{
do {
MemOp this_mop;
unsigned this_size;
MemTxResult r;
/* Store aligned pieces up to 8 bytes. */
this_mop = ctz32(size | (int)addr | 8);
this_size = 1 << this_mop;
this_mop |= MO_LE;
r = memory_region_dispatch_write(mr, mr_offset, val_le,
this_mop, full->attrs);
if (unlikely(r != MEMTX_OK)) {
io_failed(env, full, addr, this_size, MMU_DATA_STORE,
mmu_idx, r, ra);
}
if (this_size == 8) {
return 0;
}
val_le >>= this_size * 8;
addr += this_size;
mr_offset += this_size;
size -= this_size;
} while (size);
return val_le;
}
static uint64_t do_st_mmio_leN(CPUArchState *env, CPUTLBEntryFull *full,
uint64_t val_le, vaddr addr, int size,
int mmu_idx, uintptr_t ra)
{
MemoryRegionSection *section;
hwaddr mr_offset;
MemoryRegion *mr;
MemTxAttrs attrs;
uint64_t ret;
tcg_debug_assert(size > 0 && size <= 8);
do {
/* Store aligned pieces up to 8 bytes. */
switch ((size | (int)addr) & 7) {
case 1:
case 3:
case 5:
case 7:
io_writex(env, full, mmu_idx, val_le, addr, ra, MO_UB);
val_le >>= 8;
size -= 1;
addr += 1;
break;
case 2:
case 6:
io_writex(env, full, mmu_idx, val_le, addr, ra, MO_LEUW);
val_le >>= 16;
size -= 2;
addr += 2;
break;
case 4:
io_writex(env, full, mmu_idx, val_le, addr, ra, MO_LEUL);
val_le >>= 32;
size -= 4;
addr += 4;
break;
case 0:
io_writex(env, full, mmu_idx, val_le, addr, ra, MO_LEUQ);
return 0;
default:
qemu_build_not_reached();
}
} while (size);
attrs = full->attrs;
section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
mr = section->mr;
return val_le;
qemu_mutex_lock_iothread();
ret = int_st_mmio_leN(env, full, val_le, addr, size, mmu_idx,
ra, mr, mr_offset);
qemu_mutex_unlock_iothread();
return ret;
}
static uint64_t do_st16_mmio_leN(CPUArchState *env, CPUTLBEntryFull *full,
Int128 val_le, vaddr addr, int size,
int mmu_idx, uintptr_t ra)
{
MemoryRegionSection *section;
MemoryRegion *mr;
hwaddr mr_offset;
MemTxAttrs attrs;
uint64_t ret;
tcg_debug_assert(size > 8 && size <= 16);
attrs = full->attrs;
section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
mr = section->mr;
qemu_mutex_lock_iothread();
int_st_mmio_leN(env, full, int128_getlo(val_le), addr, 8,
mmu_idx, ra, mr, mr_offset);
ret = int_st_mmio_leN(env, full, int128_gethi(val_le), addr + 8,
size - 8, mmu_idx, ra, mr, mr_offset + 8);
qemu_mutex_unlock_iothread();
return ret;
}
/*
@ -2780,7 +2776,6 @@ static uint64_t do_st_leN(CPUArchState *env, MMULookupPageData *p,
unsigned tmp, half_size;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
return do_st_mmio_leN(env, p->full, val_le, p->addr,
p->size, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
@ -2835,11 +2830,8 @@ static uint64_t do_st16_leN(CPUArchState *env, MMULookupPageData *p,
MemOp atom;
if (unlikely(p->flags & TLB_MMIO)) {
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, p->full, int128_getlo(val_le),
p->addr, 8, mmu_idx, ra);
return do_st_mmio_leN(env, p->full, int128_gethi(val_le),
p->addr + 8, size - 8, mmu_idx, ra);
return do_st16_mmio_leN(env, p->full, val_le, p->addr,
size, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
return int128_gethi(val_le) >> ((size - 8) * 8);
}
@ -2883,7 +2875,7 @@ static void do_st_1(CPUArchState *env, MMULookupPageData *p, uint8_t val,
int mmu_idx, uintptr_t ra)
{
if (unlikely(p->flags & TLB_MMIO)) {
io_writex(env, p->full, mmu_idx, val, p->addr, ra, MO_UB);
do_st_mmio_leN(env, p->full, val, p->addr, 1, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
/* nothing */
} else {
@ -2898,7 +2890,6 @@ static void do_st_2(CPUArchState *env, MMULookupPageData *p, uint16_t val,
if ((memop & MO_BSWAP) != MO_LE) {
val = bswap16(val);
}
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, p->full, val, p->addr, 2, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
/* nothing */
@ -2918,7 +2909,6 @@ static void do_st_4(CPUArchState *env, MMULookupPageData *p, uint32_t val,
if ((memop & MO_BSWAP) != MO_LE) {
val = bswap32(val);
}
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, p->full, val, p->addr, 4, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
/* nothing */
@ -2938,7 +2928,6 @@ static void do_st_8(CPUArchState *env, MMULookupPageData *p, uint64_t val,
if ((memop & MO_BSWAP) != MO_LE) {
val = bswap64(val);
}
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, p->full, val, p->addr, 8, mmu_idx, ra);
} else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
/* nothing */
@ -3066,11 +3055,7 @@ static void do_st16_mmu(CPUArchState *env, vaddr addr, Int128 val,
if ((l.memop & MO_BSWAP) != MO_LE) {
val = bswap128(val);
}
a = int128_getlo(val);
b = int128_gethi(val);
QEMU_IOTHREAD_LOCK_GUARD();
do_st_mmio_leN(env, l.page[0].full, a, addr, 8, l.mmu_idx, ra);
do_st_mmio_leN(env, l.page[0].full, b, addr + 8, 8, l.mmu_idx, ra);
do_st16_mmio_leN(env, l.page[0].full, val, addr, 16, l.mmu_idx, ra);
} else if (unlikely(l.page[0].flags & TLB_DISCARD_WRITE)) {
/* nothing */
} else {


@ -100,14 +100,9 @@ static void *mttcg_cpu_thread_fn(void *arg)
break;
case EXCP_HALTED:
/*
* during start-up the vCPU is reset and the thread is
* kicked several times. If we don't ensure we go back
* to sleep in the halted state we won't cleanly
* start-up when the vCPU is enabled.
*
* cpu->halted should ensure we sleep in wait_io_event
* Usually cpu->halted is set, but may have already been
* reset by another thread by the time we arrive here.
*/
g_assert(cpu->halted);
break;
case EXCP_ATOMIC:
qemu_mutex_unlock_iothread();


@ -1042,6 +1042,32 @@ DO_CMP2(64)
#undef DO_CMP1
#undef DO_CMP2
#define DO_CMP1(NAME, TYPE, OP) \
void HELPER(NAME)(void *d, void *a, uint64_t b64, uint32_t desc) \
{ \
intptr_t oprsz = simd_oprsz(desc); \
TYPE inv = simd_data(desc), b = b64; \
for (intptr_t i = 0; i < oprsz; i += sizeof(TYPE)) { \
*(TYPE *)(d + i) = -((*(TYPE *)(a + i) OP b) ^ inv); \
} \
clear_high(d, oprsz, desc); \
}
#define DO_CMP2(SZ) \
DO_CMP1(gvec_eqs##SZ, uint##SZ##_t, ==) \
DO_CMP1(gvec_lts##SZ, int##SZ##_t, <) \
DO_CMP1(gvec_les##SZ, int##SZ##_t, <=) \
DO_CMP1(gvec_ltus##SZ, uint##SZ##_t, <) \
DO_CMP1(gvec_leus##SZ, uint##SZ##_t, <=)
DO_CMP2(8)
DO_CMP2(16)
DO_CMP2(32)
DO_CMP2(64)
#undef DO_CMP1
#undef DO_CMP2
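The -((a OP b) ^ inv) expression above yields an all-ones element when the predicate holds and zero otherwise, with the inv flag (taken from simd_data) flipping the sense so one helper can also serve the negated condition. An illustrative standalone check (not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint8_t a = 5, b = 5;
    assert((uint8_t)-((a == b) ^ 0) == 0xff);  /* predicate true, inv = 0: all ones */
    assert((uint8_t)-((a == b) ^ 1) == 0x00);  /* inv = 1 flips it: acts like "!=" */
    assert((uint8_t)-((a <  b) ^ 0) == 0x00);  /* predicate false: zero */
    return 0;
}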
void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
{
intptr_t oprsz = simd_oprsz(desc);


@ -297,4 +297,29 @@ DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_eqs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_eqs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_eqs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_eqs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_lts8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_lts16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_lts32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_lts64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_les8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_les16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_les32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_les64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_ltus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_ltus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_ltus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_ltus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_leus8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_leus16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_leus32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_leus64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_5(gvec_bitsel, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)


@ -118,7 +118,8 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
} else {
int shift = frac_normalize(p);
p->cls = float_class_normal;
p->exp = fmt->frac_shift - fmt->exp_bias - shift + 1;
p->exp = fmt->frac_shift - fmt->exp_bias
- shift + !fmt->m68k_denormal;
}
} else if (likely(p->exp < fmt->exp_max) || fmt->arm_althp) {
p->cls = float_class_normal;
@ -256,7 +257,7 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
is_tiny = !frac_addi(&discard, p, inc);
}
frac_shrjam(p, 1 - exp);
frac_shrjam(p, !fmt->m68k_denormal - exp);
if (p->frac_lo & round_mask) {
/* Need to recompute round-to-even/round-to-odd. */
@ -287,7 +288,7 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
p->frac_lo &= ~round_mask;
}
exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) != 0;
exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) && !fmt->m68k_denormal;
frac_shr(p, frac_shift);
if (is_tiny && (flags & float_flag_inexact)) {


@ -517,6 +517,7 @@ typedef struct {
* round_mask: bits below lsb which must be rounded
* The following optional modifiers are available:
* arm_althp: handle ARM Alternative Half Precision
* m68k_denormal: explicit integer bit for extended precision may be 1
*/
typedef struct {
int exp_size;
@ -526,6 +527,7 @@ typedef struct {
int frac_size;
int frac_shift;
bool arm_althp;
bool m68k_denormal;
uint64_t round_mask;
} FloatFmt;
@ -576,7 +578,12 @@ static const FloatFmt float128_params = {
static const FloatFmt floatx80_params[3] = {
[floatx80_precision_s] = { FLOATX80_PARAMS(23) },
[floatx80_precision_d] = { FLOATX80_PARAMS(52) },
[floatx80_precision_x] = { FLOATX80_PARAMS(64) },
[floatx80_precision_x] = {
FLOATX80_PARAMS(64),
#ifdef TARGET_M68K
.m68k_denormal = true,
#endif
},
};
/* Unpack a float to parts, but do not canonicalize. */
@ -3126,6 +3133,15 @@ int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}
int8_t bfloat16_to_int8_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
float_status *s)
{
FloatParts64 p;
bfloat16_unpack_canonical(&p, a, s);
return parts_float_to_sint(&p, rmode, scale, INT8_MIN, INT8_MAX, s);
}
int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
float_status *s)
{
@ -3392,6 +3408,11 @@ int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *s)
return floatx80_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
int8_t bfloat16_to_int8(bfloat16 a, float_status *s)
{
return bfloat16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}
int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
@ -3407,6 +3428,11 @@ int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
int8_t bfloat16_to_int8_round_to_zero(bfloat16 a, float_status *s)
{
return bfloat16_to_int8_scalbn(a, float_round_to_zero, 0, s);
}
int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
@ -3534,6 +3560,15 @@ uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
}
uint8_t bfloat16_to_uint8_scalbn(bfloat16 a, FloatRoundMode rmode,
int scale, float_status *s)
{
FloatParts64 p;
bfloat16_unpack_canonical(&p, a, s);
return parts_float_to_uint(&p, rmode, scale, UINT8_MAX, s);
}
uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
int scale, float_status *s)
{
@ -3759,6 +3794,11 @@ Int128 float128_to_uint128_round_to_zero(float128 a, float_status *s)
return float128_to_uint128_scalbn(a, float_round_to_zero, 0, s);
}
uint8_t bfloat16_to_uint8(bfloat16 a, float_status *s)
{
return bfloat16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}
uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
@ -3774,6 +3814,11 @@ uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}
uint8_t bfloat16_to_uint8_round_to_zero(bfloat16 a, float_status *s)
{
return bfloat16_to_uint8_scalbn(a, float_round_to_zero, 0, s);
}
uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
@ -3929,6 +3974,11 @@ bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
return int64_to_bfloat16_scalbn(a, scale, status);
}
bfloat16 int8_to_bfloat16_scalbn(int8_t a, int scale, float_status *status)
{
return int64_to_bfloat16_scalbn(a, scale, status);
}
bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
return int64_to_bfloat16_scalbn(a, 0, status);
@ -3944,6 +3994,11 @@ bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
return int64_to_bfloat16_scalbn(a, 0, status);
}
bfloat16 int8_to_bfloat16(int8_t a, float_status *status)
{
return int64_to_bfloat16_scalbn(a, 0, status);
}
float128 int128_to_float128(Int128 a, float_status *status)
{
FloatParts128 p = { };
@ -4139,6 +4194,11 @@ bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
return uint64_to_bfloat16_scalbn(a, scale, status);
}
bfloat16 uint8_to_bfloat16_scalbn(uint8_t a, int scale, float_status *status)
{
return uint64_to_bfloat16_scalbn(a, scale, status);
}
bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
return uint64_to_bfloat16_scalbn(a, 0, status);
@ -4154,6 +4214,11 @@ bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
return uint64_to_bfloat16_scalbn(a, 0, status);
}
bfloat16 uint8_to_bfloat16(uint8_t a, float_status *status)
{
return uint64_to_bfloat16_scalbn(a, 0, status);
}
float128 uint64_to_float128(uint64_t a, float_status *status)
{
FloatParts128 p;


@ -11,6 +11,7 @@
#define CPUINFO_LSE2 (1u << 2)
#define CPUINFO_AES (1u << 3)
#define CPUINFO_PMULL (1u << 4)
#define CPUINFO_BTI (1u << 5)
/* Initialized with a constructor. */
extern unsigned cpuinfo;


@ -100,12 +100,12 @@
typedef struct CPUTLBEntryFull {
/*
* @xlat_section contains:
* - in the lower TARGET_PAGE_BITS, a physical section number
* - with the lower TARGET_PAGE_BITS masked off, an offset which
* must be added to the virtual address to obtain:
* + the ram_addr_t of the target RAM (if the physical section
* number is PHYS_SECTION_NOTDIRTY or PHYS_SECTION_ROM)
* + the offset within the target MemoryRegion (otherwise)
* - For ram, an offset which must be added to the virtual address
* to obtain the ram_addr_t of the target RAM
* - For other memory regions,
* + in the lower TARGET_PAGE_BITS, the physical section number
* + with the TARGET_PAGE_BITS masked off, the offset within
* the target MemoryRegion
*/
hwaddr xlat_section;


@ -111,8 +111,7 @@ static inline int thunk_type_size(const argtype *type_ptr, int is_host)
if (is_host) {
#if defined(HOST_X86_64)
return 8;
#elif defined(HOST_ALPHA) || defined(HOST_IA64) || defined(HOST_MIPS) || \
defined(HOST_PARISC) || defined(HOST_SPARC64)
#elif defined(HOST_MIPS) || defined(HOST_SPARC64)
return 4;
#elif defined(HOST_PPC)
return sizeof(void *);


@ -366,6 +366,8 @@ float32 bfloat16_to_float32(bfloat16, float_status *status);
bfloat16 float64_to_bfloat16(float64 a, float_status *status);
float64 bfloat16_to_float64(bfloat16 a, float_status *status);
int8_t bfloat16_to_int8_scalbn(bfloat16, FloatRoundMode,
int, float_status *status);
int16_t bfloat16_to_int16_scalbn(bfloat16, FloatRoundMode,
int, float_status *status);
int32_t bfloat16_to_int32_scalbn(bfloat16, FloatRoundMode,
@ -373,14 +375,18 @@ int32_t bfloat16_to_int32_scalbn(bfloat16, FloatRoundMode,
int64_t bfloat16_to_int64_scalbn(bfloat16, FloatRoundMode,
int, float_status *status);
int8_t bfloat16_to_int8(bfloat16, float_status *status);
int16_t bfloat16_to_int16(bfloat16, float_status *status);
int32_t bfloat16_to_int32(bfloat16, float_status *status);
int64_t bfloat16_to_int64(bfloat16, float_status *status);
int8_t bfloat16_to_int8_round_to_zero(bfloat16, float_status *status);
int16_t bfloat16_to_int16_round_to_zero(bfloat16, float_status *status);
int32_t bfloat16_to_int32_round_to_zero(bfloat16, float_status *status);
int64_t bfloat16_to_int64_round_to_zero(bfloat16, float_status *status);
uint8_t bfloat16_to_uint8_scalbn(bfloat16 a, FloatRoundMode,
int, float_status *status);
uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode,
int, float_status *status);
uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode,
@ -388,24 +394,30 @@ uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode,
uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode,
int, float_status *status);
uint8_t bfloat16_to_uint8(bfloat16 a, float_status *status);
uint16_t bfloat16_to_uint16(bfloat16 a, float_status *status);
uint32_t bfloat16_to_uint32(bfloat16 a, float_status *status);
uint64_t bfloat16_to_uint64(bfloat16 a, float_status *status);
uint8_t bfloat16_to_uint8_round_to_zero(bfloat16 a, float_status *status);
uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *status);
uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *status);
uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *status);
bfloat16 int8_to_bfloat16_scalbn(int8_t a, int, float_status *status);
bfloat16 int16_to_bfloat16_scalbn(int16_t a, int, float_status *status);
bfloat16 int32_to_bfloat16_scalbn(int32_t a, int, float_status *status);
bfloat16 int64_to_bfloat16_scalbn(int64_t a, int, float_status *status);
bfloat16 uint8_to_bfloat16_scalbn(uint8_t a, int, float_status *status);
bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int, float_status *status);
bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int, float_status *status);
bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int, float_status *status);
bfloat16 int8_to_bfloat16(int8_t a, float_status *status);
bfloat16 int16_to_bfloat16(int16_t a, float_status *status);
bfloat16 int32_to_bfloat16(int32_t a, float_status *status);
bfloat16 int64_to_bfloat16(int64_t a, float_status *status);
bfloat16 uint8_to_bfloat16(uint8_t a, float_status *status);
bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status);
bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status);
bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status);
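A minimal usage sketch of the [u]int8 conversions declared above, assuming the QEMU tree's "fpu/softfloat.h" and a default-initialized float_status (illustrative, not code from this patch):

#include "fpu/softfloat.h"

static void bf16_int8_demo(void)
{
    float_status st = { };                          /* default: round to nearest even */
    bfloat16 h = int8_to_bfloat16(-5, &st);         /* exact; bfloat16 keeps 8 significand bits */
    int8_t  i = bfloat16_to_int8(h, &st);           /* -5 */
    uint8_t u = bfloat16_to_uint8_round_to_zero(uint8_to_bfloat16(200, &st), &st); /* 200 */
    (void)i;
    (void)u;
}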


@ -227,17 +227,6 @@ struct CPUWatchpoint {
QTAILQ_ENTRY(CPUWatchpoint) entry;
};
#ifdef CONFIG_PLUGIN
/*
* For plugins we sometime need to save the resolved iotlb data before
* the memory regions get moved around by io_writex.
*/
typedef struct SavedIOTLB {
MemoryRegionSection *section;
hwaddr mr_offset;
} SavedIOTLB;
#endif
struct KVMState;
struct kvm_run;
@ -409,8 +398,6 @@ struct CPUState {
#ifdef CONFIG_PLUGIN
GArray *plugin_mem_cbs;
/* saved iotlb data from io_writex */
SavedIOTLB saved_iotlb;
#endif
/* TODO Move common fields from CPUArchState here. */


@ -15,15 +15,8 @@
struct qemu_plugin_hwaddr {
bool is_io;
bool is_store;
union {
struct {
MemoryRegionSection *section;
hwaddr offset;
} io;
struct {
void *hostaddr;
} ram;
} v;
hwaddr phys_addr;
MemoryRegion *mr;
};
/**


@ -129,7 +129,6 @@ typedef struct QString QString;
typedef struct RAMBlock RAMBlock;
typedef struct Range Range;
typedef struct ReservedRegion ReservedRegion;
typedef struct SavedIOTLB SavedIOTLB;
typedef struct SHPCDevice SHPCDevice;
typedef struct SSIBus SSIBus;
typedef struct TCGHelperInfo TCGHelperInfo;


@ -374,6 +374,12 @@ void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
uint32_t aofs, uint32_t bofs,
uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
uint32_t aofs, int64_t c,
uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
uint32_t aofs, TCGv_i64 c,
uint32_t oprsz, uint32_t maxsz);
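Illustrative calls of the two new expanders declared above, in the style of the other gvec helpers; dofs/aofs are CPUArchState offsets and oprsz/maxsz the usual vector sizes, all assumed to be supplied by the caller (not code from this patch):

/* d[i] = (a[i] == 7) ? -1 : 0, per 32-bit element, against an immediate */
tcg_gen_gvec_cmpi(TCG_COND_EQ, MO_32, dofs, aofs, 7, oprsz, maxsz);

/* Same comparison against a run-time scalar held in a TCGv_i64 */
tcg_gen_gvec_cmps(TCG_COND_EQ, MO_32, dofs, aofs, c_i64, oprsz, maxsz);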
/*
* Perform vector bit select: d = (b & a) | (c & ~a).


@ -316,22 +316,7 @@ uint64_t qemu_plugin_hwaddr_phys_addr(const struct qemu_plugin_hwaddr *haddr)
{
#ifdef CONFIG_SOFTMMU
if (haddr) {
if (!haddr->is_io) {
RAMBlock *block;
ram_addr_t offset;
void *hostaddr = haddr->v.ram.hostaddr;
block = qemu_ram_block_from_host(hostaddr, false, &offset);
if (!block) {
error_report("Bad host ram pointer %p", haddr->v.ram.hostaddr);
abort();
}
return block->offset + offset + block->mr->addr;
} else {
MemoryRegionSection *mrs = haddr->v.io.section;
return mrs->offset_within_address_space + haddr->v.io.offset;
}
return haddr->phys_addr;
}
#endif
return 0;
@ -341,13 +326,13 @@ const char *qemu_plugin_hwaddr_device_name(const struct qemu_plugin_hwaddr *h)
{
#ifdef CONFIG_SOFTMMU
if (h && h->is_io) {
MemoryRegionSection *mrs = h->v.io.section;
if (!mrs->mr->name) {
unsigned long maddr = 0xffffffff & (uintptr_t) mrs->mr;
g_autofree char *temp = g_strdup_printf("anon%08lx", maddr);
MemoryRegion *mr = h->mr;
if (!mr->name) {
unsigned maddr = (uintptr_t)mr;
g_autofree char *temp = g_strdup_printf("anon%08x", maddr);
return g_intern_string(temp);
} else {
return g_intern_string(mrs->mr->name);
return g_intern_string(mr->name);
}
} else {
return g_intern_static_string("RAM");

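With this change a plugin memory callback can obtain the physical address directly from the flattened structure; a minimal sketch using the public plugin API, assuming the callback was registered with qemu_plugin_register_vcpu_mem_cb (illustrative, not part of this patch):

#include <qemu-plugin.h>

static void vcpu_mem(unsigned int cpu_index, qemu_plugin_meminfo_t info,
                     uint64_t vaddr, void *udata)
{
    struct qemu_plugin_hwaddr *hw = qemu_plugin_get_hwaddr(info, vaddr);
    if (hw) {
        uint64_t pa = qemu_plugin_hwaddr_phys_addr(hw);         /* now just hw->phys_addr */
        const char *dev = qemu_plugin_hwaddr_device_name(hw);   /* MR name, or "RAM" */
        qemu_plugin_outs(dev);
        (void)pa;
    }
    (void)cpu_index;
    (void)udata;
}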

@ -121,10 +121,7 @@ static void *new_stack_for_clone(void)
/* Allocate a new stack and get a pointer to its top. */
stack_ptr = qemu_alloc_stack(&stack_size);
#if !defined(HOST_HPPA)
/* The top is at the end of the area, except on HPPA. */
stack_ptr += stack_size;
#endif
return stack_ptr;
}


@ -2943,54 +2943,16 @@ void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]);
}
#define GEN_CMP0(NAME, COND) \
static void gen_##NAME##0_i32(TCGv_i32 d, TCGv_i32 a) \
{ \
tcg_gen_negsetcond_i32(COND, d, a, tcg_constant_i32(0)); \
} \
static void gen_##NAME##0_i64(TCGv_i64 d, TCGv_i64 a) \
{ \
tcg_gen_negsetcond_i64(COND, d, a, tcg_constant_i64(0)); \
} \
static void gen_##NAME##0_vec(unsigned vece, TCGv_vec d, TCGv_vec a) \
{ \
TCGv_vec zero = tcg_constant_vec_matching(d, vece, 0); \
tcg_gen_cmp_vec(COND, vece, d, a, zero); \
} \
void gen_gvec_##NAME##0(unsigned vece, uint32_t d, uint32_t m, \
uint32_t opr_sz, uint32_t max_sz) \
{ \
const GVecGen2 op[4] = { \
{ .fno = gen_helper_gvec_##NAME##0_b, \
.fniv = gen_##NAME##0_vec, \
.opt_opc = vecop_list_cmp, \
.vece = MO_8 }, \
{ .fno = gen_helper_gvec_##NAME##0_h, \
.fniv = gen_##NAME##0_vec, \
.opt_opc = vecop_list_cmp, \
.vece = MO_16 }, \
{ .fni4 = gen_##NAME##0_i32, \
.fniv = gen_##NAME##0_vec, \
.opt_opc = vecop_list_cmp, \
.vece = MO_32 }, \
{ .fni8 = gen_##NAME##0_i64, \
.fniv = gen_##NAME##0_vec, \
.opt_opc = vecop_list_cmp, \
.prefer_i64 = TCG_TARGET_REG_BITS == 64, \
.vece = MO_64 }, \
}; \
tcg_gen_gvec_2(d, m, opr_sz, max_sz, &op[vece]); \
}
#define GEN_CMP0(NAME, COND) \
void NAME(unsigned vece, uint32_t d, uint32_t m, \
uint32_t opr_sz, uint32_t max_sz) \
{ tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); }
static const TCGOpcode vecop_list_cmp[] = {
INDEX_op_cmp_vec, 0
};
GEN_CMP0(ceq, TCG_COND_EQ)
GEN_CMP0(cle, TCG_COND_LE)
GEN_CMP0(cge, TCG_COND_GE)
GEN_CMP0(clt, TCG_COND_LT)
GEN_CMP0(cgt, TCG_COND_GT)
GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ)
GEN_CMP0(gen_gvec_cle0, TCG_COND_LE)
GEN_CMP0(gen_gvec_cge0, TCG_COND_GE)
GEN_CMP0(gen_gvec_clt0, TCG_COND_LT)
GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)
#undef GEN_CMP0
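For the EQ case, the rewritten macro above expands to a single call into the new gvec compare-with-immediate interface:

void gen_gvec_ceq0(unsigned vece, uint32_t d, uint32_t m,
                   uint32_t opr_sz, uint32_t max_sz)
{ tcg_gen_gvec_cmpi(TCG_COND_EQ, vece, d, m, 0, opr_sz, max_sz); }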


@ -272,7 +272,7 @@ static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8)
}
}
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
if (ct & TCG_CT_CONST) {
return 1;
@ -602,6 +602,10 @@ typedef enum {
DMB_ISH = 0xd50338bf,
DMB_LD = 0x00000100,
DMB_ST = 0x00000200,
BTI_C = 0xd503245f,
BTI_J = 0xd503249f,
BTI_JC = 0xd50324df,
} AArch64Insn;
static inline uint32_t tcg_in32(TCGContext *s)
@ -843,6 +847,17 @@ static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn,
| rn << 5 | (rd & 0x1f));
}
static void tcg_out_bti(TCGContext *s, AArch64Insn insn)
{
/*
* While BTI insns are nops on hosts without FEAT_BTI,
* there is no point in emitting them in that case either.
*/
if (cpuinfo & CPUINFO_BTI) {
tcg_out32(s, insn);
}
}
/* Register to register move using ORR (shifted register with no shift). */
static void tcg_out_movr(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rm)
{
@ -1351,18 +1366,6 @@ static void tcg_out_goto(TCGContext *s, const tcg_insn_unit *target)
tcg_out_insn(s, 3206, B, offset);
}
static void tcg_out_goto_long(TCGContext *s, const tcg_insn_unit *target)
{
ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
if (offset == sextract64(offset, 0, 26)) {
tcg_out_insn(s, 3206, B, offset);
} else {
/* Choose X9 as a call-clobbered non-LR temporary. */
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X9, (intptr_t)target);
tcg_out_insn(s, 3207, BR, TCG_REG_X9);
}
}
static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *target)
{
ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
@ -1947,12 +1950,28 @@ static const tcg_insn_unit *tb_ret_addr;
static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
{
const tcg_insn_unit *target;
ptrdiff_t offset;
/* Reuse the zeroing that exists for goto_ptr. */
if (a0 == 0) {
tcg_out_goto_long(s, tcg_code_gen_epilogue);
target = tcg_code_gen_epilogue;
} else {
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
tcg_out_goto_long(s, tb_ret_addr);
target = tb_ret_addr;
}
offset = tcg_pcrel_diff(s, target) >> 2;
if (offset == sextract64(offset, 0, 26)) {
tcg_out_insn(s, 3206, B, offset);
} else {
/*
* Only x16/x17 generate BTI type Jump (2),
* other registers generate BTI type Jump|Call (3).
*/
QEMU_BUILD_BUG_ON(TCG_REG_TMP0 != TCG_REG_X16);
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
}
}
@ -1970,6 +1989,7 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
tcg_out32(s, I3206_B);
tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
set_jmp_reset_offset(s, which);
tcg_out_bti(s, BTI_J);
}
void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
@ -3074,6 +3094,8 @@ static void tcg_target_qemu_prologue(TCGContext *s)
{
TCGReg r;
tcg_out_bti(s, BTI_C);
/* Push (FP, LR) and allocate space for all saved registers. */
tcg_out_insn(s, 3314, STP, TCG_REG_FP, TCG_REG_LR,
TCG_REG_SP, -PUSH_SIZE, 1, 1);
@ -3114,10 +3136,12 @@ static void tcg_target_qemu_prologue(TCGContext *s)
* and fall through to the rest of the epilogue.
*/
tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
tcg_out_bti(s, BTI_J);
tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_X0, 0);
/* TB epilogue */
tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
tcg_out_bti(s, BTI_J);
/* Remove TCG locals stack space. */
tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_SP, TCG_REG_SP,
@ -3135,6 +3159,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_insn(s, 3207, RET, TCG_REG_LR);
}
static void tcg_out_tb_start(TCGContext *s)
{
tcg_out_bti(s, BTI_J);
}
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
int i;


@ -509,7 +509,7 @@ static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8)
* mov operand2: values represented with x << (2 * y), x < 0x100
* add, sub, eor...: ditto
*/
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
if (ct & TCG_CT_CONST) {
return 1;
@ -2962,6 +2962,11 @@ static void tcg_out_epilogue(TCGContext *s)
(1 << TCG_REG_R10) | (1 << TCG_REG_R11) | (1 << TCG_REG_PC));
}
static void tcg_out_tb_start(TCGContext *s)
{
/* nothing to do */
}
typedef struct {
DebugFrameHeader h;
uint8_t fde_def_cfa[4];


@ -198,7 +198,7 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
}
/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
if (ct & TCG_CT_CONST) {
return 1;
@ -4191,6 +4191,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_opc(s, OPC_RET, 0, 0, 0);
}
static void tcg_out_tb_start(TCGContext *s)
{
/* nothing to do */
}
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
memset(p, 0x90, count);

File diff suppressed because it is too large.


@ -17,7 +17,11 @@
C_O0_I1(r)
C_O0_I2(rZ, r)
C_O0_I2(rZ, rZ)
C_O0_I2(w, r)
C_O0_I3(r, r, r)
C_O1_I1(r, r)
C_O1_I1(w, r)
C_O1_I1(w, w)
C_O1_I2(r, r, rC)
C_O1_I2(r, r, ri)
C_O1_I2(r, r, rI)
@ -29,4 +33,9 @@ C_O1_I2(r, 0, rZ)
C_O1_I2(r, rZ, ri)
C_O1_I2(r, rZ, rJ)
C_O1_I2(r, rZ, rZ)
C_O1_I2(w, w, w)
C_O1_I2(w, w, wM)
C_O1_I2(w, w, wA)
C_O1_I3(w, w, w, w)
C_O1_I4(r, rZ, rJ, rZ, rZ)
C_O2_I1(r, r, r)


@ -14,6 +14,7 @@
* REGS(letter, register_mask)
*/
REGS('r', ALL_GENERAL_REGS)
REGS('w', ALL_VECTOR_REGS)
/*
* Define constraint letters for constants:
@ -25,3 +26,5 @@ CONST('U', TCG_CT_CONST_U12)
CONST('Z', TCG_CT_CONST_ZERO)
CONST('C', TCG_CT_CONST_C12)
CONST('W', TCG_CT_CONST_WSZ)
CONST('M', TCG_CT_CONST_VCMP)
CONST('A', TCG_CT_CONST_VADD)


@ -32,6 +32,8 @@
#include "../tcg-ldst.c.inc"
#include <asm/hwcap.h>
bool use_lsx_instructions;
#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
"zero",
@ -65,7 +67,39 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
"s5",
"s6",
"s7",
"s8"
"s8",
"vr0",
"vr1",
"vr2",
"vr3",
"vr4",
"vr5",
"vr6",
"vr7",
"vr8",
"vr9",
"vr10",
"vr11",
"vr12",
"vr13",
"vr14",
"vr15",
"vr16",
"vr17",
"vr18",
"vr19",
"vr20",
"vr21",
"vr22",
"vr23",
"vr24",
"vr25",
"vr26",
"vr27",
"vr28",
"vr29",
"vr30",
"vr31",
};
#endif
@ -102,6 +136,15 @@ static const int tcg_target_reg_alloc_order[] = {
TCG_REG_A2,
TCG_REG_A1,
TCG_REG_A0,
/* Vector registers */
TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11,
TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15,
TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
/* V24 - V31 are caller-saved, and skipped. */
};
static const int tcg_target_call_iarg_regs[] = {
@ -133,8 +176,11 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
#define TCG_CT_CONST_U12 0x800
#define TCG_CT_CONST_C12 0x1000
#define TCG_CT_CONST_WSZ 0x2000
#define TCG_CT_CONST_VCMP 0x4000
#define TCG_CT_CONST_VADD 0x8000
#define ALL_GENERAL_REGS MAKE_64BIT_MASK(0, 32)
#define ALL_VECTOR_REGS MAKE_64BIT_MASK(32, 32)
static inline tcg_target_long sextreg(tcg_target_long val, int pos, int len)
{
@ -142,7 +188,7 @@ static inline tcg_target_long sextreg(tcg_target_long val, int pos, int len)
}
/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
if (ct & TCG_CT_CONST) {
return true;
@ -165,6 +211,13 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
return true;
}
int64_t vec_val = sextract64(val, 0, 8 << vece);
if ((ct & TCG_CT_CONST_VCMP) && -0x10 <= vec_val && vec_val <= 0x1f) {
return true;
}
if ((ct & TCG_CT_CONST_VADD) && -0x1f <= vec_val && vec_val <= 0x1f) {
return true;
}
return false;
}
@ -1028,6 +1081,48 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
}
}
static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg data_lo, TCGReg data_hi,
TCGReg addr_reg, MemOpIdx oi, bool is_ld)
{
TCGLabelQemuLdst *ldst;
HostAddress h;
ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);
if (h.aa.atom == MO_128) {
/*
* Use VLDX/VSTX when 128-bit atomicity is required.
* If address is aligned to 16-bytes, the 128-bit load/store is atomic.
*/
if (is_ld) {
tcg_out_opc_vldx(s, TCG_VEC_TMP0, h.base, h.index);
tcg_out_opc_vpickve2gr_d(s, data_lo, TCG_VEC_TMP0, 0);
tcg_out_opc_vpickve2gr_d(s, data_hi, TCG_VEC_TMP0, 1);
} else {
tcg_out_opc_vinsgr2vr_d(s, TCG_VEC_TMP0, data_lo, 0);
tcg_out_opc_vinsgr2vr_d(s, TCG_VEC_TMP0, data_hi, 1);
tcg_out_opc_vstx(s, TCG_VEC_TMP0, h.base, h.index);
}
} else {
/* Otherwise use a pair of LD/ST. */
tcg_out_opc_add_d(s, TCG_REG_TMP0, h.base, h.index);
if (is_ld) {
tcg_out_opc_ld_d(s, data_lo, TCG_REG_TMP0, 0);
tcg_out_opc_ld_d(s, data_hi, TCG_REG_TMP0, 8);
} else {
tcg_out_opc_st_d(s, data_lo, TCG_REG_TMP0, 0);
tcg_out_opc_st_d(s, data_hi, TCG_REG_TMP0, 8);
}
}
if (ldst) {
ldst->type = TCG_TYPE_I128;
ldst->datalo_reg = data_lo;
ldst->datahi_reg = data_hi;
ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
}
}
/*
* Entry-points
*/
@ -1092,6 +1187,7 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
TCGArg a0 = args[0];
TCGArg a1 = args[1];
TCGArg a2 = args[2];
TCGArg a3 = args[3];
int c2 = const_args[2];
switch (opc) {
@ -1454,6 +1550,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_qemu_ld_a64_i64:
tcg_out_qemu_ld(s, a0, a1, a2, TCG_TYPE_I64);
break;
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
tcg_out_qemu_ldst_i128(s, a0, a1, a2, a3, true);
break;
case INDEX_op_qemu_st_a32_i32:
case INDEX_op_qemu_st_a64_i32:
tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I32);
@ -1462,6 +1562,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_qemu_st_a64_i64:
tcg_out_qemu_st(s, a0, a1, a2, TCG_TYPE_I64);
break;
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
tcg_out_qemu_ldst_i128(s, a0, a1, a2, a3, false);
break;
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
case INDEX_op_mov_i64:
@ -1486,6 +1590,444 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
}
}
static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
TCGReg rd, TCGReg rs)
{
switch (vece) {
case MO_8:
tcg_out_opc_vreplgr2vr_b(s, rd, rs);
break;
case MO_16:
tcg_out_opc_vreplgr2vr_h(s, rd, rs);
break;
case MO_32:
tcg_out_opc_vreplgr2vr_w(s, rd, rs);
break;
case MO_64:
tcg_out_opc_vreplgr2vr_d(s, rd, rs);
break;
default:
g_assert_not_reached();
}
return true;
}
static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
TCGReg r, TCGReg base, intptr_t offset)
{
/* Handle imm overflow and division (vldrepl.d imm is divided by 8) */
if (offset < -0x800 || offset > 0x7ff || \
(offset & ((1 << vece) - 1)) != 0) {
tcg_out_addi(s, TCG_TYPE_I64, TCG_REG_TMP0, base, offset);
base = TCG_REG_TMP0;
offset = 0;
}
offset >>= vece;
switch (vece) {
case MO_8:
tcg_out_opc_vldrepl_b(s, r, base, offset);
break;
case MO_16:
tcg_out_opc_vldrepl_h(s, r, base, offset);
break;
case MO_32:
tcg_out_opc_vldrepl_w(s, r, base, offset);
break;
case MO_64:
tcg_out_opc_vldrepl_d(s, r, base, offset);
break;
default:
g_assert_not_reached();
}
return true;
}
static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
TCGReg rd, int64_t v64)
{
/* Try vldi if imm can fit */
int64_t value = sextract64(v64, 0, 8 << vece);
if (-0x200 <= value && value <= 0x1FF) {
uint32_t imm = (vece << 10) | ((uint32_t)v64 & 0x3FF);
tcg_out_opc_vldi(s, rd, imm);
return;
}
/* TODO: vldi patterns when imm 12 is set */
/* Fallback to vreplgr2vr */
tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, value);
switch (vece) {
case MO_8:
tcg_out_opc_vreplgr2vr_b(s, rd, TCG_REG_TMP0);
break;
case MO_16:
tcg_out_opc_vreplgr2vr_h(s, rd, TCG_REG_TMP0);
break;
case MO_32:
tcg_out_opc_vreplgr2vr_w(s, rd, TCG_REG_TMP0);
break;
case MO_64:
tcg_out_opc_vreplgr2vr_d(s, rd, TCG_REG_TMP0);
break;
default:
g_assert_not_reached();
}
}
static void tcg_out_addsub_vec(TCGContext *s, unsigned vece, const TCGArg a0,
const TCGArg a1, const TCGArg a2,
bool a2_is_const, bool is_add)
{
static const LoongArchInsn add_vec_insn[4] = {
OPC_VADD_B, OPC_VADD_H, OPC_VADD_W, OPC_VADD_D
};
static const LoongArchInsn add_vec_imm_insn[4] = {
OPC_VADDI_BU, OPC_VADDI_HU, OPC_VADDI_WU, OPC_VADDI_DU
};
static const LoongArchInsn sub_vec_insn[4] = {
OPC_VSUB_B, OPC_VSUB_H, OPC_VSUB_W, OPC_VSUB_D
};
static const LoongArchInsn sub_vec_imm_insn[4] = {
OPC_VSUBI_BU, OPC_VSUBI_HU, OPC_VSUBI_WU, OPC_VSUBI_DU
};
if (a2_is_const) {
int64_t value = sextract64(a2, 0, 8 << vece);
if (!is_add) {
value = -value;
}
/* Try vaddi/vsubi */
if (0 <= value && value <= 0x1f) {
tcg_out32(s, encode_vdvjuk5_insn(add_vec_imm_insn[vece], a0, \
a1, value));
return;
} else if (-0x1f <= value && value < 0) {
tcg_out32(s, encode_vdvjuk5_insn(sub_vec_imm_insn[vece], a0, \
a1, -value));
return;
}
/* constraint TCG_CT_CONST_VADD ensures unreachable */
g_assert_not_reached();
}
if (is_add) {
tcg_out32(s, encode_vdvjvk_insn(add_vec_insn[vece], a0, a1, a2));
} else {
tcg_out32(s, encode_vdvjvk_insn(sub_vec_insn[vece], a0, a1, a2));
}
}
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
unsigned vecl, unsigned vece,
const TCGArg args[TCG_MAX_OP_ARGS],
const int const_args[TCG_MAX_OP_ARGS])
{
TCGType type = vecl + TCG_TYPE_V64;
TCGArg a0, a1, a2, a3;
TCGReg temp = TCG_REG_TMP0;
TCGReg temp_vec = TCG_VEC_TMP0;
static const LoongArchInsn cmp_vec_insn[16][4] = {
[TCG_COND_EQ] = {OPC_VSEQ_B, OPC_VSEQ_H, OPC_VSEQ_W, OPC_VSEQ_D},
[TCG_COND_LE] = {OPC_VSLE_B, OPC_VSLE_H, OPC_VSLE_W, OPC_VSLE_D},
[TCG_COND_LEU] = {OPC_VSLE_BU, OPC_VSLE_HU, OPC_VSLE_WU, OPC_VSLE_DU},
[TCG_COND_LT] = {OPC_VSLT_B, OPC_VSLT_H, OPC_VSLT_W, OPC_VSLT_D},
[TCG_COND_LTU] = {OPC_VSLT_BU, OPC_VSLT_HU, OPC_VSLT_WU, OPC_VSLT_DU},
};
static const LoongArchInsn cmp_vec_imm_insn[16][4] = {
[TCG_COND_EQ] = {OPC_VSEQI_B, OPC_VSEQI_H, OPC_VSEQI_W, OPC_VSEQI_D},
[TCG_COND_LE] = {OPC_VSLEI_B, OPC_VSLEI_H, OPC_VSLEI_W, OPC_VSLEI_D},
[TCG_COND_LEU] = {OPC_VSLEI_BU, OPC_VSLEI_HU, OPC_VSLEI_WU, OPC_VSLEI_DU},
[TCG_COND_LT] = {OPC_VSLTI_B, OPC_VSLTI_H, OPC_VSLTI_W, OPC_VSLTI_D},
[TCG_COND_LTU] = {OPC_VSLTI_BU, OPC_VSLTI_HU, OPC_VSLTI_WU, OPC_VSLTI_DU},
};
LoongArchInsn insn;
static const LoongArchInsn neg_vec_insn[4] = {
OPC_VNEG_B, OPC_VNEG_H, OPC_VNEG_W, OPC_VNEG_D
};
static const LoongArchInsn mul_vec_insn[4] = {
OPC_VMUL_B, OPC_VMUL_H, OPC_VMUL_W, OPC_VMUL_D
};
static const LoongArchInsn smin_vec_insn[4] = {
OPC_VMIN_B, OPC_VMIN_H, OPC_VMIN_W, OPC_VMIN_D
};
static const LoongArchInsn umin_vec_insn[4] = {
OPC_VMIN_BU, OPC_VMIN_HU, OPC_VMIN_WU, OPC_VMIN_DU
};
static const LoongArchInsn smax_vec_insn[4] = {
OPC_VMAX_B, OPC_VMAX_H, OPC_VMAX_W, OPC_VMAX_D
};
static const LoongArchInsn umax_vec_insn[4] = {
OPC_VMAX_BU, OPC_VMAX_HU, OPC_VMAX_WU, OPC_VMAX_DU
};
static const LoongArchInsn ssadd_vec_insn[4] = {
OPC_VSADD_B, OPC_VSADD_H, OPC_VSADD_W, OPC_VSADD_D
};
static const LoongArchInsn usadd_vec_insn[4] = {
OPC_VSADD_BU, OPC_VSADD_HU, OPC_VSADD_WU, OPC_VSADD_DU
};
static const LoongArchInsn sssub_vec_insn[4] = {
OPC_VSSUB_B, OPC_VSSUB_H, OPC_VSSUB_W, OPC_VSSUB_D
};
static const LoongArchInsn ussub_vec_insn[4] = {
OPC_VSSUB_BU, OPC_VSSUB_HU, OPC_VSSUB_WU, OPC_VSSUB_DU
};
static const LoongArchInsn shlv_vec_insn[4] = {
OPC_VSLL_B, OPC_VSLL_H, OPC_VSLL_W, OPC_VSLL_D
};
static const LoongArchInsn shrv_vec_insn[4] = {
OPC_VSRL_B, OPC_VSRL_H, OPC_VSRL_W, OPC_VSRL_D
};
static const LoongArchInsn sarv_vec_insn[4] = {
OPC_VSRA_B, OPC_VSRA_H, OPC_VSRA_W, OPC_VSRA_D
};
static const LoongArchInsn shli_vec_insn[4] = {
OPC_VSLLI_B, OPC_VSLLI_H, OPC_VSLLI_W, OPC_VSLLI_D
};
static const LoongArchInsn shri_vec_insn[4] = {
OPC_VSRLI_B, OPC_VSRLI_H, OPC_VSRLI_W, OPC_VSRLI_D
};
static const LoongArchInsn sari_vec_insn[4] = {
OPC_VSRAI_B, OPC_VSRAI_H, OPC_VSRAI_W, OPC_VSRAI_D
};
static const LoongArchInsn rotrv_vec_insn[4] = {
OPC_VROTR_B, OPC_VROTR_H, OPC_VROTR_W, OPC_VROTR_D
};
a0 = args[0];
a1 = args[1];
a2 = args[2];
a3 = args[3];
/* Currently only supports V128 */
tcg_debug_assert(type == TCG_TYPE_V128);
switch (opc) {
case INDEX_op_st_vec:
/* Try to fit vst imm */
if (-0x800 <= a2 && a2 <= 0x7ff) {
tcg_out_opc_vst(s, a0, a1, a2);
} else {
tcg_out_movi(s, TCG_TYPE_I64, temp, a2);
tcg_out_opc_vstx(s, a0, a1, temp);
}
break;
case INDEX_op_ld_vec:
/* Try to fit vld imm */
if (-0x800 <= a2 && a2 <= 0x7ff) {
tcg_out_opc_vld(s, a0, a1, a2);
} else {
tcg_out_movi(s, TCG_TYPE_I64, temp, a2);
tcg_out_opc_vldx(s, a0, a1, temp);
}
break;
case INDEX_op_and_vec:
tcg_out_opc_vand_v(s, a0, a1, a2);
break;
case INDEX_op_andc_vec:
/*
* vandn vd, vj, vk: vd = vk & ~vj
* andc_vec vd, vj, vk: vd = vj & ~vk
* vj and vk are swapped
*/
tcg_out_opc_vandn_v(s, a0, a2, a1);
break;
case INDEX_op_or_vec:
tcg_out_opc_vor_v(s, a0, a1, a2);
break;
case INDEX_op_orc_vec:
tcg_out_opc_vorn_v(s, a0, a1, a2);
break;
case INDEX_op_xor_vec:
tcg_out_opc_vxor_v(s, a0, a1, a2);
break;
case INDEX_op_nor_vec:
tcg_out_opc_vnor_v(s, a0, a1, a2);
break;
case INDEX_op_not_vec:
tcg_out_opc_vnor_v(s, a0, a1, a1);
break;
case INDEX_op_cmp_vec:
{
TCGCond cond = args[3];
if (const_args[2]) {
/*
* cmp_vec dest, src, value
* Try vseqi/vslei/vslti
*/
int64_t value = sextract64(a2, 0, 8 << vece);
if ((cond == TCG_COND_EQ || cond == TCG_COND_LE || \
cond == TCG_COND_LT) && (-0x10 <= value && value <= 0x0f)) {
tcg_out32(s, encode_vdvjsk5_insn(cmp_vec_imm_insn[cond][vece], \
a0, a1, value));
break;
} else if ((cond == TCG_COND_LEU || cond == TCG_COND_LTU) &&
(0x00 <= value && value <= 0x1f)) {
tcg_out32(s, encode_vdvjuk5_insn(cmp_vec_imm_insn[cond][vece], \
a0, a1, value));
break;
}
/*
* Fallback to:
* dupi_vec temp, a2
* cmp_vec a0, a1, temp, cond
*/
tcg_out_dupi_vec(s, type, vece, temp_vec, a2);
a2 = temp_vec;
}
insn = cmp_vec_insn[cond][vece];
if (insn == 0) {
TCGArg t;
t = a1, a1 = a2, a2 = t;
cond = tcg_swap_cond(cond);
insn = cmp_vec_insn[cond][vece];
tcg_debug_assert(insn != 0);
}
tcg_out32(s, encode_vdvjvk_insn(insn, a0, a1, a2));
}
break;
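The immediate path above is only taken when the constant fits a 5-bit instruction field: signed for EQ/LE/LT (vseqi/vslei/vslti), unsigned for LEU/LTU. An illustrative restatement of that selection, not part of the patch and with a made-up helper name:

/* Illustration only: can a cmp_vec constant use the immediate encodings? */
static bool cmp_vec_imm_fits(TCGCond cond, int64_t value)
{
    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_LE:
    case TCG_COND_LT:
        return value >= -0x10 && value <= 0x0f;  /* signed 5-bit field */
    case TCG_COND_LEU:
    case TCG_COND_LTU:
        return value >= 0x00 && value <= 0x1f;   /* unsigned 5-bit field */
    default:
        return false;  /* otherwise: dup the constant and compare registers */
    }
}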
case INDEX_op_add_vec:
tcg_out_addsub_vec(s, vece, a0, a1, a2, const_args[2], true);
break;
case INDEX_op_sub_vec:
tcg_out_addsub_vec(s, vece, a0, a1, a2, const_args[2], false);
break;
case INDEX_op_neg_vec:
tcg_out32(s, encode_vdvj_insn(neg_vec_insn[vece], a0, a1));
break;
case INDEX_op_mul_vec:
tcg_out32(s, encode_vdvjvk_insn(mul_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_smin_vec:
tcg_out32(s, encode_vdvjvk_insn(smin_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_smax_vec:
tcg_out32(s, encode_vdvjvk_insn(smax_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_umin_vec:
tcg_out32(s, encode_vdvjvk_insn(umin_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_umax_vec:
tcg_out32(s, encode_vdvjvk_insn(umax_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_ssadd_vec:
tcg_out32(s, encode_vdvjvk_insn(ssadd_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_usadd_vec:
tcg_out32(s, encode_vdvjvk_insn(usadd_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_sssub_vec:
tcg_out32(s, encode_vdvjvk_insn(sssub_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_ussub_vec:
tcg_out32(s, encode_vdvjvk_insn(ussub_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_shlv_vec:
tcg_out32(s, encode_vdvjvk_insn(shlv_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_shrv_vec:
tcg_out32(s, encode_vdvjvk_insn(shrv_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_sarv_vec:
tcg_out32(s, encode_vdvjvk_insn(sarv_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_shli_vec:
tcg_out32(s, encode_vdvjuk3_insn(shli_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_shri_vec:
tcg_out32(s, encode_vdvjuk3_insn(shri_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_sari_vec:
tcg_out32(s, encode_vdvjuk3_insn(sari_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_rotrv_vec:
tcg_out32(s, encode_vdvjvk_insn(rotrv_vec_insn[vece], a0, a1, a2));
break;
case INDEX_op_rotlv_vec:
/* rotlv_vec a1, a2 = rotrv_vec a1, -a2 */
tcg_out32(s, encode_vdvj_insn(neg_vec_insn[vece], temp_vec, a2));
tcg_out32(s, encode_vdvjvk_insn(rotrv_vec_insn[vece], a0, a1,
temp_vec));
break;
case INDEX_op_rotli_vec:
/* rotli_vec a1, a2 = rotri_vec a1, -a2 */
a2 = extract32(-a2, 0, 3 + vece);
switch (vece) {
case MO_8:
tcg_out_opc_vrotri_b(s, a0, a1, a2);
break;
case MO_16:
tcg_out_opc_vrotri_h(s, a0, a1, a2);
break;
case MO_32:
tcg_out_opc_vrotri_w(s, a0, a1, a2);
break;
case MO_64:
tcg_out_opc_vrotri_d(s, a0, a1, a2);
break;
default:
g_assert_not_reached();
}
break;
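Both rotate cases above rely on the identity that a left rotate by n equals a right rotate by (element_width - n) mod element_width, which is what negating the count (or immediate) achieves. A scalar 32-bit analogue, for illustration only:

/* Illustration only: rotate-left expressed through rotate-right. */
static uint32_t rotr32(uint32_t x, unsigned r)
{
    r &= 31;
    return r ? (x >> r) | (x << (32 - r)) : x;
}

static uint32_t rotl32(uint32_t x, unsigned n)
{
    return rotr32(x, -n & 31);  /* mirrors extract32(-a2, 0, 3 + vece) for MO_32 */
}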
case INDEX_op_bitsel_vec:
/* vbitsel vd, vj, vk, va = bitsel_vec vd, va, vk, vj */
tcg_out_opc_vbitsel_v(s, a0, a3, a2, a1);
break;
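For reference, bitsel_vec computes d = (b & sel) | (c & ~sel) bit-wise, while LSX's vbitsel.v takes its select mask as the last register operand, hence the reordering above. A scalar sketch of the selection, illustration only:

/* Illustration only: bit-wise select as performed by bitsel_vec. */
static uint64_t bitsel64(uint64_t sel, uint64_t b, uint64_t c)
{
    return (b & sel) | (c & ~sel);
}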
case INDEX_op_dupm_vec:
tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
break;
default:
g_assert_not_reached();
}
}
int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
switch (opc) {
case INDEX_op_ld_vec:
case INDEX_op_st_vec:
case INDEX_op_dup_vec:
case INDEX_op_dupm_vec:
case INDEX_op_cmp_vec:
case INDEX_op_add_vec:
case INDEX_op_sub_vec:
case INDEX_op_and_vec:
case INDEX_op_andc_vec:
case INDEX_op_or_vec:
case INDEX_op_orc_vec:
case INDEX_op_xor_vec:
case INDEX_op_nor_vec:
case INDEX_op_not_vec:
case INDEX_op_neg_vec:
case INDEX_op_mul_vec:
case INDEX_op_smin_vec:
case INDEX_op_smax_vec:
case INDEX_op_umin_vec:
case INDEX_op_umax_vec:
case INDEX_op_ssadd_vec:
case INDEX_op_usadd_vec:
case INDEX_op_sssub_vec:
case INDEX_op_ussub_vec:
case INDEX_op_shlv_vec:
case INDEX_op_shrv_vec:
case INDEX_op_sarv_vec:
case INDEX_op_bitsel_vec:
return 1;
default:
return 0;
}
}
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
TCGArg a0, ...)
{
g_assert_not_reached();
}
static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
{
switch (op) {
@ -1505,6 +2047,14 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_qemu_st_a64_i64:
return C_O0_I2(rZ, r);
case INDEX_op_qemu_ld_a32_i128:
case INDEX_op_qemu_ld_a64_i128:
return C_O2_I1(r, r, r);
case INDEX_op_qemu_st_a32_i128:
case INDEX_op_qemu_st_a64_i128:
return C_O0_I3(r, r, r);
case INDEX_op_brcond_i32:
case INDEX_op_brcond_i64:
return C_O0_I2(rZ, rZ);
@ -1627,6 +2177,54 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
case INDEX_op_movcond_i64:
return C_O1_I4(r, rZ, rJ, rZ, rZ);
case INDEX_op_ld_vec:
case INDEX_op_dupm_vec:
case INDEX_op_dup_vec:
return C_O1_I1(w, r);
case INDEX_op_st_vec:
return C_O0_I2(w, r);
case INDEX_op_cmp_vec:
return C_O1_I2(w, w, wM);
case INDEX_op_add_vec:
case INDEX_op_sub_vec:
return C_O1_I2(w, w, wA);
case INDEX_op_and_vec:
case INDEX_op_andc_vec:
case INDEX_op_or_vec:
case INDEX_op_orc_vec:
case INDEX_op_xor_vec:
case INDEX_op_nor_vec:
case INDEX_op_mul_vec:
case INDEX_op_smin_vec:
case INDEX_op_smax_vec:
case INDEX_op_umin_vec:
case INDEX_op_umax_vec:
case INDEX_op_ssadd_vec:
case INDEX_op_usadd_vec:
case INDEX_op_sssub_vec:
case INDEX_op_ussub_vec:
case INDEX_op_shlv_vec:
case INDEX_op_shrv_vec:
case INDEX_op_sarv_vec:
case INDEX_op_rotrv_vec:
case INDEX_op_rotlv_vec:
return C_O1_I2(w, w, w);
case INDEX_op_not_vec:
case INDEX_op_neg_vec:
case INDEX_op_shli_vec:
case INDEX_op_shri_vec:
case INDEX_op_sari_vec:
case INDEX_op_rotli_vec:
return C_O1_I1(w, w);
case INDEX_op_bitsel_vec:
return C_O1_I3(w, w, w, w);
default:
g_assert_not_reached();
}
@ -1698,6 +2296,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_opc_jirl(s, TCG_REG_ZERO, TCG_REG_RA, 0);
}
static void tcg_out_tb_start(TCGContext *s)
{
/* nothing to do */
}
static void tcg_target_init(TCGContext *s)
{
unsigned long hwcap = qemu_getauxval(AT_HWCAP);
@ -1708,6 +2311,10 @@ static void tcg_target_init(TCGContext *s)
exit(EXIT_FAILURE);
}
if (hwcap & HWCAP_LOONGARCH_LSX) {
use_lsx_instructions = 1;
}
tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
@ -1723,6 +2330,18 @@ static void tcg_target_init(TCGContext *s)
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S8);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_S9);
if (use_lsx_instructions) {
tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V24);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V25);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V26);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V27);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V28);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V29);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V30);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V31);
}
s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_ZERO);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP0);
@ -1731,6 +2350,7 @@ static void tcg_target_init(TCGContext *s)
tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_TP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_RESERVED);
tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP0);
}
typedef struct {

View File

@ -30,7 +30,7 @@
#define LOONGARCH_TCG_TARGET_H
#define TCG_TARGET_INSN_UNIT_SIZE 4
#define TCG_TARGET_NB_REGS 32
#define TCG_TARGET_NB_REGS 64
#define MAX_CODE_GEN_BUFFER_SIZE ((size_t)-1)
@ -68,13 +68,25 @@ typedef enum {
TCG_REG_S7,
TCG_REG_S8,
TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11,
TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15,
TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
/* aliases */
TCG_AREG0 = TCG_REG_S0,
TCG_REG_TMP0 = TCG_REG_T8,
TCG_REG_TMP1 = TCG_REG_T7,
TCG_REG_TMP2 = TCG_REG_T6,
TCG_VEC_TMP0 = TCG_REG_V23,
} TCGReg;
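The 32 vector registers are appended directly after the general registers, which is why TCG_TARGET_NB_REGS grows from 32 to 64 above. An illustrative compile-time check of that numbering assumption (not part of the patch):

/* Illustration only. */
QEMU_BUILD_BUG_ON(TCG_REG_V0 != 32);
QEMU_BUILD_BUG_ON(TCG_REG_V31 != 63);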
extern bool use_lsx_instructions;
/* used for function call generation */
#define TCG_REG_CALL_STACK TCG_REG_SP
#define TCG_TARGET_STACK_ALIGN 16
@ -159,7 +171,31 @@ typedef enum {
#define TCG_TARGET_HAS_muluh_i64 1
#define TCG_TARGET_HAS_mulsh_i64 1
#define TCG_TARGET_HAS_qemu_ldst_i128 0
#define TCG_TARGET_HAS_qemu_ldst_i128 use_lsx_instructions
#define TCG_TARGET_HAS_v64 0
#define TCG_TARGET_HAS_v128 use_lsx_instructions
#define TCG_TARGET_HAS_v256 0
#define TCG_TARGET_HAS_not_vec 1
#define TCG_TARGET_HAS_neg_vec 1
#define TCG_TARGET_HAS_abs_vec 0
#define TCG_TARGET_HAS_andc_vec 1
#define TCG_TARGET_HAS_orc_vec 1
#define TCG_TARGET_HAS_nand_vec 0
#define TCG_TARGET_HAS_nor_vec 1
#define TCG_TARGET_HAS_eqv_vec 0
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_shi_vec 1
#define TCG_TARGET_HAS_shs_vec 0
#define TCG_TARGET_HAS_shv_vec 1
#define TCG_TARGET_HAS_roti_vec 1
#define TCG_TARGET_HAS_rots_vec 0
#define TCG_TARGET_HAS_rotv_vec 1
#define TCG_TARGET_HAS_sat_vec 1
#define TCG_TARGET_HAS_minmax_vec 1
#define TCG_TARGET_HAS_bitsel_vec 1
#define TCG_TARGET_HAS_cmpsel_vec 0
#define TCG_TARGET_DEFAULT_MO (0)

View File

@ -0,0 +1,12 @@
/*
* Copyright (c) 2023 Jiajie Chen
*
* This work is licensed under the terms of the GNU GPL, version 2 or
* (at your option) any later version.
*
* See the COPYING file in the top-level directory for details.
*
* Target-specific opcodes for host vector expansion. These will be
* emitted by tcg_expand_vec_op. For those familiar with GCC internals,
* consider these to be UNSPEC with names.
*/

View File

@ -190,7 +190,7 @@ static bool is_p2m1(tcg_target_long val)
}
/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
if (ct & TCG_CT_CONST) {
return 1;
@ -2628,6 +2628,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_opc_reg(s, OPC_OR, TCG_TMP3, TCG_TMP3, TCG_TMP1);
}
static void tcg_out_tb_start(TCGContext *s)
{
/* nothing to do */
}
static void tcg_target_init(TCGContext *s)
{
tcg_target_detect_isa();

View File

@ -261,7 +261,7 @@ static bool reloc_pc14(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
}
/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
if (ct & TCG_CT_CONST) {
return 1;
@ -2527,6 +2527,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out32(s, BCLR | BO_ALWAYS);
}
static void tcg_out_tb_start(TCGContext *s)
{
/* nothing to do */
}
static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
{
tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, arg);

View File

@ -33,8 +33,19 @@
#include "tcg/tcg.h"
#include "exec/translation-block.h"
#include "tcg-internal.h"
#include "host/cpuinfo.h"
/*
* Local source-level compatibility with Unix.
* Used by tcg_region_init below.
*/
#if defined(_WIN32)
#define PROT_READ 1
#define PROT_WRITE 2
#define PROT_EXEC 4
#endif
struct tcg_region_tree {
QemuMutex lock;
QTree *tree;
@ -83,6 +94,18 @@ bool in_code_gen_buffer(const void *p)
return (size_t)(p - region.start_aligned) <= region.total_size;
}
#ifndef CONFIG_TCG_INTERPRETER
static int host_prot_read_exec(void)
{
#if defined(CONFIG_LINUX) && defined(HOST_AARCH64) && defined(PROT_BTI)
if (cpuinfo & CPUINFO_BTI) {
return PROT_READ | PROT_EXEC | PROT_BTI;
}
#endif
return PROT_READ | PROT_EXEC;
}
#endif
#ifdef CONFIG_DEBUG_TCG
const void *tcg_splitwx_to_rx(void *rw)
{
@ -505,14 +528,6 @@ static int alloc_code_gen_buffer(size_t tb_size, int splitwx, Error **errp)
return PROT_READ | PROT_WRITE;
}
#elif defined(_WIN32)
/*
* Local source-level compatibility with Unix.
* Used by tcg_region_init below.
*/
#define PROT_READ 1
#define PROT_WRITE 2
#define PROT_EXEC 4
static int alloc_code_gen_buffer(size_t size, int splitwx, Error **errp)
{
void *buf;
@ -567,7 +582,7 @@ static int alloc_code_gen_buffer_splitwx_memfd(size_t size, Error **errp)
goto fail;
}
buf_rx = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
buf_rx = mmap(NULL, size, host_prot_read_exec(), MAP_SHARED, fd, 0);
if (buf_rx == MAP_FAILED) {
goto fail_rx;
}
@ -642,7 +657,7 @@ static int alloc_code_gen_buffer_splitwx_vmremap(size_t size, Error **errp)
return -1;
}
if (mprotect((void *)buf_rx, size, PROT_READ | PROT_EXEC) != 0) {
if (mprotect((void *)buf_rx, size, host_prot_read_exec()) != 0) {
error_setg_errno(errp, errno, "mprotect for jit splitwx");
munmap((void *)buf_rx, size);
munmap((void *)buf_rw, size);
@ -805,7 +820,7 @@ void tcg_region_init(size_t tb_size, int splitwx, unsigned max_cpus)
need_prot = PROT_READ | PROT_WRITE;
#ifndef CONFIG_TCG_INTERPRETER
if (tcg_splitwx_diff == 0) {
need_prot |= PROT_EXEC;
need_prot |= host_prot_read_exec();
}
#endif
for (size_t i = 0, n = region.n; i < n; i++) {
@ -820,7 +835,11 @@ void tcg_region_init(size_t tb_size, int splitwx, unsigned max_cpus)
} else if (need_prot == (PROT_READ | PROT_WRITE)) {
rc = qemu_mprotect_rw(start, end - start);
} else {
#ifdef CONFIG_POSIX
rc = mprotect(start, end - start, need_prot);
#else
g_assert_not_reached();
#endif
}
if (rc) {
error_setg_errno(&error_fatal, errno,

View File

@ -145,7 +145,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
#define sextreg sextract64
/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
if (ct & TCG_CT_CONST) {
return 1;
@ -2099,6 +2099,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_opc_imm(s, OPC_JALR, TCG_REG_ZERO, TCG_REG_RA, 0);
}
static void tcg_out_tb_start(TCGContext *s)
{
/* nothing to do */
}
static volatile sig_atomic_t got_sigill;
static void sigill_handler(int signo, siginfo_t *si, void *data)

View File

@ -540,7 +540,7 @@ static bool risbg_mask(uint64_t c)
}
/* Test if a constant matches the constraint. */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
if (ct & TCG_CT_CONST) {
return 1;
@ -3483,6 +3483,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, TCG_REG_R14);
}
static void tcg_out_tb_start(TCGContext *s)
{
/* nothing to do */
}
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
memset(p, 0x07, count * sizeof(tcg_insn_unit));

View File

@ -322,7 +322,7 @@ static bool patch_reloc(tcg_insn_unit *src_rw, int type,
}
/* test if a constant matches the constraint */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
if (ct & TCG_CT_CONST) {
return 1;
@ -962,6 +962,11 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_movi_s13(s, TCG_REG_O0, 0);
}
static void tcg_out_tb_start(TCGContext *s)
{
/* nothing to do */
}
static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
{
int i;

View File

@ -3846,6 +3846,155 @@ void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
}
}
static void expand_cmps_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t oprsz, uint32_t tysz, TCGType type,
TCGCond cond, TCGv_vec c)
{
TCGv_vec t0 = tcg_temp_new_vec(type);
TCGv_vec t1 = tcg_temp_new_vec(type);
uint32_t i;
for (i = 0; i < oprsz; i += tysz) {
tcg_gen_ld_vec(t1, cpu_env, aofs + i);
tcg_gen_cmp_vec(cond, vece, t0, t1, c);
tcg_gen_st_vec(t0, cpu_env, dofs + i);
}
}
void tcg_gen_gvec_cmps(TCGCond cond, unsigned vece, uint32_t dofs,
uint32_t aofs, TCGv_i64 c,
uint32_t oprsz, uint32_t maxsz)
{
static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
static gen_helper_gvec_2i * const eq_fn[4] = {
gen_helper_gvec_eqs8, gen_helper_gvec_eqs16,
gen_helper_gvec_eqs32, gen_helper_gvec_eqs64
};
static gen_helper_gvec_2i * const lt_fn[4] = {
gen_helper_gvec_lts8, gen_helper_gvec_lts16,
gen_helper_gvec_lts32, gen_helper_gvec_lts64
};
static gen_helper_gvec_2i * const le_fn[4] = {
gen_helper_gvec_les8, gen_helper_gvec_les16,
gen_helper_gvec_les32, gen_helper_gvec_les64
};
static gen_helper_gvec_2i * const ltu_fn[4] = {
gen_helper_gvec_ltus8, gen_helper_gvec_ltus16,
gen_helper_gvec_ltus32, gen_helper_gvec_ltus64
};
static gen_helper_gvec_2i * const leu_fn[4] = {
gen_helper_gvec_leus8, gen_helper_gvec_leus16,
gen_helper_gvec_leus32, gen_helper_gvec_leus64
};
static gen_helper_gvec_2i * const * const fns[16] = {
[TCG_COND_EQ] = eq_fn,
[TCG_COND_LT] = lt_fn,
[TCG_COND_LE] = le_fn,
[TCG_COND_LTU] = ltu_fn,
[TCG_COND_LEU] = leu_fn,
};
TCGType type;
check_size_align(oprsz, maxsz, dofs | aofs);
check_overlap_2(dofs, aofs, maxsz);
if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
do_dup(MO_8, dofs, oprsz, maxsz,
NULL, NULL, -(cond == TCG_COND_ALWAYS));
return;
}
/*
* Implement inline with a vector type, if possible.
* Prefer integer when 64-bit host and 64-bit comparison.
*/
type = choose_vector_type(cmp_list, vece, oprsz,
TCG_TARGET_REG_BITS == 64 && vece == MO_64);
if (type != 0) {
const TCGOpcode *hold_list = tcg_swap_vecop_list(cmp_list);
TCGv_vec t_vec = tcg_temp_new_vec(type);
uint32_t some;
tcg_gen_dup_i64_vec(vece, t_vec, c);
switch (type) {
case TCG_TYPE_V256:
some = QEMU_ALIGN_DOWN(oprsz, 32);
expand_cmps_vec(vece, dofs, aofs, some, 32,
TCG_TYPE_V256, cond, t_vec);
aofs += some;
dofs += some;
oprsz -= some;
maxsz -= some;
/* fallthru */
case TCG_TYPE_V128:
some = QEMU_ALIGN_DOWN(oprsz, 16);
expand_cmps_vec(vece, dofs, aofs, some, 16,
TCG_TYPE_V128, cond, t_vec);
break;
case TCG_TYPE_V64:
some = QEMU_ALIGN_DOWN(oprsz, 8);
expand_cmps_vec(vece, dofs, aofs, some, 8,
TCG_TYPE_V64, cond, t_vec);
break;
default:
g_assert_not_reached();
}
tcg_temp_free_vec(t_vec);
tcg_swap_vecop_list(hold_list);
} else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
TCGv_i64 t0 = tcg_temp_ebb_new_i64();
uint32_t i;
for (i = 0; i < oprsz; i += 8) {
tcg_gen_ld_i64(t0, cpu_env, aofs + i);
tcg_gen_negsetcond_i64(cond, t0, t0, c);
tcg_gen_st_i64(t0, cpu_env, dofs + i);
}
tcg_temp_free_i64(t0);
} else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
TCGv_i32 t0 = tcg_temp_ebb_new_i32();
TCGv_i32 t1 = tcg_temp_ebb_new_i32();
uint32_t i;
tcg_gen_extrl_i64_i32(t1, c);
for (i = 0; i < oprsz; i += 4) {
tcg_gen_ld_i32(t0, cpu_env, aofs + i);
tcg_gen_negsetcond_i32(cond, t0, t0, t1);
tcg_gen_st_i32(t0, cpu_env, dofs + i);
}
tcg_temp_free_i32(t0);
tcg_temp_free_i32(t1);
} else {
gen_helper_gvec_2i * const *fn = fns[cond];
bool inv = false;
if (fn == NULL) {
cond = tcg_invert_cond(cond);
fn = fns[cond];
assert(fn != NULL);
inv = true;
}
tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, inv, fn[vece]);
return;
}
if (oprsz < maxsz) {
expand_clr(dofs + oprsz, maxsz - oprsz);
}
}
void tcg_gen_gvec_cmpi(TCGCond cond, unsigned vece, uint32_t dofs,
uint32_t aofs, int64_t c,
uint32_t oprsz, uint32_t maxsz)
{
TCGv_i64 tmp = tcg_constant_i64(c);
tcg_gen_gvec_cmps(cond, vece, dofs, aofs, tmp, oprsz, maxsz);
}
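As a usage sketch (the offsets and sizes here are hypothetical), a front end could set each byte of a 16-byte destination region to -1 where the corresponding source byte is negative and to 0 elsewhere:

/* Illustration only: byte-wise "x < 0" over a 16-byte vector in env. */
tcg_gen_gvec_cmpi(TCG_COND_LT, MO_8, dofs, aofs, 0, 16, 16);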
static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
{
TCGv_i64 t = tcg_temp_ebb_new_i64();

View File

@ -108,6 +108,7 @@ static void tcg_register_jit_int(const void *buf, size_t size,
__attribute__((unused));
/* Forward declarations for functions declared and used in tcg-target.c.inc. */
static void tcg_out_tb_start(TCGContext *s);
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
intptr_t arg2);
static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
@ -171,7 +172,7 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
const TCGHelperInfo *info);
static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot);
static bool tcg_target_const_match(int64_t val, TCGType type, int ct);
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece);
#ifdef TCG_TARGET_NEED_LDST_LABELS
static int tcg_out_ldst_finalize(TCGContext *s);
#endif
@ -4689,7 +4690,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
ts = arg_temp(arg);
if (ts->val_type == TEMP_VAL_CONST
&& tcg_target_const_match(ts->val, ts->type, arg_ct->ct)) {
&& tcg_target_const_match(ts->val, ts->type, arg_ct->ct, TCGOP_VECE(op))) {
/* constant is OK for instruction */
const_args[i] = 1;
new_args[i] = ts->val;
@ -6014,6 +6015,8 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
s->gen_insn_data =
tcg_malloc(sizeof(uint64_t) * s->gen_tb->icount * start_words);
tcg_out_tb_start(s);
num_insns = -1;
QTAILQ_FOREACH(op, &s->ops, link) {
TCGOpcode opc = op->opc;

View File

@ -913,7 +913,7 @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
}
/* Test if a constant matches the constraint. */
static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
{
return ct & TCG_CT_CONST;
}
@ -955,6 +955,11 @@ static inline void tcg_target_qemu_prologue(TCGContext *s)
{
}
static void tcg_out_tb_start(TCGContext *s)
{
/* nothing to do */
}
bool tcg_target_has_memory_bswap(MemOp memop)
{
return true;

View File

@ -4,7 +4,7 @@
#
VPATH += $(SRC_PATH)/tests/tcg/m68k
TESTS += trap
TESTS += trap denormal
# On m68k Linux supports 4k and 8k pages (but 8k is currently broken)
EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-8192

tests/tcg/m68k/denormal.c (new file, 53 lines)
View File

@ -0,0 +1,53 @@
/*
* Test m68k extended double denormals.
*/
#include <stdio.h>
#include <stdint.h>
#define TEST(X, Y) { X, Y, X * Y }
static volatile long double test[][3] = {
TEST(0x1p+16383l, 0x1p-16446l),
TEST(0x1.1p-8223l, 0x1.1p-8224l),
TEST(1.0l, 0x1p-16383l),
};
#undef TEST
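Each TEST row records the two factors together with their product as evaluated by the compiler at build time, so any mismatch with the run-time product indicates a softfloat bug. For example, the last row expands to:

/* TEST(1.0l, 0x1p-16383l) becomes: */
{ 1.0l, 0x1p-16383l, 1.0l * 0x1p-16383l },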
static void dump_ld(const char *label, long double ld)
{
union {
long double d;
struct {
uint32_t exp:16;
uint32_t space:16;
uint32_t h;
uint32_t l;
};
} u;
u.d = ld;
printf("%12s: % -27La 0x%04x 0x%08x 0x%08x\n", label, u.d, u.exp, u.h, u.l);
}
int main(void)
{
int i, n = sizeof(test) / sizeof(test[0]), err = 0;
for (i = 0; i < n; ++i) {
long double x = test[i][0];
long double y = test[i][1];
long double build_mul = test[i][2];
long double runtime_mul = x * y;
if (runtime_mul != build_mul) {
dump_ld("x", x);
dump_ld("y", y);
dump_ld("build_mul", build_mul);
dump_ld("runtime_mul", runtime_mul);
err = 1;
}
}
return err;
}

View File

@ -13,6 +13,9 @@
# include <asm/hwcap.h>
# include "elf.h"
# endif
# ifndef HWCAP2_BTI
# define HWCAP2_BTI 0 /* added in glibc 2.32 */
# endif
#endif
#ifdef CONFIG_DARWIN
# include <sys/sysctl.h>
@ -58,12 +61,16 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0);
unsigned long hwcap2 = qemu_getauxval(AT_HWCAP2);
info |= (hwcap2 & HWCAP2_BTI ? CPUINFO_BTI : 0);
#endif
#ifdef CONFIG_DARWIN
info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE2") * CPUINFO_LSE2;
info |= sysctl_for_bool("hw.optional.arm.FEAT_AES") * CPUINFO_AES;
info |= sysctl_for_bool("hw.optional.arm.FEAT_PMULL") * CPUINFO_PMULL;
info |= sysctl_for_bool("hw.optional.arm.FEAT_BTI") * CPUINFO_BTI;
#endif
cpuinfo = info;

View File

@ -585,7 +585,7 @@ char *qemu_get_pid_name(pid_t pid)
void *qemu_alloc_stack(size_t *sz)
{
void *ptr, *guardpage;
void *ptr;
int flags;
#ifdef CONFIG_DEBUG_STACK_USAGE
void *ptr2;
@ -618,17 +618,8 @@ void *qemu_alloc_stack(size_t *sz)
abort();
}
#if defined(HOST_IA64)
/* separate register stack */
guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
#elif defined(HOST_HPPA)
/* stack grows up */
guardpage = ptr + *sz - pagesz;
#else
/* stack grows down */
guardpage = ptr;
#endif
if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
/* Stack grows down -- guard page at the bottom. */
if (mprotect(ptr, pagesz, PROT_NONE) != 0) {
perror("failed to set up stack guard page");
abort();
}
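With the old IA64 and HPPA special cases gone, the stack always grows down and the guard is simply the first page of the allocation. An illustrative layout, assuming 4 KiB pages:

/*
 * Illustration only:
 *
 *   ptr                ptr + pagesz                              ptr + *sz
 *    |  PROT_NONE guard  |  usable stack, grows down from ptr + *sz  |
 */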