diff --git a/cpu-exec.c b/cpu-exec.c
index b240b9fa45..9f4bd0b6dd 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -147,7 +147,8 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
                            itb->tc_ptr, itb->pc, lookup_symbol(itb->pc));
 
 #if defined(DEBUG_DISAS)
-    if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) {
+    if (qemu_loglevel_mask(CPU_LOG_TB_CPU)
+        && qemu_log_in_addr_range(itb->pc)) {
 #if defined(TARGET_I386)
         log_cpu_state(cpu, CPU_DUMP_CCOP);
 #elif defined(TARGET_M68K)
diff --git a/cputlb.c b/cputlb.c
index d068ee597e..3c99c34ac8 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -543,10 +543,8 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
 #undef MMUSUFFIX
 
 #define MMUSUFFIX _cmmu
-#undef GETPC_ADJ
-#define GETPC_ADJ 0
-#undef GETRA
-#define GETRA() ((uintptr_t)0)
+#undef GETPC
+#define GETPC() ((uintptr_t)0)
 #define SOFTMMU_CODE_ACCESS
 
 #define SHIFT 0
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index a0e87be88f..008e09a3c1 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -349,13 +349,12 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
     tb_next->jmp_list_first = (uintptr_t)tb | n;
 }
 
-/* GETRA is the true target of the return instruction that we'll execute,
-   defined here for simplicity of defining the follow-up macros.  */
+/* GETPC is the true target of the return instruction that we'll execute.  */
 #if defined(CONFIG_TCG_INTERPRETER)
 extern uintptr_t tci_tb_ptr;
-# define GETRA() tci_tb_ptr
+# define GETPC() tci_tb_ptr
 #else
-# define GETRA() \
+# define GETPC() \
     ((uintptr_t)__builtin_extract_return_addr(__builtin_return_address(0)))
 #endif
 
@@ -368,8 +367,6 @@ extern uintptr_t tci_tb_ptr;
    smaller than 4 bytes, so we don't worry about special-casing this.  */
 #define GETPC_ADJ   2
 
-#define GETPC()  (GETRA() - GETPC_ADJ)
-
 #if !defined(CONFIG_USER_ONLY)
 
 struct MemoryRegion *iotlb_to_region(CPUState *cpu,
diff --git a/softmmu_template.h b/softmmu_template.h
index 284ab2c7b2..27ed2694df 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -146,14 +146,11 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
     unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
-    int a_bits = get_alignment_bits(get_memop(oi));
+    unsigned a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
     DATA_TYPE res;
 
-    /* Adjust the given return address.  */
-    retaddr -= GETPC_ADJ;
-
-    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+    if (addr & ((1 << a_bits) - 1)) {
         cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
                              mmu_idx, retaddr);
     }
@@ -193,10 +190,8 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
     do_unaligned_access:
         addr1 = addr & ~(DATA_SIZE - 1);
         addr2 = addr1 + DATA_SIZE;
-        /* Note the adjustment at the beginning of the function.
-           Undo that for the recursion.  */
-        res1 = helper_le_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
-        res2 = helper_le_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
+        res1 = helper_le_ld_name(env, addr1, oi, retaddr);
+        res2 = helper_le_ld_name(env, addr2, oi, retaddr);
         shift = (addr & (DATA_SIZE - 1)) * 8;
 
         /* Little-endian combine.  */
@@ -220,14 +215,11 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
     unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
-    int a_bits = get_alignment_bits(get_memop(oi));
+    unsigned a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
     DATA_TYPE res;
 
-    /* Adjust the given return address.  */
-    retaddr -= GETPC_ADJ;
-
-    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+    if (addr & ((1 << a_bits) - 1)) {
         cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
                              mmu_idx, retaddr);
     }
@@ -267,10 +259,8 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
     do_unaligned_access:
         addr1 = addr & ~(DATA_SIZE - 1);
         addr2 = addr1 + DATA_SIZE;
-        /* Note the adjustment at the beginning of the function.
-           Undo that for the recursion.  */
-        res1 = helper_be_ld_name(env, addr1, oi, retaddr + GETPC_ADJ);
-        res2 = helper_be_ld_name(env, addr2, oi, retaddr + GETPC_ADJ);
+        res1 = helper_be_ld_name(env, addr1, oi, retaddr);
+        res2 = helper_be_ld_name(env, addr2, oi, retaddr);
         shift = (addr & (DATA_SIZE - 1)) * 8;
 
         /* Big-endian combine.  */
@@ -331,13 +321,10 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
     unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
-    int a_bits = get_alignment_bits(get_memop(oi));
+    unsigned a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
 
-    /* Adjust the given return address.  */
-    retaddr -= GETPC_ADJ;
-
-    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+    if (addr & ((1 << a_bits) - 1)) {
         cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
                              mmu_idx, retaddr);
     }
@@ -391,10 +378,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
         for (i = 0; i < DATA_SIZE; ++i) {
             /* Little-endian extract.  */
             uint8_t val8 = val >> (i * 8);
-            /* Note the adjustment at the beginning of the function.
-               Undo that for the recursion.  */
             glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
-                                            oi, retaddr + GETPC_ADJ);
+                                            oi, retaddr);
         }
         return;
     }
@@ -414,13 +399,10 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
     unsigned mmu_idx = get_mmuidx(oi);
     int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
     target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
-    int a_bits = get_alignment_bits(get_memop(oi));
+    unsigned a_bits = get_alignment_bits(get_memop(oi));
     uintptr_t haddr;
 
-    /* Adjust the given return address.  */
-    retaddr -= GETPC_ADJ;
-
-    if (a_bits > 0 && (addr & ((1 << a_bits) - 1)) != 0) {
+    if (addr & ((1 << a_bits) - 1)) {
         cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
                              mmu_idx, retaddr);
     }
@@ -474,10 +456,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
         for (i = 0; i < DATA_SIZE; ++i) {
             /* Big-endian extract.  */
             uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
-            /* Note the adjustment at the beginning of the function.
-               Undo that for the recursion.  */
             glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
-                                            oi, retaddr + GETPC_ADJ);
+                                            oi, retaddr);
         }
         return;
     }
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index 0ea0e6e146..c27c7b9cc4 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -2338,11 +2338,11 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
             break;
         case 0x4000:
             /* MB */
-            /* No-op */
+            tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
             break;
         case 0x4400:
             /* WMB */
-            /* No-op */
+            tcg_gen_mb(TCG_MO_ST_ST | TCG_BAR_SC);
             break;
         case 0x8000:
             /* FETCH */
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 5484c15d1a..25f612d493 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -8310,12 +8310,12 @@ void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
              * this purpose use the actual register value passed to us
              * so that we get the fault address right.
              */
-            helper_ret_stb_mmu(env, vaddr_in, 0, oi, GETRA());
+            helper_ret_stb_mmu(env, vaddr_in, 0, oi, GETPC());
             /* Now we can populate the other TLB entries, if any */
             for (i = 0; i < maxidx; i++) {
                 uint64_t va = vaddr + TARGET_PAGE_SIZE * i;
                 if (va != (vaddr_in & TARGET_PAGE_MASK)) {
-                    helper_ret_stb_mmu(env, va, 0, oi, GETRA());
+                    helper_ret_stb_mmu(env, va, 0, oi, GETPC());
                 }
             }
         }
@@ -8332,7 +8332,7 @@ void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
          *    bounce buffer was in use
          */
         for (i = 0; i < blocklen; i++) {
-            helper_ret_stb_mmu(env, vaddr + i, 0, oi, GETRA());
+            helper_ret_stb_mmu(env, vaddr + i, 0, oi, GETPC());
         }
     }
 #else
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index f5e29d20a1..ddf52f5e79 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1294,6 +1294,8 @@ static void gen_clrex(DisasContext *s, uint32_t insn)
 static void handle_sync(DisasContext *s, uint32_t insn,
                         unsigned int op1, unsigned int op2, unsigned int crm)
 {
+    TCGBar bar;
+
     if (op1 != 3) {
         unallocated_encoding(s);
         return;
@@ -1305,7 +1307,18 @@ static void handle_sync(DisasContext *s, uint32_t insn,
         return;
     case 4: /* DSB */
     case 5: /* DMB */
-        /* We don't emulate caches so barriers are no-ops */
+        switch (crm & 3) {
+        case 1: /* MBReqTypes_Reads */
+            bar = TCG_BAR_SC | TCG_MO_LD_LD | TCG_MO_LD_ST;
+            break;
+        case 2: /* MBReqTypes_Writes */
+            bar = TCG_BAR_SC | TCG_MO_ST_ST;
+            break;
+        default: /* MBReqTypes_All */
+            bar = TCG_BAR_SC | TCG_MO_ALL;
+            break;
+        }
+        tcg_gen_mb(bar);
         return;
     case 6: /* ISB */
         /* We need to break the TB after this insn to execute
@@ -1934,7 +1947,13 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn)
         if (!is_store) {
             s->is_ldex = true;
             gen_load_exclusive(s, rt, rt2, tcg_addr, size, is_pair);
+            if (is_lasr) {
+                tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
+            }
         } else {
+            if (is_lasr) {
+                tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+            }
             gen_store_exclusive(s, rs, rt, rt2, tcg_addr, size, is_pair);
         }
     } else {
@@ -1943,11 +1962,17 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn)
 
         /* Generate ISS for non-exclusive accesses including LASR.  */
         if (is_store) {
+            if (is_lasr) {
+                tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
+            }
             do_gpr_st(s, tcg_rt, tcg_addr, size,
                       true, rt, iss_sf, is_lasr);
         } else {
             do_gpr_ld(s, tcg_rt, tcg_addr, size, false, false,
                       true, rt, iss_sf, is_lasr);
+            if (is_lasr) {
+                tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);
+            }
         }
     }
 }
diff --git a/target-arm/translate.c b/target-arm/translate.c
index bd5d5cb576..693d4bc6a2 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -8083,7 +8083,7 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
             case 4: /* dsb */
             case 5: /* dmb */
                 ARCH(7);
-                /* We don't emulate caches so these are a no-op.  */
+                tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
                 return;
             case 6: /* isb */
                 /* We need to break the TB after this insn to execute
@@ -10432,7 +10432,7 @@ static int disas_thumb2_insn(CPUARMState *env, DisasContext *s, uint16_t insn_hw
                             break;
                         case 4: /* dsb */
                         case 5: /* dmb */
-                            /* These execute as NOPs.  */
+                            tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
                             break;
                         case 6: /* isb */
                             /* We need to break the TB after this insn
diff --git a/target-i386/translate.c b/target-i386/translate.c
index fa2ac48173..9447557911 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -8012,13 +8012,21 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
                 || (prefixes & PREFIX_LOCK)) {
                 goto illegal_op;
             }
+            tcg_gen_mb(TCG_MO_ST_ST | TCG_BAR_SC);
             break;
         case 0xe8 ... 0xef: /* lfence */
+            if (!(s->cpuid_features & CPUID_SSE)
+                || (prefixes & PREFIX_LOCK)) {
+                goto illegal_op;
+            }
+            tcg_gen_mb(TCG_MO_LD_LD | TCG_BAR_SC);
+            break;
         case 0xf0 ... 0xf7: /* mfence */
             if (!(s->cpuid_features & CPUID_SSE2)
                 || (prefixes & PREFIX_LOCK)) {
                 goto illegal_op;
             }
+            tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
             break;
 
         default:
diff --git a/target-mips/op_helper.c b/target-mips/op_helper.c
index ea2f2abe19..7af4c2f084 100644
--- a/target-mips/op_helper.c
+++ b/target-mips/op_helper.c
@@ -4122,10 +4122,10 @@ void helper_msa_ld_ ## TYPE(CPUMIPSState *env, uint32_t wd,             \
 }
 
 #if !defined(CONFIG_USER_ONLY)
-MSA_LD_DF(DF_BYTE,   b, helper_ret_ldub_mmu, oi, GETRA())
-MSA_LD_DF(DF_HALF,   h, helper_ret_lduw_mmu, oi, GETRA())
-MSA_LD_DF(DF_WORD,   w, helper_ret_ldul_mmu, oi, GETRA())
-MSA_LD_DF(DF_DOUBLE, d, helper_ret_ldq_mmu,  oi, GETRA())
+MSA_LD_DF(DF_BYTE,   b, helper_ret_ldub_mmu, oi, GETPC())
+MSA_LD_DF(DF_HALF,   h, helper_ret_lduw_mmu, oi, GETPC())
+MSA_LD_DF(DF_WORD,   w, helper_ret_ldul_mmu, oi, GETPC())
+MSA_LD_DF(DF_DOUBLE, d, helper_ret_ldq_mmu,  oi, GETPC())
 #else
 MSA_LD_DF(DF_BYTE,   b, cpu_ldub_data)
 MSA_LD_DF(DF_HALF,   h, cpu_lduw_data)
@@ -4161,17 +4161,17 @@ void helper_msa_st_ ## TYPE(CPUMIPSState *env, uint32_t wd,             \
     int mmu_idx = cpu_mmu_index(env, false);				\
     int i;                                                              \
     MEMOP_IDX(DF)                                                       \
-    ensure_writable_pages(env, addr, mmu_idx, GETRA());                 \
+    ensure_writable_pages(env, addr, mmu_idx, GETPC());                 \
     for (i = 0; i < DF_ELEMENTS(DF); i++) {                             \
         ST_INSN(env, addr + (i << DF), pwd->TYPE[i], ##__VA_ARGS__);    \
     }                                                                   \
 }
 
 #if !defined(CONFIG_USER_ONLY)
-MSA_ST_DF(DF_BYTE,   b, helper_ret_stb_mmu, oi, GETRA())
-MSA_ST_DF(DF_HALF,   h, helper_ret_stw_mmu, oi, GETRA())
-MSA_ST_DF(DF_WORD,   w, helper_ret_stl_mmu, oi, GETRA())
-MSA_ST_DF(DF_DOUBLE, d, helper_ret_stq_mmu, oi, GETRA())
+MSA_ST_DF(DF_BYTE,   b, helper_ret_stb_mmu, oi, GETPC())
+MSA_ST_DF(DF_HALF,   h, helper_ret_stw_mmu, oi, GETPC())
+MSA_ST_DF(DF_WORD,   w, helper_ret_stl_mmu, oi, GETPC())
+MSA_ST_DF(DF_DOUBLE, d, helper_ret_stq_mmu, oi, GETPC())
 #else
 MSA_ST_DF(DF_BYTE,   b, cpu_stb_data)
 MSA_ST_DF(DF_HALF,   h, cpu_stw_data)
diff --git a/tcg/README b/tcg/README
index ce8bebab37..1d48aa963f 100644
--- a/tcg/README
+++ b/tcg/README
@@ -402,6 +402,23 @@ double-word product T0.  The later is returned in two single-word outputs.
 
 Similar to mulu2, except the two inputs T1 and T2 are signed.
 
+********* Memory Barrier support
+
+* mb <$arg>
+
+Generate a target memory barrier instruction to ensure memory ordering as being
+enforced by a corresponding guest memory barrier instruction. The ordering
+enforced by the backend may be stricter than the ordering required by the guest.
+It cannot be weaker. This opcode takes a constant argument which is required to
+generate the appropriate barrier instruction. The backend should take care to
+emit the target barrier instruction only when necessary i.e., for SMP guests and
+when MTTCG is enabled.
+
+The guest translators should generate this opcode for all guest instructions
+which have ordering side effects.
+
+Please see docs/atomics.txt for more information on memory barriers.
+
 ********* 64-bit guest on 32-bit host support
 
 The following opcodes are internal to TCG.  Thus they are to be implemented by
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 08b2d031aa..1939d3528f 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -372,6 +372,11 @@ typedef enum {
     I3510_EOR       = 0x4a000000,
     I3510_EON       = 0x4a200000,
     I3510_ANDS      = 0x6a000000,
+
+    /* System instructions.  */
+    DMB_ISH         = 0xd50338bf,
+    DMB_LD          = 0x00000100,
+    DMB_ST          = 0x00000200,
 } AArch64Insn;
 
 static inline uint32_t tcg_in32(TCGContext *s)
@@ -981,6 +986,18 @@ static inline void tcg_out_addsub2(TCGContext *s, int ext, TCGReg rl,
     tcg_out_mov(s, ext, orig_rl, rl);
 }
 
+static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    static const uint32_t sync[] = {
+        [0 ... TCG_MO_ALL]            = DMB_ISH | DMB_LD | DMB_ST,
+        [TCG_MO_ST_ST]                = DMB_ISH | DMB_ST,
+        [TCG_MO_LD_LD]                = DMB_ISH | DMB_LD,
+        [TCG_MO_LD_ST]                = DMB_ISH | DMB_LD,
+        [TCG_MO_LD_ST | TCG_MO_LD_LD] = DMB_ISH | DMB_LD,
+    };
+    tcg_out32(s, sync[a0 & TCG_MO_ALL]);
+}
+
 #ifdef CONFIG_SOFTMMU
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     TCGMemOpIdx oi, uintptr_t ra)
@@ -1081,23 +1098,24 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
     int tlb_offset = is_read ?
         offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
         : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write);
-    int a_bits = get_alignment_bits(opc);
+    unsigned a_bits = get_alignment_bits(opc);
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_mask = (1u << a_bits) - 1;
+    unsigned s_mask = (1u << s_bits) - 1;
     TCGReg base = TCG_AREG0, x3;
     uint64_t tlb_mask;
 
     /* For aligned accesses, we check the first byte and include the alignment
        bits within the address.  For unaligned access, we check that we don't
        cross pages using the address of the last byte of the access.  */
-    if (a_bits >= 0) {
-        /* A byte access or an alignment check required */
-        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
+    if (a_bits >= s_bits) {
         x3 = addr_reg;
     } else {
         tcg_out_insn(s, 3401, ADDI, TARGET_LONG_BITS == 64,
-                     TCG_REG_X3, addr_reg, (1 << (opc & MO_SIZE)) - 1);
-        tlb_mask = TARGET_PAGE_MASK;
+                     TCG_REG_X3, addr_reg, s_mask - a_mask);
         x3 = TCG_REG_X3;
     }
+    tlb_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
 
     /* Extract the TLB index from the address into X0.
        X0<CPU_TLB_BITS:0> =
@@ -1648,6 +1666,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_insn(s, 3508, SMULH, TCG_TYPE_I64, a0, a1, a2);
         break;
 
+    case INDEX_op_mb:
+        tcg_out_mb(s, a0);
+        break;
+
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
@@ -1772,6 +1794,7 @@ static const TCGTargetOpDef aarch64_op_defs[] = {
     { INDEX_op_muluh_i64, { "r", "r", "r" } },
     { INDEX_op_mulsh_i64, { "r", "r", "r" } },
 
+    { INDEX_op_mb, { } },
     { -1 },
 };
 
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 172febafdd..ffa0d40660 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -313,6 +313,10 @@ typedef enum {
     INSN_LDRD_REG  = 0x000000d0,
     INSN_STRD_IMM  = 0x004000f0,
     INSN_STRD_REG  = 0x000000f0,
+
+    INSN_DMB_ISH   = 0x5bf07ff5,
+    INSN_DMB_MCR   = 0xba0f07ee,
+
 } ARMInsn;
 
 #define SHIFT_IMM_LSL(im)	(((im) << 7) | 0x00)
@@ -1066,6 +1070,15 @@ static inline void tcg_out_goto_label(TCGContext *s, int cond, TCGLabel *l)
     }
 }
 
+static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    if (use_armv7_instructions) {
+        tcg_out32(s, INSN_DMB_ISH);
+    } else if (use_armv6_instructions) {
+        tcg_out32(s, INSN_DMB_MCR);
+    }
+}
+
 #ifdef CONFIG_SOFTMMU
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
@@ -1168,7 +1181,7 @@ QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
    containing the addend of the tlb entry.  Clobbers R0, R1, R2, TMP.  */
 
 static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
-                               TCGMemOp s_bits, int mem_index, bool is_load)
+                               TCGMemOp opc, int mem_index, bool is_load)
 {
     TCGReg base = TCG_AREG0;
     int cmp_off =
@@ -1176,6 +1189,8 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
          ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
          : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
     int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_bits = get_alignment_bits(opc);
 
     /* Should generate something like the following:
      *   shr    tmp, addrlo, #TARGET_PAGE_BITS                    (1)
@@ -1216,10 +1231,13 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
         }
     }
 
-    /* Check alignment.  */
-    if (s_bits) {
-        tcg_out_dat_imm(s, COND_AL, ARITH_TST,
-                        0, addrlo, (1 << s_bits) - 1);
+    /* Check alignment.  We don't support inline unaligned acceses,
+       but we can easily support overalignment checks.  */
+    if (a_bits < s_bits) {
+        a_bits = s_bits;
+    }
+    if (a_bits) {
+        tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, (1 << a_bits) - 1);
     }
 
     /* Load the tlb addend.  */
@@ -1499,7 +1517,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 
 #ifdef CONFIG_SOFTMMU
     mem_index = get_mmuidx(oi);
-    addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 1);
+    addend = tcg_out_tlb_read(s, addrlo, addrhi, opc, mem_index, 1);
 
     /* This a conditional BL only to load a pointer within this opcode into LR
        for the slow path.  We will not be using the value for a tail call.  */
@@ -1630,7 +1648,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 
 #ifdef CONFIG_SOFTMMU
     mem_index = get_mmuidx(oi);
-    addend = tcg_out_tlb_read(s, addrlo, addrhi, opc & MO_SIZE, mem_index, 0);
+    addend = tcg_out_tlb_read(s, addrlo, addrhi, opc, mem_index, 0);
 
     tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
 
@@ -1923,6 +1941,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_udiv(s, COND_AL, args[0], args[1], args[2]);
         break;
 
+    case INDEX_op_mb:
+        tcg_out_mb(s, args[0]);
+        break;
+
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
@@ -1997,6 +2019,7 @@ static const TCGTargetOpDef arm_op_defs[] = {
     { INDEX_op_div_i32, { "r", "r", "r" } },
     { INDEX_op_divu_i32, { "r", "r", "r" } },
 
+    { INDEX_op_mb, { } },
     { -1 },
 };
 
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 6f8cdca756..cf7536bd52 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -686,6 +686,18 @@ static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
     }
 }
 
+static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    /* Given the strength of x86 memory ordering, we only need care for
+       store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
+       faster than "mfence", so don't bother with the sse insn.  */
+    if (a0 & TCG_MO_ST_LD) {
+        tcg_out8(s, 0xf0);
+        tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
+        tcg_out8(s, 0);
+    }
+}
+
 static inline void tcg_out_push(TCGContext *s, int reg)
 {
     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
@@ -1202,7 +1214,10 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     TCGType ttype = TCG_TYPE_I32;
     TCGType tlbtype = TCG_TYPE_I32;
     int trexw = 0, hrexw = 0, tlbrexw = 0;
-    int a_bits = get_alignment_bits(opc);
+    unsigned a_bits = get_alignment_bits(opc);
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_mask = (1 << a_bits) - 1;
+    unsigned s_mask = (1 << s_bits) - 1;
     target_ulong tlb_mask;
 
     if (TCG_TARGET_REG_BITS == 64) {
@@ -1220,17 +1235,15 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     }
 
     tcg_out_mov(s, tlbtype, r0, addrlo);
-    if (a_bits >= 0) {
-        /* A byte access or an alignment check required */
+    /* If the required alignment is at least as large as the access, simply
+       copy the address and mask.  For lesser alignments, check that we don't
+       cross pages for the complete access.  */
+    if (a_bits >= s_bits) {
         tcg_out_mov(s, ttype, r1, addrlo);
-        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
     } else {
-        /* For unaligned access check that we don't cross pages using
-           the page address of the last byte.  */
-        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo,
-                             (1 << (opc & MO_SIZE)) - 1);
-        tlb_mask = TARGET_PAGE_MASK;
+        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
     }
+    tlb_mask = TARGET_PAGE_MASK | a_mask;
 
     tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                    TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
@@ -2130,6 +2143,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    case INDEX_op_mb:
+        tcg_out_mb(s, args[0]);
+        break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
@@ -2195,6 +2211,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_add2_i32, { "r", "r", "0", "1", "ri", "ri" } },
     { INDEX_op_sub2_i32, { "r", "r", "0", "1", "ri", "ri" } },
 
+    { INDEX_op_mb, { } },
+
 #if TCG_TARGET_REG_BITS == 32
     { INDEX_op_brcond2_i32, { "r", "r", "ri", "ri" } },
     { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
diff --git a/tcg/ia64/tcg-target.inc.c b/tcg/ia64/tcg-target.inc.c
index c91f39281b..b04d716c3d 100644
--- a/tcg/ia64/tcg-target.inc.c
+++ b/tcg/ia64/tcg-target.inc.c
@@ -247,6 +247,7 @@ enum {
     OPC_LD4_M3                = 0x0a080000000ull,
     OPC_LD8_M1                = 0x080c0000000ull,
     OPC_LD8_M3                = 0x0a0c0000000ull,
+    OPC_MF_M24                = 0x00110000000ull,
     OPC_MUX1_I3               = 0x0eca0000000ull,
     OPC_NOP_B9                = 0x04008000000ull,
     OPC_NOP_F16               = 0x00008000000ull,
@@ -1496,10 +1497,18 @@ QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
    R1, R3 are clobbered, leaving R56 free for...
    BSWAP_1, BSWAP_2 and I-slot insns for swapping data for store.  */
 static inline void tcg_out_qemu_tlb(TCGContext *s, TCGReg addr_reg,
-                                    TCGMemOp s_bits, int off_rw, int off_add,
+                                    TCGMemOp opc, int off_rw, int off_add,
                                     uint64_t bswap1, uint64_t bswap2)
 {
-     /*
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_bits = get_alignment_bits(opc);
+
+    /* We don't support unaligned accesses, but overalignment is easy.  */
+    if (a_bits < s_bits) {
+        a_bits = s_bits;
+    }
+
+    /*
         .mii
         mov	r2 = off_rw
         extr.u	r3 = addr_reg, ...		# extract tlb page
@@ -1521,7 +1530,7 @@ static inline void tcg_out_qemu_tlb(TCGContext *s, TCGReg addr_reg,
         cmp.eq	p6, p7 = r3, r58
         nop
         ;;
-      */
+    */
     tcg_out_bundle(s, miI,
                    tcg_opc_movi_a(TCG_REG_P0, TCG_REG_R2, off_rw),
                    tcg_opc_i11(TCG_REG_P0, OPC_EXTR_U_I11, TCG_REG_R3,
@@ -1536,8 +1545,8 @@ static inline void tcg_out_qemu_tlb(TCGContext *s, TCGReg addr_reg,
                                TCG_REG_R3, 63 - CPU_TLB_ENTRY_BITS,
                                63 - CPU_TLB_ENTRY_BITS),
                    tcg_opc_i14(TCG_REG_P0, OPC_DEP_I14, TCG_REG_R1, 0,
-                               TCG_REG_R57, 63 - s_bits,
-                               TARGET_PAGE_BITS - s_bits - 1));
+                               TCG_REG_R57, 63 - a_bits,
+                               TARGET_PAGE_BITS - a_bits - 1));
     tcg_out_bundle(s, MmI,
                    tcg_opc_a1 (TCG_REG_P0, OPC_ADD_A1,
                                TCG_REG_R2, TCG_REG_R2, TCG_REG_R3),
@@ -1661,7 +1670,7 @@ static inline void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args)
     s_bits = opc & MO_SIZE;
 
     /* Read the TLB entry */
-    tcg_out_qemu_tlb(s, addr_reg, s_bits,
+    tcg_out_qemu_tlb(s, addr_reg, opc,
                      offsetof(CPUArchState, tlb_table[mem_index][0].addr_read),
                      offsetof(CPUArchState, tlb_table[mem_index][0].addend),
                      INSN_NOP_I, INSN_NOP_I);
@@ -1739,7 +1748,7 @@ static inline void tcg_out_qemu_st(TCGContext *s, const TCGArg *args)
         pre1 = tcg_opc_ext_i(TCG_REG_P0, opc, TCG_REG_R58, data_reg);
     }
 
-    tcg_out_qemu_tlb(s, addr_reg, s_bits,
+    tcg_out_qemu_tlb(s, addr_reg, opc,
                      offsetof(CPUArchState, tlb_table[mem_index][0].addr_write),
                      offsetof(CPUArchState, tlb_table[mem_index][0].addend),
                      pre1, pre2);
@@ -2223,6 +2232,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_qemu_st(s, args);
         break;
 
+    case INDEX_op_mb:
+        tcg_out_bundle(s, mmI, OPC_MF_M24, INSN_NOP_M, INSN_NOP_I);
+        break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
@@ -2336,6 +2348,7 @@ static const TCGTargetOpDef ia64_op_defs[] = {
     { INDEX_op_qemu_st_i32, { "SZ", "r" } },
     { INDEX_op_qemu_st_i64, { "SZ", "r" } },
 
+    { INDEX_op_mb, { } },
     { -1 },
 };
 
diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index 2f9be48139..abce6026f8 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -292,6 +292,7 @@ typedef enum {
     OPC_JALR     = OPC_SPECIAL | 0x09,
     OPC_MOVZ     = OPC_SPECIAL | 0x0A,
     OPC_MOVN     = OPC_SPECIAL | 0x0B,
+    OPC_SYNC     = OPC_SPECIAL | 0x0F,
     OPC_MFHI     = OPC_SPECIAL | 0x10,
     OPC_MFLO     = OPC_SPECIAL | 0x12,
     OPC_MULT     = OPC_SPECIAL | 0x18,
@@ -339,6 +340,14 @@ typedef enum {
      * backwards-compatible at the assembly level.
      */
     OPC_MUL      = use_mips32r6_instructions ? OPC_MUL_R6 : OPC_MUL_R5,
+
+    /* MIPS r6 introduced names for weaker variants of SYNC.  These are
+       backward compatible to previous architecture revisions.  */
+    OPC_SYNC_WMB     = OPC_SYNC | 0x04 << 5,
+    OPC_SYNC_MB      = OPC_SYNC | 0x10 << 5,
+    OPC_SYNC_ACQUIRE = OPC_SYNC | 0x11 << 5,
+    OPC_SYNC_RELEASE = OPC_SYNC | 0x12 << 5,
+    OPC_SYNC_RMB     = OPC_SYNC | 0x13 << 5,
 } MIPSInsn;
 
 /*
@@ -1040,7 +1049,9 @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
                              TCGReg addrh, TCGMemOpIdx oi,
                              tcg_insn_unit *label_ptr[2], bool is_load)
 {
-    TCGMemOp s_bits = get_memop(oi) & MO_SIZE;
+    TCGMemOp opc = get_memop(oi);
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_bits = get_alignment_bits(opc);
     int mem_index = get_mmuidx(oi);
     int cmp_off
         = (is_load
@@ -1071,10 +1082,15 @@ static void tcg_out_tlb_load(TCGContext *s, TCGReg base, TCGReg addrl,
     tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, TCG_REG_A0,
                     cmp_off + (TARGET_LONG_BITS == 64 ? LO_OFF : 0));
 
+    /* We don't currently support unaligned accesses.
+       We could do so with mips32r6.  */
+    if (a_bits < s_bits) {
+        a_bits = s_bits;
+    }
     /* Mask the page bits, keeping the alignment bits to compare against.
        In between on 32-bit targets, load the tlb addend for the fast path.  */
     tcg_out_movi(s, TCG_TYPE_I32, TCG_TMP1,
-                 TARGET_PAGE_MASK | ((1 << s_bits) - 1));
+                 TARGET_PAGE_MASK | ((1 << a_bits) - 1));
     if (TARGET_LONG_BITS == 32) {
         tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, TCG_REG_A0, add_off);
     }
@@ -1377,6 +1393,22 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64)
 #endif
 }
 
+static void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    static const MIPSInsn sync[] = {
+        /* Note that SYNC_MB is a slightly weaker than SYNC 0,
+           as the former is an ordering barrier and the latter
+           is a completion barrier.  */
+        [0 ... TCG_MO_ALL]            = OPC_SYNC_MB,
+        [TCG_MO_LD_LD]                = OPC_SYNC_RMB,
+        [TCG_MO_ST_ST]                = OPC_SYNC_WMB,
+        [TCG_MO_LD_ST]                = OPC_SYNC_RELEASE,
+        [TCG_MO_LD_ST | TCG_MO_ST_ST] = OPC_SYNC_RELEASE,
+        [TCG_MO_LD_ST | TCG_MO_LD_LD] = OPC_SYNC_ACQUIRE,
+    };
+    tcg_out32(s, sync[a0 & TCG_MO_ALL]);
+}
+
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
@@ -1646,6 +1678,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                         const_args[4], const_args[5], true);
         break;
 
+    case INDEX_op_mb:
+        tcg_out_mb(s, a0);
+        break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
@@ -1726,6 +1761,8 @@ static const TCGTargetOpDef mips_op_defs[] = {
     { INDEX_op_qemu_ld_i64, { "L", "L", "lZ", "lZ" } },
     { INDEX_op_qemu_st_i64, { "SZ", "SZ", "SZ", "SZ" } },
 #endif
+
+    { INDEX_op_mb, { } },
     { -1 },
 };
 
diff --git a/tcg/optimize.c b/tcg/optimize.c
index cffe89b525..9998ac7413 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -542,6 +542,7 @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 void tcg_optimize(TCGContext *s)
 {
     int oi, oi_next, nb_temps, nb_globals;
+    TCGArg *prev_mb_args = NULL;
 
     /* Array VALS has an element for each temp.
        If this temp holds a constant then its value is kept in VALS' element.
@@ -1295,5 +1296,43 @@ void tcg_optimize(TCGContext *s)
             }
             break;
         }
+
+        /* Eliminate duplicate and redundant fence instructions.  */
+        if (prev_mb_args) {
+            switch (opc) {
+            case INDEX_op_mb:
+                /* Merge two barriers of the same type into one,
+                 * or a weaker barrier into a stronger one,
+                 * or two weaker barriers into a stronger one.
+                 *   mb X; mb Y => mb X|Y
+                 *   mb; strl => mb; st
+                 *   ldaq; mb => ld; mb
+                 *   ldaq; strl => ld; mb; st
+                 * Other combinations are also merged into a strong
+                 * barrier.  This is stricter than specified but for
+                 * the purposes of TCG is better than not optimizing.
+                 */
+                prev_mb_args[0] |= args[0];
+                tcg_op_remove(s, op);
+                break;
+
+            default:
+                /* Opcodes that end the block stop the optimization.  */
+                if ((def->flags & TCG_OPF_BB_END) == 0) {
+                    break;
+                }
+                /* fallthru */
+            case INDEX_op_qemu_ld_i32:
+            case INDEX_op_qemu_ld_i64:
+            case INDEX_op_qemu_st_i32:
+            case INDEX_op_qemu_st_i64:
+            case INDEX_op_call:
+                /* Opcodes that touch guest memory stop the optimization.  */
+                prev_mb_args = NULL;
+                break;
+            }
+        } else if (opc == INDEX_op_mb) {
+            prev_mb_args = args;
+        }
     }
 }
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index eaf1bd9bfd..a3262cfb0c 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -469,6 +469,10 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define STHX   XO31(407)
 #define STWX   XO31(151)
 
+#define EIEIO  XO31(854)
+#define HWSYNC XO31(598)
+#define LWSYNC (HWSYNC | (1u << 21))
+
 #define SPR(a, b) ((((a)<<5)|(b))<<11)
 #define LR     SPR(8, 0)
 #define CTR    SPR(9, 0)
@@ -1243,6 +1247,18 @@ static void tcg_out_brcond2 (TCGContext *s, const TCGArg *args,
     tcg_out_bc(s, BC | BI(7, CR_EQ) | BO_COND_TRUE, arg_label(args[5]));
 }
 
+static void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    uint32_t insn = HWSYNC;
+    a0 &= TCG_MO_ALL;
+    if (a0 == TCG_MO_LD_LD) {
+        insn = LWSYNC;
+    } else if (a0 == TCG_MO_ST_ST) {
+        insn = EIEIO;
+    }
+    tcg_out32(s, insn);
+}
+
 #ifdef __powerpc64__
 void ppc_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
 {
@@ -1404,8 +1420,8 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp opc,
            : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
     int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
     TCGReg base = TCG_AREG0;
-    TCGMemOp s_bits = opc & MO_SIZE;
-    int a_bits = get_alignment_bits(opc);
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_bits = get_alignment_bits(opc);
 
     /* Extract the page index, shifted into place for tlb index.  */
     if (TCG_TARGET_REG_BITS == 64) {
@@ -1458,39 +1474,43 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGMemOp opc,
     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_R3, TCG_REG_R3, add_off);
 
     /* Clear the non-page, non-alignment bits from the address */
-    if (TCG_TARGET_REG_BITS == 32 || TARGET_LONG_BITS == 32) {
-        /* We don't support unaligned accesses on 32-bits, preserve
-         * the bottom bits and thus trigger a comparison failure on
-         * unaligned accesses
+    if (TCG_TARGET_REG_BITS == 32) {
+        /* We don't support unaligned accesses on 32-bits.
+         * Preserve the bottom bits and thus trigger a comparison
+         * failure on unaligned accesses.
          */
-        if (a_bits < 0) {
+        if (a_bits < s_bits) {
             a_bits = s_bits;
         }
         tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0,
                     (32 - a_bits) & 31, 31 - TARGET_PAGE_BITS);
-    } else if (a_bits) {
-        /* More than byte access, we need to handle alignment */
-        if (a_bits > 0) {
-            /* Alignment required by the front-end, same as 32-bits */
-            tcg_out_rld(s, RLDICL, TCG_REG_R0, addrlo,
+    } else {
+        TCGReg t = addrlo;
+
+        /* If the access is unaligned, we need to make sure we fail if we
+         * cross a page boundary.  The trick is to add the access size-1
+         * to the address before masking the low bits.  That will make the
+         * address overflow to the next page if we cross a page boundary,
+         * which will then force a mismatch of the TLB compare.
+         */
+        if (a_bits < s_bits) {
+            unsigned a_mask = (1 << a_bits) - 1;
+            unsigned s_mask = (1 << s_bits) - 1;
+            tcg_out32(s, ADDI | TAI(TCG_REG_R0, t, s_mask - a_mask));
+            t = TCG_REG_R0;
+        }
+
+        /* Mask the address for the requested alignment.  */
+        if (TARGET_LONG_BITS == 32) {
+            tcg_out_rlw(s, RLWINM, TCG_REG_R0, t, 0,
+                        (32 - a_bits) & 31, 31 - TARGET_PAGE_BITS);
+        } else if (a_bits == 0) {
+            tcg_out_rld(s, RLDICR, TCG_REG_R0, t, 0, 63 - TARGET_PAGE_BITS);
+        } else {
+            tcg_out_rld(s, RLDICL, TCG_REG_R0, t,
                         64 - TARGET_PAGE_BITS, TARGET_PAGE_BITS - a_bits);
             tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, TARGET_PAGE_BITS, 0);
-       } else {
-           /* We support unaligned accesses, we need to make sure we fail
-            * if we cross a page boundary. The trick is to add the
-            * access_size-1 to the address before masking the low bits.
-            * That will make the address overflow to the next page if we
-            * cross a page boundary which will then force a mismatch of
-            * the TLB compare since the next page cannot possibly be in
-            * the same TLB index.
-            */
-            tcg_out32(s, ADDI | TAI(TCG_REG_R0, addrlo, (1 << s_bits) - 1));
-            tcg_out_rld(s, RLDICR, TCG_REG_R0, TCG_REG_R0,
-                        0, 63 - TARGET_PAGE_BITS);
         }
-    } else {
-        /* Byte access, just chop off the bits below the page index */
-        tcg_out_rld(s, RLDICR, TCG_REG_R0, addrlo, 0, 63 - TARGET_PAGE_BITS);
     }
 
     if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
@@ -2449,6 +2469,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         tcg_out32(s, MULHD | TAB(args[0], args[1], args[2]));
         break;
 
+    case INDEX_op_mb:
+        tcg_out_mb(s, args[0]);
+        break;
+
     case INDEX_op_mov_i32:   /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32:  /* Always emitted via tcg_out_movi.  */
@@ -2596,6 +2620,7 @@ static const TCGTargetOpDef ppc_op_defs[] = {
     { INDEX_op_qemu_st_i64, { "S", "S", "S", "S" } },
 #endif
 
+    { INDEX_op_mb, { } },
     { -1 },
 };
 
diff --git a/tcg/s390/tcg-target.inc.c b/tcg/s390/tcg-target.inc.c
index 5a7495b063..253d4a0a0b 100644
--- a/tcg/s390/tcg-target.inc.c
+++ b/tcg/s390/tcg-target.inc.c
@@ -343,6 +343,7 @@ static tcg_insn_unit *tb_ret_addr;
 #define FACILITY_EXT_IMM	(1ULL << (63 - 21))
 #define FACILITY_GEN_INST_EXT	(1ULL << (63 - 34))
 #define FACILITY_LOAD_ON_COND   (1ULL << (63 - 45))
+#define FACILITY_FAST_BCR_SER   FACILITY_LOAD_ON_COND
 
 static uint64_t facilities;
 
@@ -1505,21 +1506,18 @@ QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1])
 static TCGReg tcg_out_tlb_read(TCGContext* s, TCGReg addr_reg, TCGMemOp opc,
                                int mem_index, bool is_ld)
 {
-    int a_bits = get_alignment_bits(opc);
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_bits = get_alignment_bits(opc);
+    unsigned s_mask = (1 << s_bits) - 1;
+    unsigned a_mask = (1 << a_bits) - 1;
     int ofs, a_off;
     uint64_t tlb_mask;
 
     /* For aligned accesses, we check the first byte and include the alignment
        bits within the address.  For unaligned access, we check that we don't
        cross pages using the address of the last byte of the access.  */
-    if (a_bits >= 0) {
-        /* A byte access or an alignment check required */
-        a_off = 0;
-        tlb_mask = TARGET_PAGE_MASK | ((1 << a_bits) - 1);
-    } else {
-        a_off = (1 << (opc & MO_SIZE)) - 1;
-        tlb_mask = TARGET_PAGE_MASK;
-    }
+    a_off = (a_bits >= s_bits ? 0 : s_mask - a_mask);
+    tlb_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
 
     if (facilities & FACILITY_GEN_INST_EXT) {
         tcg_out_risbg(s, TCG_REG_R2, addr_reg,
@@ -2172,6 +2170,15 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tgen_deposit(s, args[0], args[2], args[3], args[4]);
         break;
 
+    case INDEX_op_mb:
+        /* The host memory model is quite strong, we simply need to
+           serialize the instruction stream.  */
+        if (args[0] & TCG_MO_ST_LD) {
+            tcg_out_insn(s, RR, BCR,
+                         facilities & FACILITY_FAST_BCR_SER ? 14 : 15, 0);
+        }
+        break;
+
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
@@ -2293,6 +2300,7 @@ static const TCGTargetOpDef s390_op_defs[] = {
     { INDEX_op_movcond_i64, { "r", "r", "rC", "r", "0" } },
     { INDEX_op_deposit_i64, { "r", "0", "r" } },
 
+    { INDEX_op_mb, { } },
     { -1 },
 };
 
diff --git a/tcg/sparc/tcg-target.inc.c b/tcg/sparc/tcg-target.inc.c
index 8e98172ca0..700c43487f 100644
--- a/tcg/sparc/tcg-target.inc.c
+++ b/tcg/sparc/tcg-target.inc.c
@@ -249,6 +249,8 @@ static const int tcg_target_call_oarg_regs[] = {
 #define STWA       (INSN_OP(3) | INSN_OP3(0x14))
 #define STXA       (INSN_OP(3) | INSN_OP3(0x1e))
 
+#define MEMBAR     (INSN_OP(2) | INSN_OP3(0x28) | INSN_RS1(15) | (1 << 13))
+
 #ifndef ASI_PRIMARY_LITTLE
 #define ASI_PRIMARY_LITTLE 0x88
 #endif
@@ -835,6 +837,12 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
     tcg_out_nop(s);
 }
 
+static void tcg_out_mb(TCGContext *s, TCGArg a0)
+{
+    /* Note that the TCG memory order constants mirror the Sparc MEMBAR.  */
+    tcg_out32(s, MEMBAR | (a0 & TCG_MO_ALL));
+}
+
 #ifdef CONFIG_SOFTMMU
 static tcg_insn_unit *qemu_ld_trampoline[16];
 static tcg_insn_unit *qemu_st_trampoline[16];
@@ -996,19 +1004,25 @@ static void tcg_target_qemu_prologue(TCGContext *s)
    is in the returned register, maybe %o0.  The TLB addend is in %o1.  */
 
 static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addr, int mem_index,
-                               TCGMemOp s_bits, int which)
+                               TCGMemOp opc, int which)
 {
     const TCGReg r0 = TCG_REG_O0;
     const TCGReg r1 = TCG_REG_O1;
     const TCGReg r2 = TCG_REG_O2;
+    unsigned s_bits = opc & MO_SIZE;
+    unsigned a_bits = get_alignment_bits(opc);
     int tlb_ofs;
 
     /* Shift the page number down.  */
     tcg_out_arithi(s, r1, addr, TARGET_PAGE_BITS, SHIFT_SRL);
 
-    /* Mask out the page offset, except for the required alignment.  */
+    /* Mask out the page offset, except for the required alignment.
+       We don't support unaligned accesses.  */
+    if (a_bits < s_bits) {
+        a_bits = s_bits;
+    }
     tcg_out_movi(s, TCG_TYPE_TL, TCG_REG_T1,
-                 TARGET_PAGE_MASK | ((1 << s_bits) - 1));
+                 TARGET_PAGE_MASK | ((1 << a_bits) - 1));
 
     /* Mask the tlb index.  */
     tcg_out_arithi(s, r1, r1, CPU_TLB_SIZE - 1, ARITH_AND);
@@ -1087,7 +1101,7 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg data, TCGReg addr,
     tcg_insn_unit *func;
     tcg_insn_unit *label_ptr;
 
-    addrz = tcg_out_tlb_load(s, addr, memi, memop & MO_SIZE,
+    addrz = tcg_out_tlb_load(s, addr, memi, memop,
                              offsetof(CPUTLBEntry, addr_read));
 
     /* The fast path is exactly one insn.  Thus we can perform the
@@ -1169,7 +1183,7 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg data, TCGReg addr,
     tcg_insn_unit *func;
     tcg_insn_unit *label_ptr;
 
-    addrz = tcg_out_tlb_load(s, addr, memi, memop & MO_SIZE,
+    addrz = tcg_out_tlb_load(s, addr, memi, memop,
                              offsetof(CPUTLBEntry, addr_write));
 
     /* The fast path is exactly one insn.  Thus we can perform the entire
@@ -1460,6 +1474,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 	tcg_out_arithc(s, a0, TCG_REG_G0, a1, const_args[1], c);
 	break;
 
+    case INDEX_op_mb:
+        tcg_out_mb(s, a0);
+        break;
+
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
@@ -1561,6 +1579,7 @@ static const TCGTargetOpDef sparc_op_defs[] = {
     { INDEX_op_qemu_st_i32, { "sZ", "A" } },
     { INDEX_op_qemu_st_i64, { "SZ", "A" } },
 
+    { INDEX_op_mb, { } },
     { -1 },
 };
 
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 0243c99094..291d50bb7d 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -148,6 +148,23 @@ void tcg_gen_op6(TCGContext *ctx, TCGOpcode opc, TCGArg a1, TCGArg a2,
     tcg_emit_op(ctx, opc, pi);
 }
 
+void tcg_gen_mb(TCGBar mb_type)
+{
+    bool emit_barriers = true;
+
+#ifndef CONFIG_USER_ONLY
+    /* TODO: When MTTCG is available for system mode, we will check
+     * the following condition and enable emit_barriers
+     * (qemu_tcg_mttcg_enabled() && smp_cpus > 1)
+     */
+    emit_barriers = false;
+#endif
+
+    if (emit_barriers) {
+        tcg_gen_op1(&tcg_ctx, INDEX_op_mb, mb_type);
+    }
+}
+
 /* 32 bit ops */
 
 void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index f217e80747..02cb376681 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -261,6 +261,8 @@ static inline void tcg_gen_br(TCGLabel *l)
     tcg_gen_op1(&tcg_ctx, INDEX_op_br, label_arg(l));
 }
 
+void tcg_gen_mb(TCGBar);
+
 /* Helper calls. */
 
 /* 32 bit ops */
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 6d0410c4b9..45528d2192 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -42,6 +42,8 @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END)
 # define IMPL64  TCG_OPF_64BIT
 #endif
 
+DEF(mb, 0, 0, 1, 0)
+
 DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
 DEF(movi_i32, 1, 0, 1, TCG_OPF_NOT_PRESENT)
 DEF(setcond_i32, 1, 2, 1, 0)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 9bf31bbc84..c9949aa79b 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -287,20 +287,19 @@ typedef enum TCGMemOp {
      * MO_ALIGN accesses will result in a call to the CPU's
      * do_unaligned_access hook if the guest address is not aligned.
      * The default depends on whether the target CPU defines ALIGNED_ONLY.
+     *
      * Some architectures (e.g. ARMv8) need the address which is aligned
      * to a size more than the size of the memory access.
-     * To support such check it's enough the current costless alignment
-     * check implementation in QEMU, but we need to support
-     * an alignment size specifying.
-     * MO_ALIGN supposes a natural alignment
-     * (i.e. the alignment size is the size of a memory access).
-     * Note that an alignment size must be equal or greater
-     * than an access size.
+     * Some architectures (e.g. SPARCv9) need an address which is aligned,
+     * but less strictly than the natural alignment.
+     *
+     * MO_ALIGN supposes the alignment size is the size of a memory access.
+     *
      * There are three options:
-     * - an alignment to the size of an access (MO_ALIGN);
-     * - an alignment to the specified size that is equal or greater than
-     *   an access size (MO_ALIGN_x where 'x' is a size in bytes);
      * - unaligned access permitted (MO_UNALN).
+     * - an alignment to the size of an access (MO_ALIGN);
+     * - an alignment to a specified size, which may be more or less than
+     *   the access size (MO_ALIGN_x where 'x' is a size in bytes);
      */
     MO_ASHIFT = 4,
     MO_AMASK = 7 << MO_ASHIFT,
@@ -353,38 +352,26 @@ typedef enum TCGMemOp {
  * @memop: TCGMemOp value
  *
  * Extract the alignment size from the memop.
- *
- * Returns: 0 in case of byte access (which is always aligned);
- *          positive value - number of alignment bits;
- *          negative value if unaligned access enabled
- *          and this is not a byte access.
  */
-static inline int get_alignment_bits(TCGMemOp memop)
+static inline unsigned get_alignment_bits(TCGMemOp memop)
 {
-    int a = memop & MO_AMASK;
-    int s = memop & MO_SIZE;
-    int r;
+    unsigned a = memop & MO_AMASK;
 
     if (a == MO_UNALN) {
-        /* Negative value if unaligned access enabled,
-         * or zero value in case of byte access.
-         */
-        return -s;
+        /* No alignment required.  */
+        a = 0;
     } else if (a == MO_ALIGN) {
-        /* A natural alignment: return a number of access size bits */
-        r = s;
+        /* A natural alignment requirement.  */
+        a = memop & MO_SIZE;
     } else {
-        /* Specific alignment size. It must be equal or greater
-         * than the access size.
-         */
-        r = a >> MO_ASHIFT;
-        tcg_debug_assert(r >= s);
+        /* A specific alignment requirement.  */
+        a = a >> MO_ASHIFT;
     }
 #if defined(CONFIG_SOFTMMU)
     /* The requested alignment cannot overlap the TLB flags.  */
-    tcg_debug_assert((TLB_FLAGS_MASK & ((1 << r) - 1)) == 0);
+    tcg_debug_assert((TLB_FLAGS_MASK & ((1 << a) - 1)) == 0);
 #endif
-    return r;
+    return a;
 }
 
 typedef tcg_target_ulong TCGArg;
@@ -478,6 +465,23 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
 #define TCG_CALL_DUMMY_TCGV     MAKE_TCGV_I32(-1)
 #define TCG_CALL_DUMMY_ARG      ((TCGArg)(-1))
 
+typedef enum {
+    /* Used to indicate the type of accesses on which ordering
+       is to be ensured.  Modeled after SPARC barriers.  */
+    TCG_MO_LD_LD  = 0x01,
+    TCG_MO_ST_LD  = 0x02,
+    TCG_MO_LD_ST  = 0x04,
+    TCG_MO_ST_ST  = 0x08,
+    TCG_MO_ALL    = 0x0F,  /* OR of the above */
+
+    /* Used to indicate the kind of ordering which is to be ensured by the
+       instruction.  These types are derived from x86/aarch64 instructions.
+       It should be noted that these are different from C11 semantics.  */
+    TCG_BAR_LDAQ  = 0x10,  /* Following ops will not come forward */
+    TCG_BAR_STRL  = 0x20,  /* Previous ops will not be delayed */
+    TCG_BAR_SC    = 0x30,  /* No ops cross barrier; OR of the above */
+} TCGBar;
+
 /* Conditions.  Note that these are laid out for easy manipulation by
    the functions below:
      bit 0 is used for inverting;
diff --git a/tcg/tci/tcg-target.inc.c b/tcg/tci/tcg-target.inc.c
index 3c47ea7a9e..9dbf4d5512 100644
--- a/tcg/tci/tcg-target.inc.c
+++ b/tcg/tci/tcg-target.inc.c
@@ -255,6 +255,7 @@ static const TCGTargetOpDef tcg_target_op_defs[] = {
     { INDEX_op_bswap32_i32, { R, R } },
 #endif
 
+    { INDEX_op_mb, { } },
     { -1 },
 };
 
@@ -800,6 +801,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
         }
         tcg_out_i(s, *args++);
         break;
+    case INDEX_op_mb:
+        break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
diff --git a/tci.c b/tci.c
index b488c0d8e4..4bdc645f2a 100644
--- a/tci.c
+++ b/tci.c
@@ -1236,6 +1236,10 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr)
                 tcg_abort();
             }
             break;
+        case INDEX_op_mb:
+            /* Ensure ordering for all kinds */
+            smp_mb();
+            break;
         default:
             TODO();
             break;
diff --git a/translate-all.c b/translate-all.c
index b6663dc91d..e9bc90c654 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -260,6 +260,8 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
     int64_t ti = profile_getclock();
 #endif
 
+    searched_pc -= GETPC_ADJ;
+
     if (searched_pc < host_pc) {
         return -1;
     }
diff --git a/user-exec.c b/user-exec.c
index 95f9f97c5c..6db075884d 100644
--- a/user-exec.c
+++ b/user-exec.c
@@ -105,8 +105,11 @@ static inline int handle_cpu_signal(uintptr_t pc, unsigned long address,
     if (ret == 0) {
         return 1; /* the MMU fault was handled without causing real CPU fault */
     }
-    /* now we have a real cpu fault */
-    cpu_restore_state(cpu, pc);
+
+    /* Now we have a real cpu fault.  Since this is the exact location of
+     * the exception, we must undo the adjustment done by cpu_restore_state
+     * for handling call return addresses.  */
+    cpu_restore_state(cpu, pc + GETPC_ADJ);
 
     sigprocmask(SIG_SETMASK, old_set, NULL);
     cpu_loop_exit(cpu);