From 53d284554cfb476a43807fe94fa59909ed5d9ff8 Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Tue, 23 Oct 2018 03:57:11 +0100
Subject: [PATCH 01/10] cputlb: Move tlb_lock to CPUTLBCommon

This is the first of several moves to reduce the size of the
CPU_COMMON_TLB macro and improve some locality of refernce.

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c      | 48 ++++++++++++++++++++---------------------
 include/exec/cpu-defs.h | 17 ++++++++++++---
 2 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index af57aca5e4..d4e07056be 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -78,7 +78,7 @@ void tlb_init(CPUState *cpu)
 {
     CPUArchState *env = cpu->env_ptr;
 
-    qemu_spin_init(&env->tlb_lock);
+    qemu_spin_init(&env->tlb_c.lock);
 }
 
 /* flush_all_helper: run fn across all cpus
@@ -134,15 +134,15 @@ static void tlb_flush_nocheck(CPUState *cpu)
     tlb_debug("(count: %zu)\n", tlb_flush_count());
 
     /*
-     * tlb_table/tlb_v_table updates from any thread must hold tlb_lock.
+     * tlb_table/tlb_v_table updates from any thread must hold tlb_c.lock.
      * However, updates from the owner thread (as is the case here; see the
      * above assert_cpu_is_self) do not need atomic_set because all reads
      * that do not hold the lock are performed by the same owner thread.
      */
-    qemu_spin_lock(&env->tlb_lock);
+    qemu_spin_lock(&env->tlb_c.lock);
     memset(env->tlb_table, -1, sizeof(env->tlb_table));
     memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
-    qemu_spin_unlock(&env->tlb_lock);
+    qemu_spin_unlock(&env->tlb_c.lock);
 
     cpu_tb_jmp_cache_clear(cpu);
 
@@ -195,7 +195,7 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 
     tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
 
-    qemu_spin_lock(&env->tlb_lock);
+    qemu_spin_lock(&env->tlb_c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
 
         if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
@@ -205,7 +205,7 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
             memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
         }
     }
-    qemu_spin_unlock(&env->tlb_lock);
+    qemu_spin_unlock(&env->tlb_c.lock);
 
     cpu_tb_jmp_cache_clear(cpu);
 
@@ -262,7 +262,7 @@ static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
            tlb_hit_page(tlb_entry->addr_code, page);
 }
 
-/* Called with tlb_lock held */
+/* Called with tlb_c.lock held */
 static inline void tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
                                           target_ulong page)
 {
@@ -271,7 +271,7 @@ static inline void tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
     }
 }
 
-/* Called with tlb_lock held */
+/* Called with tlb_c.lock held */
 static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,
                                               target_ulong page)
 {
@@ -304,12 +304,12 @@ static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
     }
 
     addr &= TARGET_PAGE_MASK;
-    qemu_spin_lock(&env->tlb_lock);
+    qemu_spin_lock(&env->tlb_c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         tlb_flush_entry_locked(tlb_entry(env, mmu_idx, addr), addr);
         tlb_flush_vtlb_page_locked(env, mmu_idx, addr);
     }
-    qemu_spin_unlock(&env->tlb_lock);
+    qemu_spin_unlock(&env->tlb_c.lock);
 
     tb_flush_jmp_cache(cpu, addr);
 }
@@ -345,14 +345,14 @@ static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
     tlb_debug("flush page addr:"TARGET_FMT_lx" mmu_idx:0x%lx\n",
               addr, mmu_idx_bitmap);
 
-    qemu_spin_lock(&env->tlb_lock);
+    qemu_spin_lock(&env->tlb_c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
             tlb_flush_entry_locked(tlb_entry(env, mmu_idx, addr), addr);
             tlb_flush_vtlb_page_locked(env, mmu_idx, addr);
         }
     }
-    qemu_spin_unlock(&env->tlb_lock);
+    qemu_spin_unlock(&env->tlb_c.lock);
 
     tb_flush_jmp_cache(cpu, addr);
 }
@@ -479,7 +479,7 @@ void tlb_unprotect_code(ram_addr_t ram_addr)
  * te->addr_write with atomic_set. We don't need to worry about this for
  * oversized guests as MTTCG is disabled for them.
  *
- * Called with tlb_lock held.
+ * Called with tlb_c.lock held.
  */
 static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
                                          uintptr_t start, uintptr_t length)
@@ -501,7 +501,7 @@ static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
 }
 
 /*
- * Called with tlb_lock held.
+ * Called with tlb_c.lock held.
  * Called only from the vCPU context, i.e. the TLB's owner thread.
  */
 static inline void copy_tlb_helper_locked(CPUTLBEntry *d, const CPUTLBEntry *s)
@@ -511,7 +511,7 @@ static inline void copy_tlb_helper_locked(CPUTLBEntry *d, const CPUTLBEntry *s)
 
 /* This is a cross vCPU call (i.e. another vCPU resetting the flags of
  * the target vCPU).
- * We must take tlb_lock to avoid racing with another vCPU update. The only
+ * We must take tlb_c.lock to avoid racing with another vCPU update. The only
  * thing actually updated is the target TLB entry ->addr_write flags.
  */
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
@@ -521,7 +521,7 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
     int mmu_idx;
 
     env = cpu->env_ptr;
-    qemu_spin_lock(&env->tlb_lock);
+    qemu_spin_lock(&env->tlb_c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         unsigned int i;
 
@@ -535,10 +535,10 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
                                          length);
         }
     }
-    qemu_spin_unlock(&env->tlb_lock);
+    qemu_spin_unlock(&env->tlb_c.lock);
 }
 
-/* Called with tlb_lock held */
+/* Called with tlb_c.lock held */
 static inline void tlb_set_dirty1_locked(CPUTLBEntry *tlb_entry,
                                          target_ulong vaddr)
 {
@@ -557,7 +557,7 @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
     assert_cpu_is_self(cpu);
 
     vaddr &= TARGET_PAGE_MASK;
-    qemu_spin_lock(&env->tlb_lock);
+    qemu_spin_lock(&env->tlb_c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         tlb_set_dirty1_locked(tlb_entry(env, mmu_idx, vaddr), vaddr);
     }
@@ -568,7 +568,7 @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
             tlb_set_dirty1_locked(&env->tlb_v_table[mmu_idx][k], vaddr);
         }
     }
-    qemu_spin_unlock(&env->tlb_lock);
+    qemu_spin_unlock(&env->tlb_c.lock);
 }
 
 /* Our TLB does not support large pages, so remember the area covered by
@@ -669,7 +669,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
      * a longer critical section, but this is not a concern since the TLB lock
      * is unlikely to be contended.
      */
-    qemu_spin_lock(&env->tlb_lock);
+    qemu_spin_lock(&env->tlb_c.lock);
 
     /* Make sure there's no cached translation for the new page.  */
     tlb_flush_vtlb_page_locked(env, mmu_idx, vaddr_page);
@@ -736,7 +736,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     }
 
     copy_tlb_helper_locked(te, &tn);
-    qemu_spin_unlock(&env->tlb_lock);
+    qemu_spin_unlock(&env->tlb_c.lock);
 }
 
 /* Add a new TLB entry, but without specifying the memory
@@ -917,11 +917,11 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
             /* Found entry in victim tlb, swap tlb and iotlb.  */
             CPUTLBEntry tmptlb, *tlb = &env->tlb_table[mmu_idx][index];
 
-            qemu_spin_lock(&env->tlb_lock);
+            qemu_spin_lock(&env->tlb_c.lock);
             copy_tlb_helper_locked(&tmptlb, tlb);
             copy_tlb_helper_locked(tlb, vtlb);
             copy_tlb_helper_locked(vtlb, &tmptlb);
-            qemu_spin_unlock(&env->tlb_lock);
+            qemu_spin_unlock(&env->tlb_c.lock);
 
             CPUIOTLBEntry tmpio, *io = &env->iotlb[mmu_idx][index];
             CPUIOTLBEntry *vio = &env->iotlb_v[mmu_idx][vidx];
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 4ff62f32bf..9005923b4d 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -141,10 +141,21 @@ typedef struct CPUIOTLBEntry {
     MemTxAttrs attrs;
 } CPUIOTLBEntry;
 
+/*
+ * Data elements that are shared between all MMU modes.
+ */
+typedef struct CPUTLBCommon {
+    /* lock serializes updates to tlb_table and tlb_v_table */
+    QemuSpin lock;
+} CPUTLBCommon;
+
+/*
+ * The meaning of each of the MMU modes is defined in the target code.
+ * Note that NB_MMU_MODES is not yet defined; we can only reference it
+ * within preprocessor defines that will be expanded later.
+ */
 #define CPU_COMMON_TLB \
-    /* The meaning of the MMU modes is defined in the target code. */   \
-    /* tlb_lock serializes updates to tlb_table and tlb_v_table */      \
-    QemuSpin tlb_lock;                                                  \
+    CPUTLBCommon tlb_c;                                                 \
     CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
     CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
     CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                    \

From 8ab102667e4c700082e2e923905e0b443518795c Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Sat, 20 Oct 2018 13:28:04 -0700
Subject: [PATCH 02/10] cputlb: Remove tcg_enabled hack from tlb_flush_nocheck

The bugs this was working around were fixed with commits
022d6378c7fd  target/unicore32: remove tlb_flush from uc32_init_fn
6e11beecfde0  target/alpha: remove tlb_flush from alpha_cpu_initfn

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index d4e07056be..d080769c83 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -122,13 +122,6 @@ static void tlb_flush_nocheck(CPUState *cpu)
 {
     CPUArchState *env = cpu->env_ptr;
 
-    /* The QOM tests will trigger tlb_flushes without setting up TCG
-     * so we bug out here in that case.
-     */
-    if (!tcg_enabled()) {
-        return;
-    }
-
     assert_cpu_is_self(cpu);
     atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
     tlb_debug("(count: %zu)\n", tlb_flush_count());

From 60a2ad7d86e7379e6669806bedaa6cfdf4f2c2f4 Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Sat, 20 Oct 2018 13:54:46 -0700
Subject: [PATCH 03/10] cputlb: Move cpu->pending_tlb_flush to
 env->tlb_c.pending_flush

Protect it with the tlb_lock instead of using atomics.
The move puts it in or near the same cacheline as the lock;
using the lock means we don't need a second atomic operation
in order to perform the update.  Which makes it cheap to also
update pending_flush in tlb_flush_by_mmuidx_async_work.

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c      | 35 +++++++++++++++++++++++------------
 include/exec/cpu-defs.h |  8 +++++++-
 include/qom/cpu.h       |  6 ------
 3 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index d080769c83..abcd08a8a2 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -133,6 +133,7 @@ static void tlb_flush_nocheck(CPUState *cpu)
      * that do not hold the lock are performed by the same owner thread.
      */
     qemu_spin_lock(&env->tlb_c.lock);
+    env->tlb_c.pending_flush = 0;
     memset(env->tlb_table, -1, sizeof(env->tlb_table));
     memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
     qemu_spin_unlock(&env->tlb_c.lock);
@@ -142,8 +143,6 @@ static void tlb_flush_nocheck(CPUState *cpu)
     env->vtlb_index = 0;
     env->tlb_flush_addr = -1;
     env->tlb_flush_mask = 0;
-
-    atomic_mb_set(&cpu->pending_tlb_flush, 0);
 }
 
 static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
@@ -154,8 +153,15 @@ static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
 void tlb_flush(CPUState *cpu)
 {
     if (cpu->created && !qemu_cpu_is_self(cpu)) {
-        if (atomic_mb_read(&cpu->pending_tlb_flush) != ALL_MMUIDX_BITS) {
-            atomic_mb_set(&cpu->pending_tlb_flush, ALL_MMUIDX_BITS);
+        CPUArchState *env = cpu->env_ptr;
+        uint16_t pending;
+
+        qemu_spin_lock(&env->tlb_c.lock);
+        pending = env->tlb_c.pending_flush;
+        env->tlb_c.pending_flush = ALL_MMUIDX_BITS;
+        qemu_spin_unlock(&env->tlb_c.lock);
+
+        if (pending != ALL_MMUIDX_BITS) {
             async_run_on_cpu(cpu, tlb_flush_global_async_work,
                              RUN_ON_CPU_NULL);
         }
@@ -189,6 +195,8 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
     tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
 
     qemu_spin_lock(&env->tlb_c.lock);
+    env->tlb_c.pending_flush &= ~mmu_idx_bitmask;
+
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
 
         if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
@@ -210,19 +218,22 @@ void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
     tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
 
     if (!qemu_cpu_is_self(cpu)) {
-        uint16_t pending_flushes = idxmap;
-        pending_flushes &= ~atomic_mb_read(&cpu->pending_tlb_flush);
+        CPUArchState *env = cpu->env_ptr;
+        uint16_t pending, to_clean;
 
-        if (pending_flushes) {
-            tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", pending_flushes);
+        qemu_spin_lock(&env->tlb_c.lock);
+        pending = env->tlb_c.pending_flush;
+        to_clean = idxmap & ~pending;
+        env->tlb_c.pending_flush = pending | idxmap;
+        qemu_spin_unlock(&env->tlb_c.lock);
 
-            atomic_or(&cpu->pending_tlb_flush, pending_flushes);
+        if (to_clean) {
+            tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", to_clean);
             async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
-                             RUN_ON_CPU_HOST_INT(pending_flushes));
+                             RUN_ON_CPU_HOST_INT(to_clean));
         }
     } else {
-        tlb_flush_by_mmuidx_async_work(cpu,
-                                       RUN_ON_CPU_HOST_INT(idxmap));
+        tlb_flush_by_mmuidx_async_work(cpu, RUN_ON_CPU_HOST_INT(idxmap));
     }
 }
 
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 9005923b4d..659c73d2a1 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -145,8 +145,14 @@ typedef struct CPUIOTLBEntry {
  * Data elements that are shared between all MMU modes.
  */
 typedef struct CPUTLBCommon {
-    /* lock serializes updates to tlb_table and tlb_v_table */
+    /* Serialize updates to tlb_table and tlb_v_table, and others as noted. */
     QemuSpin lock;
+    /*
+     * Within pending_flush, for each bit N, there exists an outstanding
+     * cross-cpu flush for mmu_idx N.  Further cross-cpu flushes to that
+     * mmu_idx may be discarded.  Protected by tlb_c.lock.
+     */
+    uint16_t pending_flush;
 } CPUTLBCommon;
 
 /*
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index def0c64308..1396f53e5b 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -429,12 +429,6 @@ struct CPUState {
 
     struct hax_vcpu_state *hax_vcpu;
 
-    /* The pending_tlb_flush flag is set and cleared atomically to
-     * avoid potential races. The aim of the flag is to avoid
-     * unnecessary flushes.
-     */
-    uint16_t pending_tlb_flush;
-
     int hvf_fd;
 
     /* track IOMMUs whose translations we've cached in the TCG TLB */

From 1308e0267174daaf557dac8366ea2ba615d3337f Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Wed, 17 Oct 2018 11:48:40 -0700
Subject: [PATCH 04/10] cputlb: Split large page tracking per mmu_idx

The set of large pages in the kernel is probably not the same
as the set of large pages in the application.  Forcing one
range to cover both will flush more often than necessary.

This allows tlb_flush_page_async_work to flush just the one
mmu_idx implicated, which in turn allows us to remove
tlb_check_page_and_flush_by_mmuidx_async_work.

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c      | 138 ++++++++++++++++++----------------------
 include/exec/cpu-defs.h |  14 +++-
 2 files changed, 73 insertions(+), 79 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index abcd08a8a2..8060ec99d7 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -113,6 +113,14 @@ size_t tlb_flush_count(void)
     return count;
 }
 
+static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
+{
+    memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+    memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
+    env->tlb_d[mmu_idx].large_page_addr = -1;
+    env->tlb_d[mmu_idx].large_page_mask = -1;
+}
+
 /* This is OK because CPU architectures generally permit an
  * implementation to drop entries from the TLB at any time, so
  * flushing more entries than required is only an efficiency issue,
@@ -121,6 +129,7 @@ size_t tlb_flush_count(void)
 static void tlb_flush_nocheck(CPUState *cpu)
 {
     CPUArchState *env = cpu->env_ptr;
+    int mmu_idx;
 
     assert_cpu_is_self(cpu);
     atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
@@ -134,15 +143,14 @@ static void tlb_flush_nocheck(CPUState *cpu)
      */
     qemu_spin_lock(&env->tlb_c.lock);
     env->tlb_c.pending_flush = 0;
-    memset(env->tlb_table, -1, sizeof(env->tlb_table));
-    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        tlb_flush_one_mmuidx_locked(env, mmu_idx);
+    }
     qemu_spin_unlock(&env->tlb_c.lock);
 
     cpu_tb_jmp_cache_clear(cpu);
 
     env->vtlb_index = 0;
-    env->tlb_flush_addr = -1;
-    env->tlb_flush_mask = 0;
 }
 
 static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
@@ -192,25 +200,19 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 
     assert_cpu_is_self(cpu);
 
-    tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
+    tlb_debug("mmu_idx:0x%04lx\n", mmu_idx_bitmask);
 
     qemu_spin_lock(&env->tlb_c.lock);
     env->tlb_c.pending_flush &= ~mmu_idx_bitmask;
 
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-
         if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
-            tlb_debug("%d\n", mmu_idx);
-
-            memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
-            memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
+            tlb_flush_one_mmuidx_locked(env, mmu_idx);
         }
     }
     qemu_spin_unlock(&env->tlb_c.lock);
 
     cpu_tb_jmp_cache_clear(cpu);
-
-    tlb_debug("done\n");
 }
 
 void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
@@ -287,6 +289,24 @@ static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,
     }
 }
 
+static void tlb_flush_page_locked(CPUArchState *env, int midx,
+                                  target_ulong page)
+{
+    target_ulong lp_addr = env->tlb_d[midx].large_page_addr;
+    target_ulong lp_mask = env->tlb_d[midx].large_page_mask;
+
+    /* Check if we need to flush due to large pages.  */
+    if ((page & lp_mask) == lp_addr) {
+        tlb_debug("forcing full flush midx %d ("
+                  TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
+                  midx, lp_addr, lp_mask);
+        tlb_flush_one_mmuidx_locked(env, midx);
+    } else {
+        tlb_flush_entry_locked(tlb_entry(env, midx, page), page);
+        tlb_flush_vtlb_page_locked(env, midx, page);
+    }
+}
+
 static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
 {
     CPUArchState *env = cpu->env_ptr;
@@ -295,23 +315,12 @@ static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
 
     assert_cpu_is_self(cpu);
 
-    tlb_debug("page :" TARGET_FMT_lx "\n", addr);
-
-    /* Check if we need to flush due to large pages.  */
-    if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
-        tlb_debug("forcing full flush ("
-                  TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
-                  env->tlb_flush_addr, env->tlb_flush_mask);
-
-        tlb_flush(cpu);
-        return;
-    }
+    tlb_debug("page addr:" TARGET_FMT_lx "\n", addr);
 
     addr &= TARGET_PAGE_MASK;
     qemu_spin_lock(&env->tlb_c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_flush_entry_locked(tlb_entry(env, mmu_idx, addr), addr);
-        tlb_flush_vtlb_page_locked(env, mmu_idx, addr);
+        tlb_flush_page_locked(env, mmu_idx, addr);
     }
     qemu_spin_unlock(&env->tlb_c.lock);
 
@@ -346,14 +355,13 @@ static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
 
     assert_cpu_is_self(cpu);
 
-    tlb_debug("flush page addr:"TARGET_FMT_lx" mmu_idx:0x%lx\n",
+    tlb_debug("page addr:" TARGET_FMT_lx " mmu_map:0x%lx\n",
               addr, mmu_idx_bitmap);
 
     qemu_spin_lock(&env->tlb_c.lock);
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
-            tlb_flush_entry_locked(tlb_entry(env, mmu_idx, addr), addr);
-            tlb_flush_vtlb_page_locked(env, mmu_idx, addr);
+            tlb_flush_page_locked(env, mmu_idx, addr);
         }
     }
     qemu_spin_unlock(&env->tlb_c.lock);
@@ -361,29 +369,6 @@ static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
     tb_flush_jmp_cache(cpu, addr);
 }
 
-static void tlb_check_page_and_flush_by_mmuidx_async_work(CPUState *cpu,
-                                                          run_on_cpu_data data)
-{
-    CPUArchState *env = cpu->env_ptr;
-    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
-    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
-    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
-
-    tlb_debug("addr:"TARGET_FMT_lx" mmu_idx: %04lx\n", addr, mmu_idx_bitmap);
-
-    /* Check if we need to flush due to large pages.  */
-    if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
-        tlb_debug("forced full flush ("
-                  TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
-                  env->tlb_flush_addr, env->tlb_flush_mask);
-
-        tlb_flush_by_mmuidx_async_work(cpu,
-                                       RUN_ON_CPU_HOST_INT(mmu_idx_bitmap));
-    } else {
-        tlb_flush_page_by_mmuidx_async_work(cpu, data);
-    }
-}
-
 void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
 {
     target_ulong addr_and_mmu_idx;
@@ -395,10 +380,10 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
     addr_and_mmu_idx |= idxmap;
 
     if (!qemu_cpu_is_self(cpu)) {
-        async_run_on_cpu(cpu, tlb_check_page_and_flush_by_mmuidx_async_work,
+        async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_work,
                          RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
     } else {
-        tlb_check_page_and_flush_by_mmuidx_async_work(
+        tlb_flush_page_by_mmuidx_async_work(
             cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
     }
 }
@@ -406,7 +391,7 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
 void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
                                        uint16_t idxmap)
 {
-    const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work;
+    const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work;
     target_ulong addr_and_mmu_idx;
 
     tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
@@ -420,10 +405,10 @@ void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
 }
 
 void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
-                                                            target_ulong addr,
-                                                            uint16_t idxmap)
+                                              target_ulong addr,
+                                              uint16_t idxmap)
 {
-    const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work;
+    const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work;
     target_ulong addr_and_mmu_idx;
 
     tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
@@ -577,25 +562,26 @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
 
 /* Our TLB does not support large pages, so remember the area covered by
    large pages and trigger a full TLB flush if these are invalidated.  */
-static void tlb_add_large_page(CPUArchState *env, target_ulong vaddr,
-                               target_ulong size)
+static void tlb_add_large_page(CPUArchState *env, int mmu_idx,
+                               target_ulong vaddr, target_ulong size)
 {
-    target_ulong mask = ~(size - 1);
+    target_ulong lp_addr = env->tlb_d[mmu_idx].large_page_addr;
+    target_ulong lp_mask = ~(size - 1);
 
-    if (env->tlb_flush_addr == (target_ulong)-1) {
-        env->tlb_flush_addr = vaddr & mask;
-        env->tlb_flush_mask = mask;
-        return;
+    if (lp_addr == (target_ulong)-1) {
+        /* No previous large page.  */
+        lp_addr = vaddr;
+    } else {
+        /* Extend the existing region to include the new page.
+           This is a compromise between unnecessary flushes and
+           the cost of maintaining a full variable size TLB.  */
+        lp_mask &= env->tlb_d[mmu_idx].large_page_mask;
+        while (((lp_addr ^ vaddr) & lp_mask) != 0) {
+            lp_mask <<= 1;
+        }
     }
-    /* Extend the existing region to include the new page.
-       This is a compromise between unnecessary flushes and the cost
-       of maintaining a full variable size TLB.  */
-    mask &= env->tlb_flush_mask;
-    while (((env->tlb_flush_addr ^ vaddr) & mask) != 0) {
-        mask <<= 1;
-    }
-    env->tlb_flush_addr &= mask;
-    env->tlb_flush_mask = mask;
+    env->tlb_d[mmu_idx].large_page_addr = lp_addr & lp_mask;
+    env->tlb_d[mmu_idx].large_page_mask = lp_mask;
 }
 
 /* Add a new TLB entry. At most one entry for a given virtual address
@@ -622,12 +608,10 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
 
     assert_cpu_is_self(cpu);
 
-    if (size < TARGET_PAGE_SIZE) {
+    if (size <= TARGET_PAGE_SIZE) {
         sz = TARGET_PAGE_SIZE;
     } else {
-        if (size > TARGET_PAGE_SIZE) {
-            tlb_add_large_page(env, vaddr, size);
-        }
+        tlb_add_large_page(env, mmu_idx, vaddr, size);
         sz = size;
     }
     vaddr_page = vaddr & TARGET_PAGE_MASK;
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 659c73d2a1..df8ae18d9d 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -141,6 +141,17 @@ typedef struct CPUIOTLBEntry {
     MemTxAttrs attrs;
 } CPUIOTLBEntry;
 
+typedef struct CPUTLBDesc {
+    /*
+     * Describe a region covering all of the large pages allocated
+     * into the tlb.  When any page within this region is flushed,
+     * we must flush the entire tlb.  The region is matched if
+     * (addr & large_page_mask) == large_page_addr.
+     */
+    target_ulong large_page_addr;
+    target_ulong large_page_mask;
+} CPUTLBDesc;
+
 /*
  * Data elements that are shared between all MMU modes.
  */
@@ -162,13 +173,12 @@ typedef struct CPUTLBCommon {
  */
 #define CPU_COMMON_TLB \
     CPUTLBCommon tlb_c;                                                 \
+    CPUTLBDesc tlb_d[NB_MMU_MODES];                                     \
     CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
     CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
     CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                    \
     CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                 \
     size_t tlb_flush_count;                                             \
-    target_ulong tlb_flush_addr;                                        \
-    target_ulong tlb_flush_mask;                                        \
     target_ulong vtlb_index;                                            \
 
 #else

From d5363e5849c245a4880edbd0cdbc29c73694fac0 Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Fri, 19 Oct 2018 12:46:18 -0700
Subject: [PATCH 05/10] cputlb: Move env->vtlb_index to env->tlb_d.vindex

The rest of the tlb victim cache is per-tlb,
the next use index should be as well.

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c      | 5 ++---
 include/exec/cpu-defs.h | 5 +++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 8060ec99d7..2cd3886fd6 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -119,6 +119,7 @@ static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
     memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
     env->tlb_d[mmu_idx].large_page_addr = -1;
     env->tlb_d[mmu_idx].large_page_mask = -1;
+    env->tlb_d[mmu_idx].vindex = 0;
 }
 
 /* This is OK because CPU architectures generally permit an
@@ -149,8 +150,6 @@ static void tlb_flush_nocheck(CPUState *cpu)
     qemu_spin_unlock(&env->tlb_c.lock);
 
     cpu_tb_jmp_cache_clear(cpu);
-
-    env->vtlb_index = 0;
 }
 
 static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
@@ -667,7 +666,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
      * different page; otherwise just overwrite the stale data.
      */
     if (!tlb_hit_page_anyprot(te, vaddr_page)) {
-        unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
+        unsigned vidx = env->tlb_d[mmu_idx].vindex++ % CPU_VTLB_SIZE;
         CPUTLBEntry *tv = &env->tlb_v_table[mmu_idx][vidx];
 
         /* Evict the old entry into the victim tlb.  */
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index df8ae18d9d..181c0dbfa4 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -150,6 +150,8 @@ typedef struct CPUTLBDesc {
      */
     target_ulong large_page_addr;
     target_ulong large_page_mask;
+    /* The next index to use in the tlb victim table.  */
+    size_t vindex;
 } CPUTLBDesc;
 
 /*
@@ -178,8 +180,7 @@ typedef struct CPUTLBCommon {
     CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
     CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                    \
     CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                 \
-    size_t tlb_flush_count;                                             \
-    target_ulong vtlb_index;                                            \
+    size_t tlb_flush_count;
 
 #else
 

From 64f2674bbc695345786b4adbb0feae4f08591f9d Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Tue, 23 Oct 2018 06:01:01 +0100
Subject: [PATCH 06/10] cputlb: Merge tlb_flush_nocheck into
 tlb_flush_by_mmuidx_async_work

The difference between the two sets of APIs is now miniscule.

This allows tlb_flush, tlb_flush_all_cpus, and tlb_flush_all_cpus_synced
to be merged with their corresponding by_mmuidx functions as well.  For
accounting, consider mmu_idx_bitmask = ALL_MMUIDX_BITS to be a full flush.

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 93 +++++++++++-----------------------------------
 1 file changed, 21 insertions(+), 72 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 2cd3886fd6..a3a417e81a 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -122,75 +122,6 @@ static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
     env->tlb_d[mmu_idx].vindex = 0;
 }
 
-/* This is OK because CPU architectures generally permit an
- * implementation to drop entries from the TLB at any time, so
- * flushing more entries than required is only an efficiency issue,
- * not a correctness issue.
- */
-static void tlb_flush_nocheck(CPUState *cpu)
-{
-    CPUArchState *env = cpu->env_ptr;
-    int mmu_idx;
-
-    assert_cpu_is_self(cpu);
-    atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
-    tlb_debug("(count: %zu)\n", tlb_flush_count());
-
-    /*
-     * tlb_table/tlb_v_table updates from any thread must hold tlb_c.lock.
-     * However, updates from the owner thread (as is the case here; see the
-     * above assert_cpu_is_self) do not need atomic_set because all reads
-     * that do not hold the lock are performed by the same owner thread.
-     */
-    qemu_spin_lock(&env->tlb_c.lock);
-    env->tlb_c.pending_flush = 0;
-    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_flush_one_mmuidx_locked(env, mmu_idx);
-    }
-    qemu_spin_unlock(&env->tlb_c.lock);
-
-    cpu_tb_jmp_cache_clear(cpu);
-}
-
-static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
-{
-    tlb_flush_nocheck(cpu);
-}
-
-void tlb_flush(CPUState *cpu)
-{
-    if (cpu->created && !qemu_cpu_is_self(cpu)) {
-        CPUArchState *env = cpu->env_ptr;
-        uint16_t pending;
-
-        qemu_spin_lock(&env->tlb_c.lock);
-        pending = env->tlb_c.pending_flush;
-        env->tlb_c.pending_flush = ALL_MMUIDX_BITS;
-        qemu_spin_unlock(&env->tlb_c.lock);
-
-        if (pending != ALL_MMUIDX_BITS) {
-            async_run_on_cpu(cpu, tlb_flush_global_async_work,
-                             RUN_ON_CPU_NULL);
-        }
-    } else {
-        tlb_flush_nocheck(cpu);
-    }
-}
-
-void tlb_flush_all_cpus(CPUState *src_cpu)
-{
-    const run_on_cpu_func fn = tlb_flush_global_async_work;
-    flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL);
-    fn(src_cpu, RUN_ON_CPU_NULL);
-}
-
-void tlb_flush_all_cpus_synced(CPUState *src_cpu)
-{
-    const run_on_cpu_func fn = tlb_flush_global_async_work;
-    flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL);
-    async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_NULL);
-}
-
 static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 {
     CPUArchState *env = cpu->env_ptr;
@@ -212,13 +143,17 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
     qemu_spin_unlock(&env->tlb_c.lock);
 
     cpu_tb_jmp_cache_clear(cpu);
+
+    if (mmu_idx_bitmask == ALL_MMUIDX_BITS) {
+        atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
+    }
 }
 
 void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
 {
     tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
 
-    if (!qemu_cpu_is_self(cpu)) {
+    if (cpu->created && !qemu_cpu_is_self(cpu)) {
         CPUArchState *env = cpu->env_ptr;
         uint16_t pending, to_clean;
 
@@ -238,6 +173,11 @@ void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
     }
 }
 
+void tlb_flush(CPUState *cpu)
+{
+    tlb_flush_by_mmuidx(cpu, ALL_MMUIDX_BITS);
+}
+
 void tlb_flush_by_mmuidx_all_cpus(CPUState *src_cpu, uint16_t idxmap)
 {
     const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work;
@@ -248,8 +188,12 @@ void tlb_flush_by_mmuidx_all_cpus(CPUState *src_cpu, uint16_t idxmap)
     fn(src_cpu, RUN_ON_CPU_HOST_INT(idxmap));
 }
 
-void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
-                                                       uint16_t idxmap)
+void tlb_flush_all_cpus(CPUState *src_cpu)
+{
+    tlb_flush_by_mmuidx_all_cpus(src_cpu, ALL_MMUIDX_BITS);
+}
+
+void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu, uint16_t idxmap)
 {
     const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work;
 
@@ -259,6 +203,11 @@ void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
     async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap));
 }
 
+void tlb_flush_all_cpus_synced(CPUState *src_cpu)
+{
+    tlb_flush_by_mmuidx_all_cpus_synced(src_cpu, ALL_MMUIDX_BITS);
+}
+
 static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
                                         target_ulong page)
 {

From f8144c6c1e7574a8f2da05709a81bfc8b9215d77 Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Fri, 19 Oct 2018 14:25:09 -0700
Subject: [PATCH 07/10] cputlb: Merge tlb_flush_page into
 tlb_flush_page_by_mmuidx

The difference between the two sets of APIs is now miniscule.

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c | 58 ++++++++++------------------------------------
 1 file changed, 12 insertions(+), 46 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index a3a417e81a..1f7764de94 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -255,38 +255,6 @@ static void tlb_flush_page_locked(CPUArchState *env, int midx,
     }
 }
 
-static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
-{
-    CPUArchState *env = cpu->env_ptr;
-    target_ulong addr = (target_ulong) data.target_ptr;
-    int mmu_idx;
-
-    assert_cpu_is_self(cpu);
-
-    tlb_debug("page addr:" TARGET_FMT_lx "\n", addr);
-
-    addr &= TARGET_PAGE_MASK;
-    qemu_spin_lock(&env->tlb_c.lock);
-    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_flush_page_locked(env, mmu_idx, addr);
-    }
-    qemu_spin_unlock(&env->tlb_c.lock);
-
-    tb_flush_jmp_cache(cpu, addr);
-}
-
-void tlb_flush_page(CPUState *cpu, target_ulong addr)
-{
-    tlb_debug("page :" TARGET_FMT_lx "\n", addr);
-
-    if (!qemu_cpu_is_self(cpu)) {
-        async_run_on_cpu(cpu, tlb_flush_page_async_work,
-                         RUN_ON_CPU_TARGET_PTR(addr));
-    } else {
-        tlb_flush_page_async_work(cpu, RUN_ON_CPU_TARGET_PTR(addr));
-    }
-}
-
 /* As we are going to hijack the bottom bits of the page address for a
  * mmuidx bit mask we need to fail to build if we can't do that
  */
@@ -336,6 +304,11 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
     }
 }
 
+void tlb_flush_page(CPUState *cpu, target_ulong addr)
+{
+    tlb_flush_page_by_mmuidx(cpu, addr, ALL_MMUIDX_BITS);
+}
+
 void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
                                        uint16_t idxmap)
 {
@@ -352,6 +325,11 @@ void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
     fn(src_cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
 }
 
+void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr)
+{
+    tlb_flush_page_by_mmuidx_all_cpus(src, addr, ALL_MMUIDX_BITS);
+}
+
 void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
                                               target_ulong addr,
                                               uint16_t idxmap)
@@ -369,21 +347,9 @@ void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
     async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
 }
 
-void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr)
+void tlb_flush_page_all_cpus_synced(CPUState *src, target_ulong addr)
 {
-    const run_on_cpu_func fn = tlb_flush_page_async_work;
-
-    flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
-    fn(src, RUN_ON_CPU_TARGET_PTR(addr));
-}
-
-void tlb_flush_page_all_cpus_synced(CPUState *src,
-                                                  target_ulong addr)
-{
-    const run_on_cpu_func fn = tlb_flush_page_async_work;
-
-    flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
-    async_safe_run_on_cpu(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
+    tlb_flush_page_by_mmuidx_all_cpus_synced(src, addr, ALL_MMUIDX_BITS);
 }
 
 /* update the TLBs so that writes to code in the virtual page 'addr'

From e09de0a20d42e5e76e91ffc7f9f4787e225e1ec2 Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Fri, 19 Oct 2018 14:36:43 -0700
Subject: [PATCH 08/10] cputlb: Count "partial" and "elided" tlb flushes

Our only statistic so far was "full" tlb flushes, where all mmu_idx
are flushed at the same time.

Now count "partial" tlb flushes where sets of mmu_idx are flushed,
but the set is not maximal.  Account one per mmu_idx flushed, as
that is the unit of work performed.

We don't actually count elided flushes yet, but go ahead and change
the interface presented to the monitor all at once.

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c        | 18 +++++++++++++-----
 accel/tcg/translate-all.c |  8 ++++++--
 include/exec/cpu-defs.h   | 12 ++++++++++--
 include/exec/cputlb.h     |  2 +-
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 1f7764de94..e60628c350 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -100,17 +100,21 @@ static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
     }
 }
 
-size_t tlb_flush_count(void)
+void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)
 {
     CPUState *cpu;
-    size_t count = 0;
+    size_t full = 0, part = 0, elide = 0;
 
     CPU_FOREACH(cpu) {
         CPUArchState *env = cpu->env_ptr;
 
-        count += atomic_read(&env->tlb_flush_count);
+        full += atomic_read(&env->tlb_c.full_flush_count);
+        part += atomic_read(&env->tlb_c.part_flush_count);
+        elide += atomic_read(&env->tlb_c.elide_flush_count);
     }
-    return count;
+    *pfull = full;
+    *ppart = part;
+    *pelide = elide;
 }
 
 static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
@@ -145,7 +149,11 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
     cpu_tb_jmp_cache_clear(cpu);
 
     if (mmu_idx_bitmask == ALL_MMUIDX_BITS) {
-        atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
+        atomic_set(&env->tlb_c.full_flush_count,
+                   env->tlb_c.full_flush_count + 1);
+    } else {
+        atomic_set(&env->tlb_c.part_flush_count,
+                   env->tlb_c.part_flush_count + ctpop16(mmu_idx_bitmask));
     }
 }
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 356dcd0948..639f0b2728 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -2290,7 +2290,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
 {
     struct tb_tree_stats tst = {};
     struct qht_stats hst;
-    size_t nb_tbs;
+    size_t nb_tbs, flush_full, flush_part, flush_elide;
 
     tcg_tb_foreach(tb_tree_stats_iter, &tst);
     nb_tbs = tst.nb_tbs;
@@ -2326,7 +2326,11 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     cpu_fprintf(f, "TB flush count      %u\n",
                 atomic_read(&tb_ctx.tb_flush_count));
     cpu_fprintf(f, "TB invalidate count %zu\n", tcg_tb_phys_invalidate_count());
-    cpu_fprintf(f, "TLB flush count     %zu\n", tlb_flush_count());
+
+    tlb_flush_counts(&flush_full, &flush_part, &flush_elide);
+    cpu_fprintf(f, "TLB full flushes    %zu\n", flush_full);
+    cpu_fprintf(f, "TLB partial flushes %zu\n", flush_part);
+    cpu_fprintf(f, "TLB elided flushes  %zu\n", flush_elide);
     tcg_dump_info(f, cpu_fprintf);
 }
 
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 181c0dbfa4..c7b501d627 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -166,6 +166,15 @@ typedef struct CPUTLBCommon {
      * mmu_idx may be discarded.  Protected by tlb_c.lock.
      */
     uint16_t pending_flush;
+
+    /*
+     * Statistics.  These are not lock protected, but are read and
+     * written atomically.  This allows the monitor to print a snapshot
+     * of the stats without interfering with the cpu.
+     */
+    size_t full_flush_count;
+    size_t part_flush_count;
+    size_t elide_flush_count;
 } CPUTLBCommon;
 
 /*
@@ -179,8 +188,7 @@ typedef struct CPUTLBCommon {
     CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                  \
     CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];               \
     CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                    \
-    CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];                 \
-    size_t tlb_flush_count;
+    CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];
 
 #else
 
diff --git a/include/exec/cputlb.h b/include/exec/cputlb.h
index c91db211bc..5373188be3 100644
--- a/include/exec/cputlb.h
+++ b/include/exec/cputlb.h
@@ -23,6 +23,6 @@
 /* cputlb.c */
 void tlb_protect_code(ram_addr_t ram_addr);
 void tlb_unprotect_code(ram_addr_t ram_addr);
-size_t tlb_flush_count(void);
+void tlb_flush_counts(size_t *full, size_t *part, size_t *elide);
 #endif
 #endif

From 3d1523ced6060cdfe9e768a814d064067ccabfe5 Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Sat, 20 Oct 2018 12:04:57 -0700
Subject: [PATCH 09/10] cputlb: Filter flushes on already clean tlbs

Especially for guests with large numbers of tlbs, like ARM or PPC,
we may well not use all of them in between flush operations.
Remember which tlbs have been used since the last flush, and
avoid any useless flushing.

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c      | 35 +++++++++++++++++++++++++----------
 include/exec/cpu-defs.h |  7 ++++++-
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index e60628c350..f6c37bc4db 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -79,6 +79,9 @@ void tlb_init(CPUState *cpu)
     CPUArchState *env = cpu->env_ptr;
 
     qemu_spin_init(&env->tlb_c.lock);
+
+    /* Ensure that cpu_reset performs a full flush.  */
+    env->tlb_c.dirty = ALL_MMUIDX_BITS;
 }
 
 /* flush_all_helper: run fn across all cpus
@@ -129,31 +132,40 @@ static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
 static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 {
     CPUArchState *env = cpu->env_ptr;
-    unsigned long mmu_idx_bitmask = data.host_int;
-    int mmu_idx;
+    uint16_t asked = data.host_int;
+    uint16_t all_dirty, work, to_clean;
 
     assert_cpu_is_self(cpu);
 
-    tlb_debug("mmu_idx:0x%04lx\n", mmu_idx_bitmask);
+    tlb_debug("mmu_idx:0x%04" PRIx16 "\n", asked);
 
     qemu_spin_lock(&env->tlb_c.lock);
-    env->tlb_c.pending_flush &= ~mmu_idx_bitmask;
 
-    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
-            tlb_flush_one_mmuidx_locked(env, mmu_idx);
-        }
+    all_dirty = env->tlb_c.dirty;
+    to_clean = asked & all_dirty;
+    all_dirty &= ~to_clean;
+    env->tlb_c.dirty = all_dirty;
+
+    for (work = to_clean; work != 0; work &= work - 1) {
+        int mmu_idx = ctz32(work);
+        tlb_flush_one_mmuidx_locked(env, mmu_idx);
     }
+
     qemu_spin_unlock(&env->tlb_c.lock);
 
     cpu_tb_jmp_cache_clear(cpu);
 
-    if (mmu_idx_bitmask == ALL_MMUIDX_BITS) {
+    if (to_clean == ALL_MMUIDX_BITS) {
         atomic_set(&env->tlb_c.full_flush_count,
                    env->tlb_c.full_flush_count + 1);
     } else {
         atomic_set(&env->tlb_c.part_flush_count,
-                   env->tlb_c.part_flush_count + ctpop16(mmu_idx_bitmask));
+                   env->tlb_c.part_flush_count + ctpop16(to_clean));
+        if (to_clean != asked) {
+            atomic_set(&env->tlb_c.elide_flush_count,
+                       env->tlb_c.elide_flush_count +
+                       ctpop16(asked & ~to_clean));
+        }
     }
 }
 
@@ -581,6 +593,9 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
      */
     qemu_spin_lock(&env->tlb_c.lock);
 
+    /* Note that the tlb is no longer clean.  */
+    env->tlb_c.dirty |= 1 << mmu_idx;
+
     /* Make sure there's no cached translation for the new page.  */
     tlb_flush_vtlb_page_locked(env, mmu_idx, vaddr_page);
 
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index c7b501d627..ca0fea8b27 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -166,7 +166,12 @@ typedef struct CPUTLBCommon {
      * mmu_idx may be discarded.  Protected by tlb_c.lock.
      */
     uint16_t pending_flush;
-
+    /*
+     * Within dirty, for each bit N, modifications have been made to
+     * mmu_idx N since the last time that mmu_idx was flushed.
+     * Protected by tlb_c.lock.
+     */
+    uint16_t dirty;
     /*
      * Statistics.  These are not lock protected, but are read and
      * written atomically.  This allows the monitor to print a snapshot

From ab6511053015b9cc636915e2c2b97305cbf044f0 Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Tue, 23 Oct 2018 06:58:03 +0100
Subject: [PATCH 10/10] cputlb: Remove tlb_c.pending_flushes

This is essentially redundant with tlb_c.dirty.

Tested-by: Emilio G. Cota <cota@braap.org>
Reviewed-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c      | 16 ++--------------
 include/exec/cpu-defs.h |  6 ------
 2 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index f6c37bc4db..af6bd8ccf9 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -174,20 +174,8 @@ void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
     tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
 
     if (cpu->created && !qemu_cpu_is_self(cpu)) {
-        CPUArchState *env = cpu->env_ptr;
-        uint16_t pending, to_clean;
-
-        qemu_spin_lock(&env->tlb_c.lock);
-        pending = env->tlb_c.pending_flush;
-        to_clean = idxmap & ~pending;
-        env->tlb_c.pending_flush = pending | idxmap;
-        qemu_spin_unlock(&env->tlb_c.lock);
-
-        if (to_clean) {
-            tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", to_clean);
-            async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
-                             RUN_ON_CPU_HOST_INT(to_clean));
-        }
+        async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
+                         RUN_ON_CPU_HOST_INT(idxmap));
     } else {
         tlb_flush_by_mmuidx_async_work(cpu, RUN_ON_CPU_HOST_INT(idxmap));
     }
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index ca0fea8b27..6a60f94a41 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -160,12 +160,6 @@ typedef struct CPUTLBDesc {
 typedef struct CPUTLBCommon {
     /* Serialize updates to tlb_table and tlb_v_table, and others as noted. */
     QemuSpin lock;
-    /*
-     * Within pending_flush, for each bit N, there exists an outstanding
-     * cross-cpu flush for mmu_idx N.  Further cross-cpu flushes to that
-     * mmu_idx may be discarded.  Protected by tlb_c.lock.
-     */
-    uint16_t pending_flush;
     /*
      * Within dirty, for each bit N, modifications have been made to
      * mmu_idx N since the last time that mmu_idx was flushed.