tcg: Use tlb_fill probe from tlb_vaddr_to_host

[mirror_qemu.git] / accel / tcg / cputlb.c
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c

index 0a721bb9c40e28193afdd4fac59e42b9d77e5137..685e0f2ee4e0965b4120b8d47a5d63cdd9e2a890 100644 (file)
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -6,7 +6,7 @@
   * This library is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
   *
   * This library is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -32,6 +32,7 @@
  #include "exec/log.h"
  #include "exec/helper-proto.h"
  #include "qemu/atomic.h"
+#include "qemu/atomic128.h"
  
  /* DEBUG defines, enable DEBUG_TLB_LOG to log to the CPU_LOG_MMU target */
  /* #define DEBUG_TLB */
@@ -58,9 +59,9 @@
      } \
  } while (0)
  
-#define assert_cpu_is_self(this_cpu) do {                         \
+#define assert_cpu_is_self(cpu) do {                              \
          if (DEBUG_TLB_GATE) {                                     \
-            g_assert(!cpu->created || qemu_cpu_is_self(cpu));     \
+            g_assert(!(cpu)->created || qemu_cpu_is_self(cpu));   \
          }                                                         \
      } while (0)
  
@@ -73,155 +74,280 @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
  QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
  #define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
  
-/* flush_all_helper: run fn across all cpus
- *
- * If the wait flag is set then the src cpu's helper will be queued as
- * "safe" work and the loop exited creating a synchronisation point
- * where all queued work will be finished before execution starts
- * again.
- */
-static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
-                             run_on_cpu_data d)
+static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx)
  {
-    CPUState *cpu;
+    return env->tlb_mask[mmu_idx] + (1 << CPU_TLB_ENTRY_BITS);
+}
  
-    CPU_FOREACH(cpu) {
-        if (cpu != src) {
-            async_run_on_cpu(cpu, fn, d);
-        }
-    }
+static void tlb_window_reset(CPUTLBWindow *window, int64_t ns,
+                             size_t max_entries)
+{
+    window->begin_ns = ns;
+    window->max_entries = max_entries;
  }
  
-size_t tlb_flush_count(void)
+static void tlb_dyn_init(CPUArchState *env)
  {
-    CPUState *cpu;
-    size_t count = 0;
+    int i;
  
-    CPU_FOREACH(cpu) {
-        CPUArchState *env = cpu->env_ptr;
+    for (i = 0; i < NB_MMU_MODES; i++) {
+        CPUTLBDesc *desc = &env->tlb_d[i];
+        size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;
  
-        count += atomic_read(&env->tlb_flush_count);
+        tlb_window_reset(&desc->window, get_clock_realtime(), 0);
+        desc->n_used_entries = 0;
+        env->tlb_mask[i] = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
+        env->tlb_table[i] = g_new(CPUTLBEntry, n_entries);
+        env->iotlb[i] = g_new(CPUIOTLBEntry, n_entries);
      }
-    return count;
  }
  
-/* This is OK because CPU architectures generally permit an
- * implementation to drop entries from the TLB at any time, so
- * flushing more entries than required is only an efficiency issue,
- * not a correctness issue.
+/**
+ * tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
+ * @env: CPU that owns the TLB
+ * @mmu_idx: MMU index of the TLB
+ *
+ * Called with tlb_lock_held.
+ *
+ * We have two main constraints when resizing a TLB: (1) we only resize it
+ * on a TLB flush (otherwise we'd have to take a perf hit by either rehashing
+ * the array or unnecessarily flushing it), which means we do not control how
+ * frequently the resizing can occur; (2) we don't have access to the guest's
+ * future scheduling decisions, and therefore have to decide the magnitude of
+ * the resize based on past observations.
+ *
+ * In general, a memory-hungry process can benefit greatly from an appropriately
+ * sized TLB, since a guest TLB miss is very expensive. This doesn't mean that
+ * we just have to make the TLB as large as possible; while an oversized TLB
+ * results in minimal TLB miss rates, it also takes longer to be flushed
+ * (flushes can be _very_ frequent), and the reduced locality can also hurt
+ * performance.
+ *
+ * To achieve near-optimal performance for all kinds of workloads, we:
+ *
+ * 1. Aggressively increase the size of the TLB when the use rate of the
+ * TLB being flushed is high, since it is likely that in the near future this
+ * memory-hungry process will execute again, and its memory hungriness will
+ * probably be similar.
+ *
+ * 2. Slowly reduce the size of the TLB as the use rate declines over a
+ * reasonably large time window. The rationale is that if in such a time window
+ * we have not observed a high TLB use rate, it is likely that we won't observe
+ * it in the near future. In that case, once a time window expires we downsize
+ * the TLB to match the maximum use rate observed in the window.
+ *
+ * 3. Try to keep the maximum use rate in a time window in the 30-70% range,
+ * since in that range performance is likely near-optimal. Recall that the TLB
+ * is direct mapped, so we want the use rate to be low (or at least not too
+ * high), since otherwise we are likely to have a significant amount of
+ * conflict misses.
   */
-static void tlb_flush_nocheck(CPUState *cpu)
+static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
  {
-    CPUArchState *env = cpu->env_ptr;
+    CPUTLBDesc *desc = &env->tlb_d[mmu_idx];
+    size_t old_size = tlb_n_entries(env, mmu_idx);
+    size_t rate;
+    size_t new_size = old_size;
+    int64_t now = get_clock_realtime();
+    int64_t window_len_ms = 100;
+    int64_t window_len_ns = window_len_ms * 1000 * 1000;
+    bool window_expired = now > desc->window.begin_ns + window_len_ns;
+
+    if (desc->n_used_entries > desc->window.max_entries) {
+        desc->window.max_entries = desc->n_used_entries;
+    }
+    rate = desc->window.max_entries * 100 / old_size;
+
+    if (rate > 70) {
+        new_size = MIN(old_size << 1, 1 << CPU_TLB_DYN_MAX_BITS);
+    } else if (rate < 30 && window_expired) {
+        size_t ceil = pow2ceil(desc->window.max_entries);
+        size_t expected_rate = desc->window.max_entries * 100 / ceil;
+
+        /*
+         * Avoid undersizing when the max number of entries seen is just below
+         * a pow2. For instance, if max_entries == 1025, the expected use rate
+         * would be 1025/2048==50%. However, if max_entries == 1023, we'd get
+         * 1023/1024==99.9% use rate, so we'd likely end up doubling the size
+         * later. Thus, make sure that the expected use rate remains below 70%.
+         * (and since we double the size, that means the lowest rate we'd
+         * expect to get is 35%, which is still in the 30-70% range where
+         * we consider that the size is appropriate.)
+         */
+        if (expected_rate > 70) {
+            ceil *= 2;
+        }
+        new_size = MAX(ceil, 1 << CPU_TLB_DYN_MIN_BITS);
+    }
  
-    /* The QOM tests will trigger tlb_flushes without setting up TCG
-     * so we bug out here in that case.
-     */
-    if (!tcg_enabled()) {
+    if (new_size == old_size) {
+        if (window_expired) {
+            tlb_window_reset(&desc->window, now, desc->n_used_entries);
+        }
          return;
      }
  
-    assert_cpu_is_self(cpu);
-    atomic_set(&env->tlb_flush_count, env->tlb_flush_count + 1);
-    tlb_debug("(count: %zu)\n", tlb_flush_count());
+    g_free(env->tlb_table[mmu_idx]);
+    g_free(env->iotlb[mmu_idx]);
  
-    tb_lock();
+    tlb_window_reset(&desc->window, now, 0);
+    /* desc->n_used_entries is cleared by the caller */
+    env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
+    env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
+    env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
+    /*
+     * If the allocations fail, try smaller sizes. We just freed some
+     * memory, so going back to half of new_size has a good chance of working.
+     * Increased memory pressure elsewhere in the system might cause the
+     * allocations to fail though, so we progressively reduce the allocation
+     * size, aborting if we cannot even allocate the smallest TLB we support.
+     */
+    while (env->tlb_table[mmu_idx] == NULL || env->iotlb[mmu_idx] == NULL) {
+        if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
+            error_report("%s: %s", __func__, strerror(errno));
+            abort();
+        }
+        new_size = MAX(new_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS);
+        env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
  
-    memset(env->tlb_table, -1, sizeof(env->tlb_table));
-    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
-    cpu_tb_jmp_cache_clear(cpu);
+        g_free(env->tlb_table[mmu_idx]);
+        g_free(env->iotlb[mmu_idx]);
+        env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
+        env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
+    }
+}
  
-    env->vtlb_index = 0;
-    env->tlb_flush_addr = -1;
-    env->tlb_flush_mask = 0;
+static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx)
+{
+    tlb_mmu_resize_locked(env, mmu_idx);
+    memset(env->tlb_table[mmu_idx], -1, sizeof_tlb(env, mmu_idx));
+    env->tlb_d[mmu_idx].n_used_entries = 0;
+}
  
-    tb_unlock();
+static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
+{
+    env->tlb_d[mmu_idx].n_used_entries++;
+}
  
-    atomic_mb_set(&cpu->pending_tlb_flush, 0);
+static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx)
+{
+    env->tlb_d[mmu_idx].n_used_entries--;
  }
  
-static void tlb_flush_global_async_work(CPUState *cpu, run_on_cpu_data data)
+void tlb_init(CPUState *cpu)
  {
-    tlb_flush_nocheck(cpu);
+    CPUArchState *env = cpu->env_ptr;
+
+    qemu_spin_init(&env->tlb_c.lock);
+
+    /* Ensure that cpu_reset performs a full flush.  */
+    env->tlb_c.dirty = ALL_MMUIDX_BITS;
+
+    tlb_dyn_init(env);
  }
  
-void tlb_flush(CPUState *cpu)
+/* flush_all_helper: run fn across all cpus
+ *
+ * If the wait flag is set then the src cpu's helper will be queued as
+ * "safe" work and the loop exited creating a synchronisation point
+ * where all queued work will be finished before execution starts
+ * again.
+ */
+static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
+                             run_on_cpu_data d)
  {
-    if (cpu->created && !qemu_cpu_is_self(cpu)) {
-        if (atomic_mb_read(&cpu->pending_tlb_flush) != ALL_MMUIDX_BITS) {
-            atomic_mb_set(&cpu->pending_tlb_flush, ALL_MMUIDX_BITS);
-            async_run_on_cpu(cpu, tlb_flush_global_async_work,
-                             RUN_ON_CPU_NULL);
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        if (cpu != src) {
+            async_run_on_cpu(cpu, fn, d);
          }
-    } else {
-        tlb_flush_nocheck(cpu);
      }
  }
  
-void tlb_flush_all_cpus(CPUState *src_cpu)
+void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)
  {
-    const run_on_cpu_func fn = tlb_flush_global_async_work;
-    flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL);
-    fn(src_cpu, RUN_ON_CPU_NULL);
+    CPUState *cpu;
+    size_t full = 0, part = 0, elide = 0;
+
+    CPU_FOREACH(cpu) {
+        CPUArchState *env = cpu->env_ptr;
+
+        full += atomic_read(&env->tlb_c.full_flush_count);
+        part += atomic_read(&env->tlb_c.part_flush_count);
+        elide += atomic_read(&env->tlb_c.elide_flush_count);
+    }
+    *pfull = full;
+    *ppart = part;
+    *pelide = elide;
  }
  
-void tlb_flush_all_cpus_synced(CPUState *src_cpu)
+static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
  {
-    const run_on_cpu_func fn = tlb_flush_global_async_work;
-    flush_all_helper(src_cpu, fn, RUN_ON_CPU_NULL);
-    async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_NULL);
+    tlb_table_flush_by_mmuidx(env, mmu_idx);
+    memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
+    env->tlb_d[mmu_idx].large_page_addr = -1;
+    env->tlb_d[mmu_idx].large_page_mask = -1;
+    env->tlb_d[mmu_idx].vindex = 0;
  }
  
  static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
  {
      CPUArchState *env = cpu->env_ptr;
-    unsigned long mmu_idx_bitmask = data.host_int;
-    int mmu_idx;
+    uint16_t asked = data.host_int;
+    uint16_t all_dirty, work, to_clean;
  
      assert_cpu_is_self(cpu);
  
-    tb_lock();
-
-    tlb_debug("start: mmu_idx:0x%04lx\n", mmu_idx_bitmask);
+    tlb_debug("mmu_idx:0x%04" PRIx16 "\n", asked);
  
-    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+    qemu_spin_lock(&env->tlb_c.lock);
  
-        if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
-            tlb_debug("%d\n", mmu_idx);
+    all_dirty = env->tlb_c.dirty;
+    to_clean = asked & all_dirty;
+    all_dirty &= ~to_clean;
+    env->tlb_c.dirty = all_dirty;
  
-            memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
-            memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
-        }
+    for (work = to_clean; work != 0; work &= work - 1) {
+        int mmu_idx = ctz32(work);
+        tlb_flush_one_mmuidx_locked(env, mmu_idx);
      }
  
-    cpu_tb_jmp_cache_clear(cpu);
+    qemu_spin_unlock(&env->tlb_c.lock);
  
-    tlb_debug("done\n");
+    cpu_tb_jmp_cache_clear(cpu);
  
-    tb_unlock();
+    if (to_clean == ALL_MMUIDX_BITS) {
+        atomic_set(&env->tlb_c.full_flush_count,
+                   env->tlb_c.full_flush_count + 1);
+    } else {
+        atomic_set(&env->tlb_c.part_flush_count,
+                   env->tlb_c.part_flush_count + ctpop16(to_clean));
+        if (to_clean != asked) {
+            atomic_set(&env->tlb_c.elide_flush_count,
+                       env->tlb_c.elide_flush_count +
+                       ctpop16(asked & ~to_clean));
+        }
+    }
  }
  
  void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
  {
      tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
  
-    if (!qemu_cpu_is_self(cpu)) {
-        uint16_t pending_flushes = idxmap;
-        pending_flushes &= ~atomic_mb_read(&cpu->pending_tlb_flush);
-
-        if (pending_flushes) {
-            tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", pending_flushes);
-
-            atomic_or(&cpu->pending_tlb_flush, pending_flushes);
-            async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
-                             RUN_ON_CPU_HOST_INT(pending_flushes));
-        }
+    if (cpu->created && !qemu_cpu_is_self(cpu)) {
+        async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
+                         RUN_ON_CPU_HOST_INT(idxmap));
      } else {
-        tlb_flush_by_mmuidx_async_work(cpu,
-                                       RUN_ON_CPU_HOST_INT(idxmap));
+        tlb_flush_by_mmuidx_async_work(cpu, RUN_ON_CPU_HOST_INT(idxmap));
      }
  }
  
+void tlb_flush(CPUState *cpu)
+{
+    tlb_flush_by_mmuidx(cpu, ALL_MMUIDX_BITS);
+}
+
  void tlb_flush_by_mmuidx_all_cpus(CPUState *src_cpu, uint16_t idxmap)
  {
      const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work;
@@ -232,8 +358,12 @@ void tlb_flush_by_mmuidx_all_cpus(CPUState *src_cpu, uint16_t idxmap)
      fn(src_cpu, RUN_ON_CPU_HOST_INT(idxmap));
  }
  
-void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
-                                                       uint16_t idxmap)
+void tlb_flush_all_cpus(CPUState *src_cpu)
+{
+    tlb_flush_by_mmuidx_all_cpus(src_cpu, ALL_MMUIDX_BITS);
+}
+
+void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu, uint16_t idxmap)
  {
      const run_on_cpu_func fn = tlb_flush_by_mmuidx_async_work;
  
@@ -243,67 +373,70 @@ void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
      async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_HOST_INT(idxmap));
  }
  
-
-
-static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
+void tlb_flush_all_cpus_synced(CPUState *src_cpu)
  {
-    if (addr == (tlb_entry->addr_read &
-                 (TARGET_PAGE_MASK | TLB_INVALID_MASK)) ||
-        addr == (tlb_entry->addr_write &
-                 (TARGET_PAGE_MASK | TLB_INVALID_MASK)) ||
-        addr == (tlb_entry->addr_code &
-                 (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
-        memset(tlb_entry, -1, sizeof(*tlb_entry));
-    }
+    tlb_flush_by_mmuidx_all_cpus_synced(src_cpu, ALL_MMUIDX_BITS);
  }
  
-static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
+static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
+                                        target_ulong page)
  {
-    CPUArchState *env = cpu->env_ptr;
-    target_ulong addr = (target_ulong) data.target_ptr;
-    int i;
-    int mmu_idx;
-
-    assert_cpu_is_self(cpu);
-
-    tlb_debug("page :" TARGET_FMT_lx "\n", addr);
+    return tlb_hit_page(tlb_entry->addr_read, page) ||
+           tlb_hit_page(tlb_addr_write(tlb_entry), page) ||
+           tlb_hit_page(tlb_entry->addr_code, page);
+}
  
-    /* Check if we need to flush due to large pages.  */
-    if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
-        tlb_debug("forcing full flush ("
-                  TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
-                  env->tlb_flush_addr, env->tlb_flush_mask);
+/**
+ * tlb_entry_is_empty - return true if the entry is not in use
+ * @te: pointer to CPUTLBEntry
+ */
+static inline bool tlb_entry_is_empty(const CPUTLBEntry *te)
+{
+    return te->addr_read == -1 && te->addr_write == -1 && te->addr_code == -1;
+}
  
-        tlb_flush(cpu);
-        return;
+/* Called with tlb_c.lock held */
+static inline bool tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
+                                          target_ulong page)
+{
+    if (tlb_hit_page_anyprot(tlb_entry, page)) {
+        memset(tlb_entry, -1, sizeof(*tlb_entry));
+        return true;
      }
+    return false;
+}
  
-    addr &= TARGET_PAGE_MASK;
-    i = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr);
-    }
+/* Called with tlb_c.lock held */
+static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,
+                                              target_ulong page)
+{
+    int k;
  
-    /* check whether there are entries that need to be flushed in the vtlb */
-    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        int k;
-        for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
+    assert_cpu_is_self(ENV_GET_CPU(env));
+    for (k = 0; k < CPU_VTLB_SIZE; k++) {
+        if (tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page)) {
+            tlb_n_used_entries_dec(env, mmu_idx);
          }
      }
-
-    tb_flush_jmp_cache(cpu, addr);
  }
  
-void tlb_flush_page(CPUState *cpu, target_ulong addr)
+static void tlb_flush_page_locked(CPUArchState *env, int midx,
+                                  target_ulong page)
  {
-    tlb_debug("page :" TARGET_FMT_lx "\n", addr);
+    target_ulong lp_addr = env->tlb_d[midx].large_page_addr;
+    target_ulong lp_mask = env->tlb_d[midx].large_page_mask;
  
-    if (!qemu_cpu_is_self(cpu)) {
-        async_run_on_cpu(cpu, tlb_flush_page_async_work,
-                         RUN_ON_CPU_TARGET_PTR(addr));
+    /* Check if we need to flush due to large pages.  */
+    if ((page & lp_mask) == lp_addr) {
+        tlb_debug("forcing full flush midx %d ("
+                  TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
+                  midx, lp_addr, lp_mask);
+        tlb_flush_one_mmuidx_locked(env, midx);
      } else {
-        tlb_flush_page_async_work(cpu, RUN_ON_CPU_TARGET_PTR(addr));
+        if (tlb_flush_entry_locked(tlb_entry(env, midx, page), page)) {
+            tlb_n_used_entries_dec(env, midx);
+        }
+        tlb_flush_vtlb_page_locked(env, midx, page);
      }
  }
  
@@ -319,52 +452,24 @@ static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
      target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
      target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
      unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
-    int page = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
      int mmu_idx;
-    int i;
  
      assert_cpu_is_self(cpu);
  
-    tlb_debug("page:%d addr:"TARGET_FMT_lx" mmu_idx:0x%lx\n",
-              page, addr, mmu_idx_bitmap);
+    tlb_debug("page addr:" TARGET_FMT_lx " mmu_map:0x%lx\n",
+              addr, mmu_idx_bitmap);
  
+    qemu_spin_lock(&env->tlb_c.lock);
      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
          if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
-            tlb_flush_entry(&env->tlb_table[mmu_idx][page], addr);
-
-            /* check whether there are vltb entries that need to be flushed */
-            for (i = 0; i < CPU_VTLB_SIZE; i++) {
-                tlb_flush_entry(&env->tlb_v_table[mmu_idx][i], addr);
-            }
+            tlb_flush_page_locked(env, mmu_idx, addr);
          }
      }
+    qemu_spin_unlock(&env->tlb_c.lock);
  
      tb_flush_jmp_cache(cpu, addr);
  }
  
-static void tlb_check_page_and_flush_by_mmuidx_async_work(CPUState *cpu,
-                                                          run_on_cpu_data data)
-{
-    CPUArchState *env = cpu->env_ptr;
-    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
-    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
-    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
-
-    tlb_debug("addr:"TARGET_FMT_lx" mmu_idx: %04lx\n", addr, mmu_idx_bitmap);
-
-    /* Check if we need to flush due to large pages.  */
-    if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
-        tlb_debug("forced full flush ("
-                  TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
-                  env->tlb_flush_addr, env->tlb_flush_mask);
-
-        tlb_flush_by_mmuidx_async_work(cpu,
-                                       RUN_ON_CPU_HOST_INT(mmu_idx_bitmap));
-    } else {
-        tlb_flush_page_by_mmuidx_async_work(cpu, data);
-    }
-}
-
  void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
  {
      target_ulong addr_and_mmu_idx;
@@ -376,18 +481,23 @@ void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
      addr_and_mmu_idx |= idxmap;
  
      if (!qemu_cpu_is_self(cpu)) {
-        async_run_on_cpu(cpu, tlb_check_page_and_flush_by_mmuidx_async_work,
+        async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_work,
                           RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
      } else {
-        tlb_check_page_and_flush_by_mmuidx_async_work(
+        tlb_flush_page_by_mmuidx_async_work(
              cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
      }
  }
  
+void tlb_flush_page(CPUState *cpu, target_ulong addr)
+{
+    tlb_flush_page_by_mmuidx(cpu, addr, ALL_MMUIDX_BITS);
+}
+
  void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
                                         uint16_t idxmap)
  {
-    const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work;
+    const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work;
      target_ulong addr_and_mmu_idx;
  
      tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
@@ -400,11 +510,16 @@ void tlb_flush_page_by_mmuidx_all_cpus(CPUState *src_cpu, target_ulong addr,
      fn(src_cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
  }
  
+void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr)
+{
+    tlb_flush_page_by_mmuidx_all_cpus(src, addr, ALL_MMUIDX_BITS);
+}
+
  void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
-                                                            target_ulong addr,
-                                                            uint16_t idxmap)
+                                              target_ulong addr,
+                                              uint16_t idxmap)
  {
-    const run_on_cpu_func fn = tlb_check_page_and_flush_by_mmuidx_async_work;
+    const run_on_cpu_func fn = tlb_flush_page_by_mmuidx_async_work;
      target_ulong addr_and_mmu_idx;
  
      tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%"PRIx16"\n", addr, idxmap);
@@ -417,21 +532,9 @@ void tlb_flush_page_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
      async_safe_run_on_cpu(src_cpu, fn, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
  }
  
-void tlb_flush_page_all_cpus(CPUState *src, target_ulong addr)
+void tlb_flush_page_all_cpus_synced(CPUState *src, target_ulong addr)
  {
-    const run_on_cpu_func fn = tlb_flush_page_async_work;
-
-    flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
-    fn(src, RUN_ON_CPU_TARGET_PTR(addr));
-}
-
-void tlb_flush_page_all_cpus_synced(CPUState *src,
-                                                  target_ulong addr)
-{
-    const run_on_cpu_func fn = tlb_flush_page_async_work;
-
-    flush_all_helper(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
-    async_safe_run_on_cpu(src, fn, RUN_ON_CPU_TARGET_PTR(addr));
+    tlb_flush_page_by_mmuidx_all_cpus_synced(src, addr, ALL_MMUIDX_BITS);
  }
  
  /* update the TLBs so that writes to code in the virtual page 'addr'
@@ -460,72 +563,44 @@ void tlb_unprotect_code(ram_addr_t ram_addr)
   * most usual is detecting writes to code regions which may invalidate
   * generated code.
   *
- * Because we want other vCPUs to respond to changes straight away we
- * update the te->addr_write field atomically. If the TLB entry has
- * been changed by the vCPU in the mean time we skip the update.
+ * Other vCPUs might be reading their TLBs during guest execution, so we update
+ * te->addr_write with atomic_set. We don't need to worry about this for
+ * oversized guests as MTTCG is disabled for them.
   *
- * As this function uses atomic accesses we also need to ensure
- * updates to tlb_entries follow the same access rules. We don't need
- * to worry about this for oversized guests as MTTCG is disabled for
- * them.
+ * Called with tlb_c.lock held.
   */
-
-static void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
-                           uintptr_t length)
+static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
+                                         uintptr_t start, uintptr_t length)
  {
-#if TCG_OVERSIZED_GUEST
      uintptr_t addr = tlb_entry->addr_write;
  
      if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
          addr &= TARGET_PAGE_MASK;
          addr += tlb_entry->addend;
          if ((addr - start) < length) {
+#if TCG_OVERSIZED_GUEST
              tlb_entry->addr_write |= TLB_NOTDIRTY;
-        }
-    }
  #else
-    /* paired with atomic_mb_set in tlb_set_page_with_attrs */
-    uintptr_t orig_addr = atomic_mb_read(&tlb_entry->addr_write);
-    uintptr_t addr = orig_addr;
-
-    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
-        addr &= TARGET_PAGE_MASK;
-        addr += atomic_read(&tlb_entry->addend);
-        if ((addr - start) < length) {
-            uintptr_t notdirty_addr = orig_addr | TLB_NOTDIRTY;
-            atomic_cmpxchg(&tlb_entry->addr_write, orig_addr, notdirty_addr);
+            atomic_set(&tlb_entry->addr_write,
+                       tlb_entry->addr_write | TLB_NOTDIRTY);
+#endif
          }
      }
-#endif
  }
  
-/* For atomic correctness when running MTTCG we need to use the right
- * primitives when copying entries */
-static inline void copy_tlb_helper(CPUTLBEntry *d, CPUTLBEntry *s,
-                                   bool atomic_set)
+/*
+ * Called with tlb_c.lock held.
+ * Called only from the vCPU context, i.e. the TLB's owner thread.
+ */
+static inline void copy_tlb_helper_locked(CPUTLBEntry *d, const CPUTLBEntry *s)
  {
-#if TCG_OVERSIZED_GUEST
      *d = *s;
-#else
-    if (atomic_set) {
-        d->addr_read = s->addr_read;
-        d->addr_code = s->addr_code;
-        atomic_set(&d->addend, atomic_read(&s->addend));
-        /* Pairs with flag setting in tlb_reset_dirty_range */
-        atomic_mb_set(&d->addr_write, atomic_read(&s->addr_write));
-    } else {
-        d->addr_read = s->addr_read;
-        d->addr_write = atomic_read(&s->addr_write);
-        d->addr_code = s->addr_code;
-        d->addend = atomic_read(&s->addend);
-    }
-#endif
  }
  
  /* This is a cross vCPU call (i.e. another vCPU resetting the flags of
- * the target vCPU). As such care needs to be taken that we don't
- * dangerously race with another vCPU update. The only thing actually
- * updated is the target TLB entry ->addr_write flags.
+ * the target vCPU).
+ * We must take tlb_c.lock to avoid racing with another vCPU update. The only
+ * thing actually updated is the target TLB entry ->addr_write flags.
   */
  void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
  {
@@ -534,22 +609,27 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
      int mmu_idx;
  
      env = cpu->env_ptr;
+    qemu_spin_lock(&env->tlb_c.lock);
      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
          unsigned int i;
+        unsigned int n = tlb_n_entries(env, mmu_idx);
  
-        for (i = 0; i < CPU_TLB_SIZE; i++) {
-            tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
-                                  start1, length);
+        for (i = 0; i < n; i++) {
+            tlb_reset_dirty_range_locked(&env->tlb_table[mmu_idx][i], start1,
+                                         length);
          }
  
          for (i = 0; i < CPU_VTLB_SIZE; i++) {
-            tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
-                                  start1, length);
+            tlb_reset_dirty_range_locked(&env->tlb_v_table[mmu_idx][i], start1,
+                                         length);
          }
      }
+    qemu_spin_unlock(&env->tlb_c.lock);
  }
  
-static inline void tlb_set_dirty1(CPUTLBEntry *tlb_entry, target_ulong vaddr)
+/* Called with tlb_c.lock held */
+static inline void tlb_set_dirty1_locked(CPUTLBEntry *tlb_entry,
+                                         target_ulong vaddr)
  {
      if (tlb_entry->addr_write == (vaddr | TLB_NOTDIRTY)) {
          tlb_entry->addr_write = vaddr;
@@ -561,46 +641,47 @@ static inline void tlb_set_dirty1(CPUTLBEntry *tlb_entry, target_ulong vaddr)
  void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
  {
      CPUArchState *env = cpu->env_ptr;
-    int i;
      int mmu_idx;
  
      assert_cpu_is_self(cpu);
  
      vaddr &= TARGET_PAGE_MASK;
-    i = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+    qemu_spin_lock(&env->tlb_c.lock);
      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_set_dirty1(&env->tlb_table[mmu_idx][i], vaddr);
+        tlb_set_dirty1_locked(tlb_entry(env, mmu_idx, vaddr), vaddr);
      }
  
      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
          int k;
          for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
+            tlb_set_dirty1_locked(&env->tlb_v_table[mmu_idx][k], vaddr);
          }
      }
+    qemu_spin_unlock(&env->tlb_c.lock);
  }
  
  /* Our TLB does not support large pages, so remember the area covered by
     large pages and trigger a full TLB flush if these are invalidated.  */
-static void tlb_add_large_page(CPUArchState *env, target_ulong vaddr,
-                               target_ulong size)
+static void tlb_add_large_page(CPUArchState *env, int mmu_idx,
+                               target_ulong vaddr, target_ulong size)
  {
-    target_ulong mask = ~(size - 1);
+    target_ulong lp_addr = env->tlb_d[mmu_idx].large_page_addr;
+    target_ulong lp_mask = ~(size - 1);
  
-    if (env->tlb_flush_addr == (target_ulong)-1) {
-        env->tlb_flush_addr = vaddr & mask;
-        env->tlb_flush_mask = mask;
-        return;
-    }
-    /* Extend the existing region to include the new page.
-       This is a compromise between unnecessary flushes and the cost
-       of maintaining a full variable size TLB.  */
-    mask &= env->tlb_flush_mask;
-    while (((env->tlb_flush_addr ^ vaddr) & mask) != 0) {
-        mask <<= 1;
+    if (lp_addr == (target_ulong)-1) {
+        /* No previous large page.  */
+        lp_addr = vaddr;
+    } else {
+        /* Extend the existing region to include the new page.
+           This is a compromise between unnecessary flushes and
+           the cost of maintaining a full variable size TLB.  */
+        lp_mask &= env->tlb_d[mmu_idx].large_page_mask;
+        while (((lp_addr ^ vaddr) & lp_mask) != 0) {
+            lp_mask <<= 1;
+        }
      }
-    env->tlb_flush_addr &= mask;
-    env->tlb_flush_mask = mask;
+    env->tlb_d[mmu_idx].large_page_addr = lp_addr & lp_mask;
+    env->tlb_d[mmu_idx].large_page_mask = lp_mask;
  }
  
  /* Add a new TLB entry. At most one entry for a given virtual address
@@ -620,28 +701,40 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
      target_ulong address;
      target_ulong code_address;
      uintptr_t addend;
-    CPUTLBEntry *te, *tv, tn;
-    hwaddr iotlb, xlat, sz;
-    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
+    CPUTLBEntry *te, tn;
+    hwaddr iotlb, xlat, sz, paddr_page;
+    target_ulong vaddr_page;
      int asidx = cpu_asidx_from_attrs(cpu, attrs);
  
      assert_cpu_is_self(cpu);
-    assert(size >= TARGET_PAGE_SIZE);
-    if (size != TARGET_PAGE_SIZE) {
-        tlb_add_large_page(env, vaddr, size);
+
+    if (size <= TARGET_PAGE_SIZE) {
+        sz = TARGET_PAGE_SIZE;
+    } else {
+        tlb_add_large_page(env, mmu_idx, vaddr, size);
+        sz = size;
      }
+    vaddr_page = vaddr & TARGET_PAGE_MASK;
+    paddr_page = paddr & TARGET_PAGE_MASK;
  
-    sz = size;
-    section = address_space_translate_for_iotlb(cpu, asidx, paddr, &xlat, &sz,
-                                                attrs, &prot);
+    section = address_space_translate_for_iotlb(cpu, asidx, paddr_page,
+                                                &xlat, &sz, attrs, &prot);
      assert(sz >= TARGET_PAGE_SIZE);
  
      tlb_debug("vaddr=" TARGET_FMT_lx " paddr=0x" TARGET_FMT_plx
                " prot=%x idx=%d\n",
                vaddr, paddr, prot, mmu_idx);
  
-    address = vaddr;
-    if (!memory_region_is_ram(section->mr) && !memory_region_is_romd(section->mr)) {
+    address = vaddr_page;
+    if (size < TARGET_PAGE_SIZE) {
+        /*
+         * Slow-path the TLB entries; we will repeat the MMU check and TLB
+         * fill on every access.
+         */
+        address |= TLB_RECHECK;
+    }
+    if (!memory_region_is_ram(section->mr) &&
+        !memory_region_is_romd(section->mr)) {
          /* IO memory case */
          address |= TLB_MMIO;
          addend = 0;
@@ -651,18 +744,40 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
      }
  
      code_address = address;
-    iotlb = memory_region_section_get_iotlb(cpu, section, vaddr, paddr, xlat,
-                                            prot, &address);
+    iotlb = memory_region_section_get_iotlb(cpu, section, vaddr_page,
+                                            paddr_page, xlat, prot, &address);
+
+    index = tlb_index(env, mmu_idx, vaddr_page);
+    te = tlb_entry(env, mmu_idx, vaddr_page);
+
+    /*
+     * Hold the TLB lock for the rest of the function. We could acquire/release
+     * the lock several times in the function, but it is faster to amortize the
+     * acquisition cost by acquiring it just once. Note that this leads to
+     * a longer critical section, but this is not a concern since the TLB lock
+     * is unlikely to be contended.
+     */
+    qemu_spin_lock(&env->tlb_c.lock);
  
-    index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    te = &env->tlb_table[mmu_idx][index];
-    /* do not discard the translation in te, evict it into a victim tlb */
-    tv = &env->tlb_v_table[mmu_idx][vidx];
+    /* Note that the tlb is no longer clean.  */
+    env->tlb_c.dirty |= 1 << mmu_idx;
  
-    /* addr_write can race with tlb_reset_dirty_range */
-    copy_tlb_helper(tv, te, true);
+    /* Make sure there's no cached translation for the new page.  */
+    tlb_flush_vtlb_page_locked(env, mmu_idx, vaddr_page);
  
-    env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
+    /*
+     * Only evict the old entry to the victim tlb if it's for a
+     * different page; otherwise just overwrite the stale data.
+     */
+    if (!tlb_hit_page_anyprot(te, vaddr_page) && !tlb_entry_is_empty(te)) {
+        unsigned vidx = env->tlb_d[mmu_idx].vindex++ % CPU_VTLB_SIZE;
+        CPUTLBEntry *tv = &env->tlb_v_table[mmu_idx][vidx];
+
+        /* Evict the old entry into the victim tlb.  */
+        copy_tlb_helper_locked(tv, te);
+        env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
+        tlb_n_used_entries_dec(env, mmu_idx);
+    }
  
      /* refill the tlb */
      /*
@@ -670,18 +785,18 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
       * TARGET_PAGE_BITS, and either
       *  + the ram_addr_t of the page base of the target RAM (if NOTDIRTY or ROM)
       *  + the offset within section->mr of the page base (otherwise)
-     * We subtract the vaddr (which is page aligned and thus won't
+     * We subtract the vaddr_page (which is page aligned and thus won't
       * disturb the low bits) to give an offset which can be added to the
       * (non-page-aligned) vaddr of the eventual memory access to get
       * the MemoryRegion offset for the access. Note that the vaddr we
       * subtract here is that of the page base, and not the same as the
       * vaddr we add back in io_readx()/io_writex()/get_page_addr_code().
       */
-    env->iotlb[mmu_idx][index].addr = iotlb - vaddr;
+    env->iotlb[mmu_idx][index].addr = iotlb - vaddr_page;
      env->iotlb[mmu_idx][index].attrs = attrs;
  
      /* Now calculate the new entry */
-    tn.addend = addend - vaddr;
+    tn.addend = addend - vaddr_page;
      if (prot & PAGE_READ) {
          tn.addr_read = address;
      } else {
@@ -702,7 +817,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
              tn.addr_write = address | TLB_MMIO;
          } else if (memory_region_is_ram(section->mr)
                     && cpu_physical_memory_is_clean(
-                        memory_region_get_ram_addr(section->mr) + xlat)) {
+                       memory_region_get_ram_addr(section->mr) + xlat)) {
              tn.addr_write = address | TLB_NOTDIRTY;
          } else {
              tn.addr_write = address;
@@ -712,9 +827,9 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
          }
      }
  
-    /* Pairs with flag setting in tlb_reset_dirty_range */
-    copy_tlb_helper(te, &tn, true);
-    /* atomic_mb_set(&te->addr_write, write_address); */
+    copy_tlb_helper_locked(te, &tn);
+    tlb_n_used_entries_inc(env, mmu_idx);
+    qemu_spin_unlock(&env->tlb_c.lock);
  }
  
  /* Add a new TLB entry, but without specifying the memory
@@ -728,39 +843,6 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
                              prot, mmu_idx, size);
  }
  
-static void report_bad_exec(CPUState *cpu, target_ulong addr)
-{
-    /* Accidentally executing outside RAM or ROM is quite common for
-     * several user-error situations, so report it in a way that
-     * makes it clear that this isn't a QEMU bug and provide suggestions
-     * about what a user could do to fix things.
-     */
-    error_report("Trying to execute code outside RAM or ROM at 0x"
-                 TARGET_FMT_lx, addr);
-    error_printf("This usually means one of the following happened:\n\n"
-                 "(1) You told QEMU to execute a kernel for the wrong machine "
-                 "type, and it crashed on startup (eg trying to run a "
-                 "raspberry pi kernel on a versatilepb QEMU machine)\n"
-                 "(2) You didn't give QEMU a kernel or BIOS filename at all, "
-                 "and QEMU executed a ROM full of no-op instructions until "
-                 "it fell off the end\n"
-                 "(3) Your guest kernel has a bug and crashed by jumping "
-                 "off into nowhere\n\n"
-                 "This is almost always one of the first two, so check your "
-                 "command line and that you are using the right type of kernel "
-                 "for this machine.\n"
-                 "If you think option (3) is likely then you can try debugging "
-                 "your guest with the -d debug options; in particular "
-                 "-d guest_errors will cause the log to include a dump of the "
-                 "guest register state at this point.\n\n"
-                 "Execution cannot continue; stopping here.\n\n");
-
-    /* Report also to the logs, with more detail including register dump */
-    qemu_log_mask(LOG_GUEST_ERROR, "qemu: fatal: Trying to execute code "
-                  "outside RAM or ROM at 0x" TARGET_FMT_lx "\n", addr);
-    log_cpu_state_mask(LOG_GUEST_ERROR, cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP);
-}
-
  static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
  {
      ram_addr_t ram_addr;
@@ -773,9 +855,29 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
      return ram_addr;
  }
  
+/*
+ * Note: tlb_fill() can trigger a resize of the TLB. This means that all of the
+ * caller's prior references to the TLB table (e.g. CPUTLBEntry pointers) must
+ * be discarded and looked up again (e.g. via tlb_entry()).
+ */
+static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
+                     MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+    bool ok;
+
+    /*
+     * This is not a probe, so only valid return is success; failure
+     * should result in exception + longjmp to the cpu loop.
+     */
+    ok = cc->tlb_fill(cpu, addr, size, access_type, mmu_idx, false, retaddr);
+    assert(ok);
+}
+
  static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
                           int mmu_idx,
-                         target_ulong addr, uintptr_t retaddr, int size)
+                         target_ulong addr, uintptr_t retaddr,
+                         bool recheck, MMUAccessType access_type, int size)
  {
      CPUState *cpu = ENV_GET_CPU(env);
      hwaddr mr_offset;
@@ -785,6 +887,30 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      bool locked = false;
      MemTxResult r;
  
+    if (recheck) {
+        /*
+         * This is a TLB_RECHECK access, where the MMU protection
+         * covers a smaller range than a target page, and we must
+         * repeat the MMU check here. This tlb_fill() call might
+         * longjump out if this access should cause a guest exception.
+         */
+        CPUTLBEntry *entry;
+        target_ulong tlb_addr;
+
+        tlb_fill(cpu, addr, size, access_type, mmu_idx, retaddr);
+
+        entry = tlb_entry(env, mmu_idx, addr);
+        tlb_addr = (access_type == MMU_DATA_LOAD ?
+                    entry->addr_read : entry->addr_code);
+        if (!(tlb_addr & ~(TARGET_PAGE_MASK | TLB_RECHECK))) {
+            /* RAM access */
+            uintptr_t haddr = addr + entry->addend;
+
+            return ldn_p((void *)haddr, size);
+        }
+        /* Fall through for handling IO accesses */
+    }
+
      section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
      mr = section->mr;
      mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
@@ -794,6 +920,7 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      }
  
      cpu->mem_io_vaddr = addr;
+    cpu->mem_io_access_type = access_type;
  
      if (mr->global_locking && !qemu_mutex_iothread_locked()) {
          qemu_mutex_lock_iothread();
@@ -806,7 +933,7 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
              section->offset_within_address_space -
              section->offset_within_region;
  
-        cpu_transaction_failed(cpu, physaddr, addr, size, MMU_DATA_LOAD,
+        cpu_transaction_failed(cpu, physaddr, addr, size, access_type,
                                 mmu_idx, iotlbentry->attrs, r, retaddr);
      }
      if (locked) {
@@ -819,7 +946,7 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
  static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
                        int mmu_idx,
                        uint64_t val, target_ulong addr,
-                      uintptr_t retaddr, int size)
+                      uintptr_t retaddr, bool recheck, int size)
  {
      CPUState *cpu = ENV_GET_CPU(env);
      hwaddr mr_offset;
@@ -828,6 +955,30 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      bool locked = false;
      MemTxResult r;
  
+    if (recheck) {
+        /*
+         * This is a TLB_RECHECK access, where the MMU protection
+         * covers a smaller range than a target page, and we must
+         * repeat the MMU check here. This tlb_fill() call might
+         * longjump out if this access should cause a guest exception.
+         */
+        CPUTLBEntry *entry;
+        target_ulong tlb_addr;
+
+        tlb_fill(cpu, addr, size, MMU_DATA_STORE, mmu_idx, retaddr);
+
+        entry = tlb_entry(env, mmu_idx, addr);
+        tlb_addr = tlb_addr_write(entry);
+        if (!(tlb_addr & ~(TARGET_PAGE_MASK | TLB_RECHECK))) {
+            /* RAM access */
+            uintptr_t haddr = addr + entry->addend;
+
+            stn_p((void *)haddr, size, val);
+            return;
+        }
+        /* Fall through for handling IO accesses */
+    }
+
      section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
      mr = section->mr;
      mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
@@ -856,23 +1007,37 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      }
  }
  
+static inline target_ulong tlb_read_ofs(CPUTLBEntry *entry, size_t ofs)
+{
+#if TCG_OVERSIZED_GUEST
+    return *(target_ulong *)((uintptr_t)entry + ofs);
+#else
+    /* ofs might correspond to .addr_write, so use atomic_read */
+    return atomic_read((target_ulong *)((uintptr_t)entry + ofs));
+#endif
+}
+
  /* Return true if ADDR is present in the victim tlb, and has been copied
     back to the main tlb.  */
  static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
                             size_t elt_ofs, target_ulong page)
  {
      size_t vidx;
+
+    assert_cpu_is_self(ENV_GET_CPU(env));
      for (vidx = 0; vidx < CPU_VTLB_SIZE; ++vidx) {
          CPUTLBEntry *vtlb = &env->tlb_v_table[mmu_idx][vidx];
-        target_ulong cmp = *(target_ulong *)((uintptr_t)vtlb + elt_ofs);
+        target_ulong cmp = tlb_read_ofs(vtlb, elt_ofs);
  
          if (cmp == page) {
              /* Found entry in victim tlb, swap tlb and iotlb.  */
              CPUTLBEntry tmptlb, *tlb = &env->tlb_table[mmu_idx][index];
  
-            copy_tlb_helper(&tmptlb, tlb, false);
-            copy_tlb_helper(tlb, vtlb, true);
-            copy_tlb_helper(vtlb, &tmptlb, true);
+            qemu_spin_lock(&env->tlb_c.lock);
+            copy_tlb_helper_locked(&tmptlb, tlb);
+            copy_tlb_helper_locked(tlb, vtlb);
+            copy_tlb_helper_locked(vtlb, &tmptlb);
+            qemu_spin_unlock(&env->tlb_c.lock);
  
              CPUIOTLBEntry tmpio, *io = &env->iotlb[mmu_idx][index];
              CPUIOTLBEntry *vio = &env->iotlb_v[mmu_idx][vidx];
@@ -895,61 +1060,33 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
   */
  tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
  {
-    int mmu_idx, index;
+    uintptr_t mmu_idx = cpu_mmu_index(env, true);
+    uintptr_t index = tlb_index(env, mmu_idx, addr);
+    CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
      void *p;
-    MemoryRegion *mr;
-    MemoryRegionSection *section;
-    CPUState *cpu = ENV_GET_CPU(env);
-    CPUIOTLBEntry *iotlbentry;
-    hwaddr physaddr, mr_offset;
-
-    index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    mmu_idx = cpu_mmu_index(env, true);
-    if (unlikely(env->tlb_table[mmu_idx][index].addr_code !=
-                 (addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK)))) {
-        if (!VICTIM_TLB_HIT(addr_read, addr)) {
+
+    if (unlikely(!tlb_hit(entry->addr_code, addr))) {
+        if (!VICTIM_TLB_HIT(addr_code, addr)) {
              tlb_fill(ENV_GET_CPU(env), addr, 0, MMU_INST_FETCH, mmu_idx, 0);
+            index = tlb_index(env, mmu_idx, addr);
+            entry = tlb_entry(env, mmu_idx, addr);
          }
+        assert(tlb_hit(entry->addr_code, addr));
      }
-    iotlbentry = &env->iotlb[mmu_idx][index];
-    section = iotlb_to_section(cpu, iotlbentry->addr, iotlbentry->attrs);
-    mr = section->mr;
-    if (memory_region_is_unassigned(mr)) {
-        qemu_mutex_lock_iothread();
-        if (memory_region_request_mmio_ptr(mr, addr)) {
-            qemu_mutex_unlock_iothread();
-            /* A MemoryRegion is potentially added so re-run the
-             * get_page_addr_code.
-             */
-            return get_page_addr_code(env, addr);
-        }
-        qemu_mutex_unlock_iothread();
-
-        /* Give the new-style cpu_transaction_failed() hook first chance
-         * to handle this.
-         * This is not the ideal place to detect and generate CPU
-         * exceptions for instruction fetch failure (for instance
-         * we don't know the length of the access that the CPU would
-         * use, and it would be better to go ahead and try the access
-         * and use the MemTXResult it produced). However it is the
-         * simplest place we have currently available for the check.
-         */
-        mr_offset = (iotlbentry->addr & TARGET_PAGE_MASK) + addr;
-        physaddr = mr_offset +
-            section->offset_within_address_space -
-            section->offset_within_region;
-        cpu_transaction_failed(cpu, physaddr, addr, 0, MMU_INST_FETCH, mmu_idx,
-                               iotlbentry->attrs, MEMTX_DECODE_ERROR, 0);
  
-        cpu_unassigned_access(cpu, addr, false, true, 0, 4);
-        /* The CPU's unassigned access hook might have longjumped out
-         * with an exception. If it didn't (or there was no hook) then
-         * we can't proceed further.
+    if (unlikely(entry->addr_code & (TLB_RECHECK | TLB_MMIO))) {
+        /*
+         * Return -1 if we can't translate and execute from an entire
+         * page of RAM here, which will cause us to execute by loading
+         * and translating one insn at a time, without caching:
+         *  - TLB_RECHECK: means the MMU protection covers a smaller range
+         *    than a target page, so we must redo the MMU check every insn
+         *  - TLB_MMIO: region is not backed by RAM
           */
-        report_bad_exec(cpu, addr);
-        exit(1);
+        return -1;
      }
-    p = (void *)((uintptr_t)addr + env->tlb_table[mmu_idx][index].addend);
+
+    p = (void *)((uintptr_t)addr + entry->addend);
      return qemu_ram_addr_from_host_nofail(p);
  }
  
@@ -962,11 +1099,10 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
  void probe_write(CPUArchState *env, target_ulong addr, int size, int mmu_idx,
                   uintptr_t retaddr)
  {
-    int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    target_ulong tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+    uintptr_t index = tlb_index(env, mmu_idx, addr);
+    CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
  
-    if ((addr & TARGET_PAGE_MASK)
-        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+    if (!tlb_hit(tlb_addr_write(entry), addr)) {
          /* TLB entry is for a different page */
          if (!VICTIM_TLB_HIT(addr_write, addr)) {
              tlb_fill(ENV_GET_CPU(env), addr, size, MMU_DATA_STORE,
@@ -975,6 +1111,56 @@ void probe_write(CPUArchState *env, target_ulong addr, int size, int mmu_idx,
      }
  }
  
+void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
+                        MMUAccessType access_type, int mmu_idx)
+{
+    CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
+    uintptr_t tlb_addr, page;
+    size_t elt_ofs;
+
+    switch (access_type) {
+    case MMU_DATA_LOAD:
+        elt_ofs = offsetof(CPUTLBEntry, addr_read);
+        break;
+    case MMU_DATA_STORE:
+        elt_ofs = offsetof(CPUTLBEntry, addr_write);
+        break;
+    case MMU_INST_FETCH:
+        elt_ofs = offsetof(CPUTLBEntry, addr_code);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    page = addr & TARGET_PAGE_MASK;
+    tlb_addr = tlb_read_ofs(entry, elt_ofs);
+
+    if (!tlb_hit_page(tlb_addr, page)) {
+        uintptr_t index = tlb_index(env, mmu_idx, addr);
+
+        if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page)) {
+            CPUState *cs = ENV_GET_CPU(env);
+            CPUClass *cc = CPU_GET_CLASS(cs);
+
+            if (!cc->tlb_fill(cs, addr, 0, access_type, mmu_idx, true, 0)) {
+                /* Non-faulting page table read failed.  */
+                return NULL;
+            }
+
+            /* TLB resize via tlb_fill may have moved the entry.  */
+            entry = tlb_entry(env, mmu_idx, addr);
+        }
+        tlb_addr = tlb_read_ofs(entry, elt_ofs);
+    }
+
+    if (tlb_addr & ~TARGET_PAGE_MASK) {
+        /* IO access */
+        return NULL;
+    }
+
+    return (void *)((uintptr_t)addr + entry->addend);
+}
+
  /* Probe for a read-modify-write atomic operation.  Do not allow unaligned
   * operations, or io operations to proceed.  Return the host address.  */
  static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
@@ -982,9 +1168,9 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
                                 NotDirtyInfo *ndi)
  {
      size_t mmu_idx = get_mmuidx(oi);
-    size_t index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    CPUTLBEntry *tlbe = &env->tlb_table[mmu_idx][index];
-    target_ulong tlb_addr = tlbe->addr_write;
+    uintptr_t index = tlb_index(env, mmu_idx, addr);
+    CPUTLBEntry *tlbe = tlb_entry(env, mmu_idx, addr);
+    target_ulong tlb_addr = tlb_addr_write(tlbe);
      TCGMemOp mop = get_memop(oi);
      int a_bits = get_alignment_bits(mop);
      int s_bits = mop & MO_SIZE;
@@ -1010,17 +1196,18 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
      }
  
      /* Check TLB entry and enforce page permissions.  */
-    if ((addr & TARGET_PAGE_MASK)
-        != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+    if (!tlb_hit(tlb_addr, addr)) {
          if (!VICTIM_TLB_HIT(addr_write, addr)) {
              tlb_fill(ENV_GET_CPU(env), addr, 1 << s_bits, MMU_DATA_STORE,
                       mmu_idx, retaddr);
+            index = tlb_index(env, mmu_idx, addr);
+            tlbe = tlb_entry(env, mmu_idx, addr);
          }
-        tlb_addr = tlbe->addr_write & ~TLB_INVALID_MASK;
+        tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
      }
  
-    /* Notice an IO access  */
-    if (unlikely(tlb_addr & TLB_MMIO)) {
+    /* Notice an IO access or a needs-MMU-lookup access */
+    if (unlikely(tlb_addr & (TLB_MMIO | TLB_RECHECK))) {
          /* There's really nothing that can be done to
             support this apart from stop-the-world.  */
          goto stop_the_world;
@@ -1103,7 +1290,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
  #include "atomic_template.h"
  #endif
  
-#ifdef CONFIG_ATOMIC128
+#if HAVE_CMPXCHG128 || HAVE_ATOMIC128
  #define DATA_SIZE 16
  #include "atomic_template.h"
  #endif