tcg: Use tlb_fill probe from tlb_vaddr_to_host

[mirror_qemu.git] / accel / tcg / cputlb.c
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c

index e60628c350e4d3616b0e53fe02a6826b9d6c3b54..685e0f2ee4e0965b4120b8d47a5d63cdd9e2a890 100644 (file)
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -6,7 +6,7 @@
   * This library is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
   *
   * This library is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -74,11 +74,176 @@ QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
  QEMU_BUILD_BUG_ON(NB_MMU_MODES > 16);
  #define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
  
+static inline size_t sizeof_tlb(CPUArchState *env, uintptr_t mmu_idx)
+{
+    return env->tlb_mask[mmu_idx] + (1 << CPU_TLB_ENTRY_BITS);
+}
+
+static void tlb_window_reset(CPUTLBWindow *window, int64_t ns,
+                             size_t max_entries)
+{
+    window->begin_ns = ns;
+    window->max_entries = max_entries;
+}
+
+static void tlb_dyn_init(CPUArchState *env)
+{
+    int i;
+
+    for (i = 0; i < NB_MMU_MODES; i++) {
+        CPUTLBDesc *desc = &env->tlb_d[i];
+        size_t n_entries = 1 << CPU_TLB_DYN_DEFAULT_BITS;
+
+        tlb_window_reset(&desc->window, get_clock_realtime(), 0);
+        desc->n_used_entries = 0;
+        env->tlb_mask[i] = (n_entries - 1) << CPU_TLB_ENTRY_BITS;
+        env->tlb_table[i] = g_new(CPUTLBEntry, n_entries);
+        env->iotlb[i] = g_new(CPUIOTLBEntry, n_entries);
+    }
+}
+
+/**
+ * tlb_mmu_resize_locked() - perform TLB resize bookkeeping; resize if necessary
+ * @env: CPU that owns the TLB
+ * @mmu_idx: MMU index of the TLB
+ *
+ * Called with tlb_lock_held.
+ *
+ * We have two main constraints when resizing a TLB: (1) we only resize it
+ * on a TLB flush (otherwise we'd have to take a perf hit by either rehashing
+ * the array or unnecessarily flushing it), which means we do not control how
+ * frequently the resizing can occur; (2) we don't have access to the guest's
+ * future scheduling decisions, and therefore have to decide the magnitude of
+ * the resize based on past observations.
+ *
+ * In general, a memory-hungry process can benefit greatly from an appropriately
+ * sized TLB, since a guest TLB miss is very expensive. This doesn't mean that
+ * we just have to make the TLB as large as possible; while an oversized TLB
+ * results in minimal TLB miss rates, it also takes longer to be flushed
+ * (flushes can be _very_ frequent), and the reduced locality can also hurt
+ * performance.
+ *
+ * To achieve near-optimal performance for all kinds of workloads, we:
+ *
+ * 1. Aggressively increase the size of the TLB when the use rate of the
+ * TLB being flushed is high, since it is likely that in the near future this
+ * memory-hungry process will execute again, and its memory hungriness will
+ * probably be similar.
+ *
+ * 2. Slowly reduce the size of the TLB as the use rate declines over a
+ * reasonably large time window. The rationale is that if in such a time window
+ * we have not observed a high TLB use rate, it is likely that we won't observe
+ * it in the near future. In that case, once a time window expires we downsize
+ * the TLB to match the maximum use rate observed in the window.
+ *
+ * 3. Try to keep the maximum use rate in a time window in the 30-70% range,
+ * since in that range performance is likely near-optimal. Recall that the TLB
+ * is direct mapped, so we want the use rate to be low (or at least not too
+ * high), since otherwise we are likely to have a significant amount of
+ * conflict misses.
+ */
+static void tlb_mmu_resize_locked(CPUArchState *env, int mmu_idx)
+{
+    CPUTLBDesc *desc = &env->tlb_d[mmu_idx];
+    size_t old_size = tlb_n_entries(env, mmu_idx);
+    size_t rate;
+    size_t new_size = old_size;
+    int64_t now = get_clock_realtime();
+    int64_t window_len_ms = 100;
+    int64_t window_len_ns = window_len_ms * 1000 * 1000;
+    bool window_expired = now > desc->window.begin_ns + window_len_ns;
+
+    if (desc->n_used_entries > desc->window.max_entries) {
+        desc->window.max_entries = desc->n_used_entries;
+    }
+    rate = desc->window.max_entries * 100 / old_size;
+
+    if (rate > 70) {
+        new_size = MIN(old_size << 1, 1 << CPU_TLB_DYN_MAX_BITS);
+    } else if (rate < 30 && window_expired) {
+        size_t ceil = pow2ceil(desc->window.max_entries);
+        size_t expected_rate = desc->window.max_entries * 100 / ceil;
+
+        /*
+         * Avoid undersizing when the max number of entries seen is just below
+         * a pow2. For instance, if max_entries == 1025, the expected use rate
+         * would be 1025/2048==50%. However, if max_entries == 1023, we'd get
+         * 1023/1024==99.9% use rate, so we'd likely end up doubling the size
+         * later. Thus, make sure that the expected use rate remains below 70%.
+         * (and since we double the size, that means the lowest rate we'd
+         * expect to get is 35%, which is still in the 30-70% range where
+         * we consider that the size is appropriate.)
+         */
+        if (expected_rate > 70) {
+            ceil *= 2;
+        }
+        new_size = MAX(ceil, 1 << CPU_TLB_DYN_MIN_BITS);
+    }
+
+    if (new_size == old_size) {
+        if (window_expired) {
+            tlb_window_reset(&desc->window, now, desc->n_used_entries);
+        }
+        return;
+    }
+
+    g_free(env->tlb_table[mmu_idx]);
+    g_free(env->iotlb[mmu_idx]);
+
+    tlb_window_reset(&desc->window, now, 0);
+    /* desc->n_used_entries is cleared by the caller */
+    env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
+    env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
+    env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
+    /*
+     * If the allocations fail, try smaller sizes. We just freed some
+     * memory, so going back to half of new_size has a good chance of working.
+     * Increased memory pressure elsewhere in the system might cause the
+     * allocations to fail though, so we progressively reduce the allocation
+     * size, aborting if we cannot even allocate the smallest TLB we support.
+     */
+    while (env->tlb_table[mmu_idx] == NULL || env->iotlb[mmu_idx] == NULL) {
+        if (new_size == (1 << CPU_TLB_DYN_MIN_BITS)) {
+            error_report("%s: %s", __func__, strerror(errno));
+            abort();
+        }
+        new_size = MAX(new_size >> 1, 1 << CPU_TLB_DYN_MIN_BITS);
+        env->tlb_mask[mmu_idx] = (new_size - 1) << CPU_TLB_ENTRY_BITS;
+
+        g_free(env->tlb_table[mmu_idx]);
+        g_free(env->iotlb[mmu_idx]);
+        env->tlb_table[mmu_idx] = g_try_new(CPUTLBEntry, new_size);
+        env->iotlb[mmu_idx] = g_try_new(CPUIOTLBEntry, new_size);
+    }
+}
+
+static inline void tlb_table_flush_by_mmuidx(CPUArchState *env, int mmu_idx)
+{
+    tlb_mmu_resize_locked(env, mmu_idx);
+    memset(env->tlb_table[mmu_idx], -1, sizeof_tlb(env, mmu_idx));
+    env->tlb_d[mmu_idx].n_used_entries = 0;
+}
+
+static inline void tlb_n_used_entries_inc(CPUArchState *env, uintptr_t mmu_idx)
+{
+    env->tlb_d[mmu_idx].n_used_entries++;
+}
+
+static inline void tlb_n_used_entries_dec(CPUArchState *env, uintptr_t mmu_idx)
+{
+    env->tlb_d[mmu_idx].n_used_entries--;
+}
+
  void tlb_init(CPUState *cpu)
  {
      CPUArchState *env = cpu->env_ptr;
  
      qemu_spin_init(&env->tlb_c.lock);
+
+    /* Ensure that cpu_reset performs a full flush.  */
+    env->tlb_c.dirty = ALL_MMUIDX_BITS;
+
+    tlb_dyn_init(env);
  }
  
  /* flush_all_helper: run fn across all cpus
@@ -119,7 +284,7 @@ void tlb_flush_counts(size_t *pfull, size_t *ppart, size_t *pelide)
  
  static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
  {
-    memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+    tlb_table_flush_by_mmuidx(env, mmu_idx);
      memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
      env->tlb_d[mmu_idx].large_page_addr = -1;
      env->tlb_d[mmu_idx].large_page_mask = -1;
@@ -129,31 +294,40 @@ static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
  static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
  {
      CPUArchState *env = cpu->env_ptr;
-    unsigned long mmu_idx_bitmask = data.host_int;
-    int mmu_idx;
+    uint16_t asked = data.host_int;
+    uint16_t all_dirty, work, to_clean;
  
      assert_cpu_is_self(cpu);
  
-    tlb_debug("mmu_idx:0x%04lx\n", mmu_idx_bitmask);
+    tlb_debug("mmu_idx:0x%04" PRIx16 "\n", asked);
  
      qemu_spin_lock(&env->tlb_c.lock);
-    env->tlb_c.pending_flush &= ~mmu_idx_bitmask;
  
-    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
-            tlb_flush_one_mmuidx_locked(env, mmu_idx);
-        }
+    all_dirty = env->tlb_c.dirty;
+    to_clean = asked & all_dirty;
+    all_dirty &= ~to_clean;
+    env->tlb_c.dirty = all_dirty;
+
+    for (work = to_clean; work != 0; work &= work - 1) {
+        int mmu_idx = ctz32(work);
+        tlb_flush_one_mmuidx_locked(env, mmu_idx);
      }
+
      qemu_spin_unlock(&env->tlb_c.lock);
  
      cpu_tb_jmp_cache_clear(cpu);
  
-    if (mmu_idx_bitmask == ALL_MMUIDX_BITS) {
+    if (to_clean == ALL_MMUIDX_BITS) {
          atomic_set(&env->tlb_c.full_flush_count,
                     env->tlb_c.full_flush_count + 1);
      } else {
          atomic_set(&env->tlb_c.part_flush_count,
-                   env->tlb_c.part_flush_count + ctpop16(mmu_idx_bitmask));
+                   env->tlb_c.part_flush_count + ctpop16(to_clean));
+        if (to_clean != asked) {
+            atomic_set(&env->tlb_c.elide_flush_count,
+                       env->tlb_c.elide_flush_count +
+                       ctpop16(asked & ~to_clean));
+        }
      }
  }
  
@@ -162,20 +336,8 @@ void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
      tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
  
      if (cpu->created && !qemu_cpu_is_self(cpu)) {
-        CPUArchState *env = cpu->env_ptr;
-        uint16_t pending, to_clean;
-
-        qemu_spin_lock(&env->tlb_c.lock);
-        pending = env->tlb_c.pending_flush;
-        to_clean = idxmap & ~pending;
-        env->tlb_c.pending_flush = pending | idxmap;
-        qemu_spin_unlock(&env->tlb_c.lock);
-
-        if (to_clean) {
-            tlb_debug("reduced mmu_idx: 0x%" PRIx16 "\n", to_clean);
-            async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
-                             RUN_ON_CPU_HOST_INT(to_clean));
-        }
+        async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
+                         RUN_ON_CPU_HOST_INT(idxmap));
      } else {
          tlb_flush_by_mmuidx_async_work(cpu, RUN_ON_CPU_HOST_INT(idxmap));
      }
@@ -224,13 +386,24 @@ static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
             tlb_hit_page(tlb_entry->addr_code, page);
  }
  
+/**
+ * tlb_entry_is_empty - return true if the entry is not in use
+ * @te: pointer to CPUTLBEntry
+ */
+static inline bool tlb_entry_is_empty(const CPUTLBEntry *te)
+{
+    return te->addr_read == -1 && te->addr_write == -1 && te->addr_code == -1;
+}
+
  /* Called with tlb_c.lock held */
-static inline void tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
+static inline bool tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
                                            target_ulong page)
  {
      if (tlb_hit_page_anyprot(tlb_entry, page)) {
          memset(tlb_entry, -1, sizeof(*tlb_entry));
+        return true;
      }
+    return false;
  }
  
  /* Called with tlb_c.lock held */
@@ -241,7 +414,9 @@ static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,
  
      assert_cpu_is_self(ENV_GET_CPU(env));
      for (k = 0; k < CPU_VTLB_SIZE; k++) {
-        tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page);
+        if (tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page)) {
+            tlb_n_used_entries_dec(env, mmu_idx);
+        }
      }
  }
  
@@ -258,7 +433,9 @@ static void tlb_flush_page_locked(CPUArchState *env, int midx,
                    midx, lp_addr, lp_mask);
          tlb_flush_one_mmuidx_locked(env, midx);
      } else {
-        tlb_flush_entry_locked(tlb_entry(env, midx, page), page);
+        if (tlb_flush_entry_locked(tlb_entry(env, midx, page), page)) {
+            tlb_n_used_entries_dec(env, midx);
+        }
          tlb_flush_vtlb_page_locked(env, midx, page);
      }
  }
@@ -435,8 +612,9 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
      qemu_spin_lock(&env->tlb_c.lock);
      for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
          unsigned int i;
+        unsigned int n = tlb_n_entries(env, mmu_idx);
  
-        for (i = 0; i < CPU_TLB_SIZE; i++) {
+        for (i = 0; i < n; i++) {
              tlb_reset_dirty_range_locked(&env->tlb_table[mmu_idx][i], start1,
                                           length);
          }
@@ -581,6 +759,9 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
       */
      qemu_spin_lock(&env->tlb_c.lock);
  
+    /* Note that the tlb is no longer clean.  */
+    env->tlb_c.dirty |= 1 << mmu_idx;
+
      /* Make sure there's no cached translation for the new page.  */
      tlb_flush_vtlb_page_locked(env, mmu_idx, vaddr_page);
  
@@ -588,13 +769,14 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
       * Only evict the old entry to the victim tlb if it's for a
       * different page; otherwise just overwrite the stale data.
       */
-    if (!tlb_hit_page_anyprot(te, vaddr_page)) {
+    if (!tlb_hit_page_anyprot(te, vaddr_page) && !tlb_entry_is_empty(te)) {
          unsigned vidx = env->tlb_d[mmu_idx].vindex++ % CPU_VTLB_SIZE;
          CPUTLBEntry *tv = &env->tlb_v_table[mmu_idx][vidx];
  
          /* Evict the old entry into the victim tlb.  */
          copy_tlb_helper_locked(tv, te);
          env->iotlb_v[mmu_idx][vidx] = env->iotlb[mmu_idx][index];
+        tlb_n_used_entries_dec(env, mmu_idx);
      }
  
      /* refill the tlb */
@@ -646,6 +828,7 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
      }
  
      copy_tlb_helper_locked(te, &tn);
+    tlb_n_used_entries_inc(env, mmu_idx);
      qemu_spin_unlock(&env->tlb_c.lock);
  }
  
@@ -672,6 +855,25 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
      return ram_addr;
  }
  
+/*
+ * Note: tlb_fill() can trigger a resize of the TLB. This means that all of the
+ * caller's prior references to the TLB table (e.g. CPUTLBEntry pointers) must
+ * be discarded and looked up again (e.g. via tlb_entry()).
+ */
+static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
+                     MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
+{
+    CPUClass *cc = CPU_GET_CLASS(cpu);
+    bool ok;
+
+    /*
+     * This is not a probe, so only valid return is success; failure
+     * should result in exception + longjmp to the cpu loop.
+     */
+    ok = cc->tlb_fill(cpu, addr, size, access_type, mmu_idx, false, retaddr);
+    assert(ok);
+}
+
  static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
                           int mmu_idx,
                           target_ulong addr, uintptr_t retaddr,
@@ -695,10 +897,11 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
          CPUTLBEntry *entry;
          target_ulong tlb_addr;
  
-        tlb_fill(cpu, addr, size, MMU_DATA_LOAD, mmu_idx, retaddr);
+        tlb_fill(cpu, addr, size, access_type, mmu_idx, retaddr);
  
          entry = tlb_entry(env, mmu_idx, addr);
-        tlb_addr = entry->addr_read;
+        tlb_addr = (access_type == MMU_DATA_LOAD ?
+                    entry->addr_read : entry->addr_code);
          if (!(tlb_addr & ~(TARGET_PAGE_MASK | TLB_RECHECK))) {
              /* RAM access */
              uintptr_t haddr = addr + entry->addend;
@@ -804,6 +1007,16 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
      }
  }
  
+static inline target_ulong tlb_read_ofs(CPUTLBEntry *entry, size_t ofs)
+{
+#if TCG_OVERSIZED_GUEST
+    return *(target_ulong *)((uintptr_t)entry + ofs);
+#else
+    /* ofs might correspond to .addr_write, so use atomic_read */
+    return atomic_read((target_ulong *)((uintptr_t)entry + ofs));
+#endif
+}
+
  /* Return true if ADDR is present in the victim tlb, and has been copied
     back to the main tlb.  */
  static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
@@ -814,14 +1027,7 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
      assert_cpu_is_self(ENV_GET_CPU(env));
      for (vidx = 0; vidx < CPU_VTLB_SIZE; ++vidx) {
          CPUTLBEntry *vtlb = &env->tlb_v_table[mmu_idx][vidx];
-        target_ulong cmp;
-
-        /* elt_ofs might correspond to .addr_write, so use atomic_read */
-#if TCG_OVERSIZED_GUEST
-        cmp = *(target_ulong *)((uintptr_t)vtlb + elt_ofs);
-#else
-        cmp = atomic_read((target_ulong *)((uintptr_t)vtlb + elt_ofs));
-#endif
+        target_ulong cmp = tlb_read_ofs(vtlb, elt_ofs);
  
          if (cmp == page) {
              /* Found entry in victim tlb, swap tlb and iotlb.  */
@@ -862,6 +1068,8 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
      if (unlikely(!tlb_hit(entry->addr_code, addr))) {
          if (!VICTIM_TLB_HIT(addr_code, addr)) {
              tlb_fill(ENV_GET_CPU(env), addr, 0, MMU_INST_FETCH, mmu_idx, 0);
+            index = tlb_index(env, mmu_idx, addr);
+            entry = tlb_entry(env, mmu_idx, addr);
          }
          assert(tlb_hit(entry->addr_code, addr));
      }
@@ -903,6 +1111,56 @@ void probe_write(CPUArchState *env, target_ulong addr, int size, int mmu_idx,
      }
  }
  
+void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
+                        MMUAccessType access_type, int mmu_idx)
+{
+    CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
+    uintptr_t tlb_addr, page;
+    size_t elt_ofs;
+
+    switch (access_type) {
+    case MMU_DATA_LOAD:
+        elt_ofs = offsetof(CPUTLBEntry, addr_read);
+        break;
+    case MMU_DATA_STORE:
+        elt_ofs = offsetof(CPUTLBEntry, addr_write);
+        break;
+    case MMU_INST_FETCH:
+        elt_ofs = offsetof(CPUTLBEntry, addr_code);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    page = addr & TARGET_PAGE_MASK;
+    tlb_addr = tlb_read_ofs(entry, elt_ofs);
+
+    if (!tlb_hit_page(tlb_addr, page)) {
+        uintptr_t index = tlb_index(env, mmu_idx, addr);
+
+        if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page)) {
+            CPUState *cs = ENV_GET_CPU(env);
+            CPUClass *cc = CPU_GET_CLASS(cs);
+
+            if (!cc->tlb_fill(cs, addr, 0, access_type, mmu_idx, true, 0)) {
+                /* Non-faulting page table read failed.  */
+                return NULL;
+            }
+
+            /* TLB resize via tlb_fill may have moved the entry.  */
+            entry = tlb_entry(env, mmu_idx, addr);
+        }
+        tlb_addr = tlb_read_ofs(entry, elt_ofs);
+    }
+
+    if (tlb_addr & ~TARGET_PAGE_MASK) {
+        /* IO access */
+        return NULL;
+    }
+
+    return (void *)((uintptr_t)addr + entry->addend);
+}
+
  /* Probe for a read-modify-write atomic operation.  Do not allow unaligned
   * operations, or io operations to proceed.  Return the host address.  */
  static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
@@ -942,6 +1200,8 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
          if (!VICTIM_TLB_HIT(addr_write, addr)) {
              tlb_fill(ENV_GET_CPU(env), addr, 1 << s_bits, MMU_DATA_STORE,
                       mmu_idx, retaddr);
+            index = tlb_index(env, mmu_idx, addr);
+            tlbe = tlb_entry(env, mmu_idx, addr);
          }
          tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
      }