Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20190903' into staging

[mirror_qemu.git] / exec.c
diff --git a/exec.c b/exec.c

index 2646207661d489e2aacd4bd75de4747729a03940..235d6bc88323432899f4f10b7f292548facc4df0 100644 (file)
--- a/exec.c
+++ b/exec.c
@@ -16,7 +16,9 @@
   * You should have received a copy of the GNU Lesser General Public
   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
   */
+
  #include "qemu/osdep.h"
+#include "qemu-common.h"
  #include "qapi/error.h"
  
  #include "qemu/cutils.h"
@@ -32,6 +34,7 @@
  #endif
  #include "sysemu/kvm.h"
  #include "sysemu/sysemu.h"
+#include "sysemu/tcg.h"
  #include "qemu/timer.h"
  #include "qemu/config-file.h"
  #include "qemu/error-report.h"
@@ -39,11 +42,10 @@
  #if defined(CONFIG_USER_ONLY)
  #include "qemu.h"
  #else /* !CONFIG_USER_ONLY */
-#include "hw/hw.h"
  #include "exec/memory.h"
  #include "exec/ioport.h"
  #include "sysemu/dma.h"
-#include "sysemu/numa.h"
+#include "sysemu/hostmem.h"
  #include "sysemu/hw_accel.h"
  #include "exec/address-spaces.h"
  #include "sysemu/xen-mapcache.h"
@@ -191,14 +193,12 @@ typedef struct subpage_t {
  #define PHYS_SECTION_UNASSIGNED 0
  #define PHYS_SECTION_NOTDIRTY 1
  #define PHYS_SECTION_ROM 2
-#define PHYS_SECTION_WATCH 3
  
  static void io_mem_init(void);
  static void memory_map_init(void);
+static void tcg_log_global_after_sync(MemoryListener *listener);
  static void tcg_commit(MemoryListener *listener);
  
-static MemoryRegion io_mem_watch;
-
  /**
   * CPUAddressSpace: all the information a CPU needs about an AddressSpace
   * @cpu: the CPU whose AddressSpace this is
@@ -903,6 +903,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx,
      newas->cpu = cpu;
      newas->as = as;
      if (tcg_enabled()) {
+        newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
          newas->tcg_as_listener.commit = tcg_commit;
          memory_listener_register(&newas->tcg_as_listener, as);
      }
@@ -935,7 +936,7 @@ void cpu_exec_unrealizefn(CPUState *cpu)
  Property cpu_common_props[] = {
  #ifndef CONFIG_USER_ONLY
      /* Create a memory property for softmmu CPU object,
-     * so users can wire up its memory. (This can't go in qom/cpu.c
+     * so users can wire up its memory. (This can't go in hw/core/cpu.c
       * because that file is compiled only once for both user-mode
       * and system builds.) The default if no link is set up is to use
       * the system address space.
@@ -983,14 +984,18 @@ void cpu_exec_realizefn(CPUState *cpu, Error **errp)
  #endif
  }
  
-const char *parse_cpu_model(const char *cpu_model)
+const char *parse_cpu_option(const char *cpu_option)
  {
      ObjectClass *oc;
      CPUClass *cc;
      gchar **model_pieces;
      const char *cpu_type;
  
-    model_pieces = g_strsplit(cpu_model, ",", 2);
+    model_pieces = g_strsplit(cpu_option, ",", 2);
+    if (!model_pieces[0]) {
+        error_report("-cpu option cannot be empty");
+        exit(1);
+    }
  
      oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]);
      if (oc == NULL) {
@@ -1054,28 +1059,7 @@ static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
  }
  #endif
  
-#if defined(CONFIG_USER_ONLY)
-void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
-
-{
-}
-
-int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len,
-                          int flags)
-{
-    return -ENOSYS;
-}
-
-void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint)
-{
-}
-
-int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
-                          int flags, CPUWatchpoint **watchpoint)
-{
-    return -ENOSYS;
-}
-#else
+#ifndef CONFIG_USER_ONLY
  /* Add a watchpoint.  */
  int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len,
                            int flags, CPUWatchpoint **watchpoint)
@@ -1151,9 +1135,8 @@ void cpu_watchpoint_remove_all(CPUState *cpu, int mask)
   * partially or completely with the address range covered by the
   * access).
   */
-static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
-                                                  vaddr addr,
-                                                  vaddr len)
+static inline bool watchpoint_address_matches(CPUWatchpoint *wp,
+                                              vaddr addr, vaddr len)
  {
      /* We know the lengths are non-zero, but a little caution is
       * required to avoid errors in the case where the range ends
@@ -1166,7 +1149,20 @@ static inline bool cpu_watchpoint_address_matches(CPUWatchpoint *wp,
      return !(addr > wpend || wp->vaddr > addrend);
  }
  
-#endif
+/* Return the ORed flags of all watchpoints overlapping @len bytes at @addr. */
+int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len)
+{
+    CPUWatchpoint *wp;
+    int ret = 0;                /* stays 0 if no watchpoint overlaps */
+
+    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
+        if (watchpoint_address_matches(wp, addr, len)) { /* use caller's len */
+            ret |= wp->flags;
+        }
+    }
+    return ret;
+}
+#endif /* !CONFIG_USER_ONLY */
  
  /* Add a breakpoint.  */
  int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags,
@@ -1351,6 +1347,8 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
      DirtyMemoryBlocks *blocks;
      unsigned long end, page;
      bool dirty = false;
+    RAMBlock *ramblock;
+    uint64_t mr_offset, mr_size;
  
      if (length == 0) {
          return false;
@@ -1362,6 +1360,10 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
      rcu_read_lock();
  
      blocks = atomic_rcu_read(&ram_list.dirty_memory[client]);
+    ramblock = qemu_get_ram_block(start);
+    /* Range sanity check on the ramblock */
+    assert(start >= ramblock->offset &&
+           start + length <= ramblock->offset + ramblock->used_length);
  
      while (page < end) {
          unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
@@ -1373,6 +1375,10 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
          page += num;
      }
  
+    mr_offset = (ram_addr_t)(page << TARGET_PAGE_BITS) - ramblock->offset;
+    mr_size = (end - page) << TARGET_PAGE_BITS;
+    memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
+
      rcu_read_unlock();
  
      if (dirty && tcg_enabled()) {
@@ -1383,9 +1389,10 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
  }
  
  DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
-     (ram_addr_t start, ram_addr_t length, unsigned client)
+    (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
  {
      DirtyMemoryBlocks *blocks;
+    ram_addr_t start = memory_region_get_ram_addr(mr) + offset;
      unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
      ram_addr_t first = QEMU_ALIGN_DOWN(start, align);
      ram_addr_t last  = QEMU_ALIGN_UP(start + length, align);
@@ -1427,6 +1434,8 @@ DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
          tlb_reset_dirty_range_all(start, length);
      }
  
+    memory_region_clear_dirty_bitmap(mr, offset, length);
+
      return snap;
  }
  
@@ -1460,7 +1469,6 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu,
                                         target_ulong *address)
  {
      hwaddr iotlb;
-    CPUWatchpoint *wp;
  
      if (memory_region_is_ram(section->mr)) {
          /* Normal RAM.  */
@@ -1478,19 +1486,6 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu,
          iotlb += xlat;
      }
  
-    /* Make accesses to pages with watchpoints go via the
-       watchpoint trap routines.  */
-    QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
-        if (cpu_watchpoint_address_matches(wp, vaddr, TARGET_PAGE_SIZE)) {
-            /* Avoid trapping reads of pages with a write breakpoint. */
-            if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
-                iotlb = PHYS_SECTION_WATCH + paddr;
-                *address |= TLB_MMIO;
-                break;
-            }
-        }
-    }
-
      return iotlb;
  }
  #endif /* defined(CONFIG_USER_ONLY) */
@@ -1688,7 +1683,7 @@ void ram_block_dump(Monitor *mon)
   * when we actually open and map them.  Iterate over the file
   * descriptors instead, and use qemu_fd_getpagesize().
   */
-static int find_max_supported_pagesize(Object *obj, void *opaque)
+static int find_min_backend_pagesize(Object *obj, void *opaque)
  {
      long *hpsize_min = opaque;
  
@@ -1704,11 +1699,32 @@ static int find_max_supported_pagesize(Object *obj, void *opaque)
      return 0;
  }
  
-long qemu_getrampagesize(void)
+/* Child visitor: raise *opaque to the largest mapped backend page size. */
+static int find_max_backend_pagesize(Object *obj, void *opaque)
+{
+    long *largest = opaque;
+
+    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
+        HostMemoryBackend *mb = MEMORY_BACKEND(obj);
+        long cur = host_memory_backend_pagesize(mb);
+
+        if (host_memory_backend_is_mapped(mb) && cur > *largest) {
+            *largest = cur;
+        }
+    }
+    return 0;
+}
+
+/*
+ * TODO: We assume right now that all mapped host memory backends are
+ * used as RAM, however some might be used for different purposes.
+ */
+long qemu_minrampagesize(void)
  {
      long hpsize = LONG_MAX;
      long mainrampagesize;
      Object *memdev_root;
+    MachineState *ms = MACHINE(qdev_get_machine());
  
      mainrampagesize = qemu_mempath_getpagesize(mem_path);
  
@@ -1724,7 +1740,7 @@ long qemu_getrampagesize(void)
       */
      memdev_root = object_resolve_path("/objects", NULL);
      if (memdev_root) {
-        object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
+        object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
      }
      if (hpsize == LONG_MAX) {
          /* No additional memory regions found ==> Report main RAM page size */
@@ -1736,7 +1752,9 @@ long qemu_getrampagesize(void)
       * so if its page size is smaller we have got to report that size instead.
       */
      if (hpsize > mainrampagesize &&
-        (nb_numa_nodes == 0 || numa_info[0].node_memdev == NULL)) {
+        (ms->numa_state == NULL ||
+         ms->numa_state->num_nodes == 0 ||
+         ms->numa_state->nodes[0].node_memdev == NULL)) {
          static bool warned;
          if (!warned) {
              error_report("Huge page support disabled (n/a for main memory).");
@@ -1747,8 +1765,24 @@ long qemu_getrampagesize(void)
  
      return hpsize;
  }
+
+/* Max of the mem_path page size and every mapped backend under /objects. */
+long qemu_maxrampagesize(void)
+{
+    long max_size = qemu_mempath_getpagesize(mem_path);
+    Object *objs = object_resolve_path("/objects", NULL);
+
+    if (objs != NULL) {
+        object_child_foreach(objs, find_max_backend_pagesize, &max_size);
+    }
+    return max_size;
+}
  #else
-long qemu_getrampagesize(void)
+long qemu_minrampagesize(void)
+{
+    return getpagesize();
+}
+long qemu_maxrampagesize(void)
  {
      return getpagesize();
  }
@@ -1831,6 +1865,7 @@ static void *file_ram_alloc(RAMBlock *block,
                              bool truncate,
                              Error **errp)
  {
+    MachineState *ms = MACHINE(qdev_get_machine());
      void *area;
  
      block->page_size = qemu_fd_getpagesize(fd);
@@ -1879,7 +1914,7 @@ static void *file_ram_alloc(RAMBlock *block,
      }
  
      area = qemu_ram_mmap(fd, memory, block->mr->align,
-                         block->flags & RAM_SHARED);
+                         block->flags & RAM_SHARED, block->flags & RAM_PMEM);
      if (area == MAP_FAILED) {
          error_setg_errno(errp, errno,
                           "unable to map backing store for guest RAM");
@@ -1887,7 +1922,7 @@ static void *file_ram_alloc(RAMBlock *block,
      }
  
      if (mem_prealloc) {
-        os_mem_prealloc(fd, area, memory, smp_cpus, errp);
+        os_mem_prealloc(fd, area, memory, ms->smp.cpus, errp);
          if (errp && *errp) {
              qemu_ram_munmap(fd, area, memory);
              return NULL;
@@ -2753,32 +2788,35 @@ static const MemoryRegionOps notdirty_mem_ops = {
  };
  
  /* Generate a debug exception if a watchpoint has been hit.  */
-static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
+void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len,
+                          MemTxAttrs attrs, int flags, uintptr_t ra)
  {
-    CPUState *cpu = current_cpu;
      CPUClass *cc = CPU_GET_CLASS(cpu);
-    target_ulong vaddr;
      CPUWatchpoint *wp;
  
      assert(tcg_enabled());
      if (cpu->watchpoint_hit) {
-        /* We re-entered the check after replacing the TB. Now raise
-         * the debug interrupt so that is will trigger after the
-         * current instruction. */
+        /*
+         * We re-entered the check after replacing the TB.
+         * Now raise the debug interrupt so that it will
+         * trigger after the current instruction.
+         */
+        qemu_mutex_lock_iothread();
          cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG);
+        qemu_mutex_unlock_iothread();
          return;
      }
-    vaddr = (cpu->mem_io_vaddr & TARGET_PAGE_MASK) + offset;
-    vaddr = cc->adjust_watchpoint_address(cpu, vaddr, len);
+
+    addr = cc->adjust_watchpoint_address(cpu, addr, len);
      QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
-        if (cpu_watchpoint_address_matches(wp, vaddr, len)
+        if (watchpoint_address_matches(wp, addr, len)
              && (wp->flags & flags)) {
              if (flags == BP_MEM_READ) {
                  wp->flags |= BP_WATCHPOINT_HIT_READ;
              } else {
                  wp->flags |= BP_WATCHPOINT_HIT_WRITE;
              }
-            wp->hitaddr = vaddr;
+            wp->hitaddr = MAX(addr, wp->vaddr);
              wp->hitattrs = attrs;
              if (!cpu->watchpoint_hit) {
                  if (wp->flags & BP_CPU &&
@@ -2793,11 +2831,14 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
                  if (wp->flags & BP_STOP_BEFORE_ACCESS) {
                      cpu->exception_index = EXCP_DEBUG;
                      mmap_unlock();
-                    cpu_loop_exit(cpu);
+                    cpu_loop_exit_restore(cpu, ra);
                  } else {
                      /* Force execution of one insn next time.  */
                      cpu->cflags_next_tb = 1 | curr_cflags();
                      mmap_unlock();
+                    if (ra) {
+                        cpu_restore_state(cpu, ra, true);
+                    }
                      cpu_loop_exit_noexc(cpu);
                  }
              }
@@ -2807,80 +2848,6 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
      }
  }
  
-/* Watchpoint access routines.  Watchpoints are inserted using TLB tricks,
-   so these check for a hit then pass through to the normal out-of-line
-   phys routines.  */
-static MemTxResult watch_mem_read(void *opaque, hwaddr addr, uint64_t *pdata,
-                                  unsigned size, MemTxAttrs attrs)
-{
-    MemTxResult res;
-    uint64_t data;
-    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
-    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
-
-    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_READ);
-    switch (size) {
-    case 1:
-        data = address_space_ldub(as, addr, attrs, &res);
-        break;
-    case 2:
-        data = address_space_lduw(as, addr, attrs, &res);
-        break;
-    case 4:
-        data = address_space_ldl(as, addr, attrs, &res);
-        break;
-    case 8:
-        data = address_space_ldq(as, addr, attrs, &res);
-        break;
-    default: abort();
-    }
-    *pdata = data;
-    return res;
-}
-
-static MemTxResult watch_mem_write(void *opaque, hwaddr addr,
-                                   uint64_t val, unsigned size,
-                                   MemTxAttrs attrs)
-{
-    MemTxResult res;
-    int asidx = cpu_asidx_from_attrs(current_cpu, attrs);
-    AddressSpace *as = current_cpu->cpu_ases[asidx].as;
-
-    check_watchpoint(addr & ~TARGET_PAGE_MASK, size, attrs, BP_MEM_WRITE);
-    switch (size) {
-    case 1:
-        address_space_stb(as, addr, val, attrs, &res);
-        break;
-    case 2:
-        address_space_stw(as, addr, val, attrs, &res);
-        break;
-    case 4:
-        address_space_stl(as, addr, val, attrs, &res);
-        break;
-    case 8:
-        address_space_stq(as, addr, val, attrs, &res);
-        break;
-    default: abort();
-    }
-    return res;
-}
-
-static const MemoryRegionOps watch_mem_ops = {
-    .read_with_attrs = watch_mem_read,
-    .write_with_attrs = watch_mem_write,
-    .endianness = DEVICE_NATIVE_ENDIAN,
-    .valid = {
-        .min_access_size = 1,
-        .max_access_size = 8,
-        .unaligned = false,
-    },
-    .impl = {
-        .min_access_size = 1,
-        .max_access_size = 8,
-        .unaligned = false,
-    },
-};
-
  static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
                                   MemTxAttrs attrs, uint8_t *buf, hwaddr len);
  static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
@@ -3056,9 +3023,6 @@ static void io_mem_init(void)
      memory_region_init_io(&io_mem_notdirty, NULL, &notdirty_mem_ops, NULL,
                            NULL, UINT64_MAX);
      memory_region_clear_global_locking(&io_mem_notdirty);
-
-    memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
-                          NULL, UINT64_MAX);
  }
  
  AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
@@ -3072,8 +3036,6 @@ AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
      assert(n == PHYS_SECTION_NOTDIRTY);
      n = dummy_section(&d->map, fv, &io_mem_rom);
      assert(n == PHYS_SECTION_ROM);
-    n = dummy_section(&d->map, fv, &io_mem_watch);
-    assert(n == PHYS_SECTION_WATCH);
  
      d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
  
@@ -3086,6 +3048,35 @@ void address_space_dispatch_free(AddressSpaceDispatch *d)
      g_free(d);
  }
  
+/* No-op work item; queued by tcg_log_global_after_sync() purely to wait. */
+static void do_nothing(CPUState *cpu, run_on_cpu_data d)
+{
+}
+
+static void tcg_log_global_after_sync(MemoryListener *listener)
+{
+    CPUAddressSpace *cpuas;
+
+    /* Wait for the CPU to end the current TB.  This avoids the following race:
+     *
+     *      vCPU                         migration
+     *      ----------------------       -------------------------
+     *      TLB check -> slow path
+     *        notdirty_mem_write
+     *          write to RAM
+     *          mark dirty
+     *                                   clear dirty flag
+     *      TLB check -> fast path
+     *                                   read memory
+     *        write to RAM
+     *
+     * by pushing the migration thread's memory read after the vCPU thread has
+     * written the memory.
+     */
+    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
+    run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
+}
+
  static void tcg_commit(MemoryListener *listener)
  {
      CPUAddressSpace *cpuas;
@@ -3276,8 +3267,9 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
              l = memory_access_size(mr, l, addr1);
              /* XXX: could force current_cpu to NULL to avoid
                 potential bugs */
-            val = ldn_p(buf, l);
-            result |= memory_region_dispatch_write(mr, addr1, val, l, attrs);
+            val = ldn_he_p(buf, l);
+            result |= memory_region_dispatch_write(mr, addr1, val,
+                                                   size_memop(l), attrs);
          } else {
              /* RAM case */
              ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
@@ -3338,8 +3330,9 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
              /* I/O case */
              release_lock |= prepare_mmio_access(mr);
              l = memory_access_size(mr, l, addr1);
-            result |= memory_region_dispatch_read(mr, addr1, &val, l, attrs);
-            stn_p(buf, l, val);
+            result |= memory_region_dispatch_read(mr, addr1, &val,
+                                                  size_memop(l), attrs);
+            stn_he_p(buf, l, val);
          } else {
              /* RAM case */
              ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);