migration/block: limit the number of parallel I/O requests

[mirror_qemu.git] / exec.c
diff --git a/exec.c b/exec.c

index 2202f2d73176de6bee246f90456c1d571926ffa3..c09bd93df31e9f43d790f8a2be995e9946eaaeb6 100644 (file)
--- a/exec.c
+++ b/exec.c
@@ -18,8 +18,6 @@
   */
  #include "qemu/osdep.h"
  #include "qapi/error.h"
-#ifndef _WIN32
-#endif
  
  #include "qemu/cutils.h"
  #include "cpu.h"
@@ -51,7 +49,6 @@
  #include "trace-root.h"
  
  #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
-#include <fcntl.h>
  #include <linux/falloc.h>
  #endif
  
@@ -102,6 +99,11 @@ static MemoryRegion io_mem_unassigned;
   */
  #define RAM_RESIZEABLE (1 << 2)
  
+/* UFFDIO_ZEROPAGE is available on this RAMBlock to atomically
+ * zero the page and wake waiting processes.
+ * (Set during postcopy)
+ */
+#define RAM_UF_ZEROPAGE (1 << 3)
  #endif
  
  #ifdef TARGET_PAGE_BITS_VARY
@@ -626,6 +628,13 @@ static int cpu_common_post_load(void *opaque, int version_id)
      cpu->interrupt_request &= ~0x01;
      tlb_flush(cpu);
  
+    /* loadvm has just updated the content of RAM, bypassing the
+     * usual mechanisms that ensure we flush TBs for writes to
+     * memory we've translated code from. So we must flush all TBs,
+     * which will now be stale.
+     */
+    tb_flush(cpu);
+
      return 0;
  }
  
@@ -708,9 +717,17 @@ CPUState *qemu_get_cpu(int index)
  }
  
  #if !defined(CONFIG_USER_ONLY)
-void cpu_address_space_init(CPUState *cpu, AddressSpace *as, int asidx)
+void cpu_address_space_init(CPUState *cpu, int asidx,
+                            const char *prefix, MemoryRegion *mr)
  {
      CPUAddressSpace *newas;
+    AddressSpace *as = g_new0(AddressSpace, 1);
+    char *as_name;
+
+    assert(mr);
+    as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
+    address_space_init(as, mr, as_name);
+    g_free(as_name);
  
      /* Target code should have set num_ases before calling us */
      assert(asidx < cpu->num_ases);
@@ -805,6 +822,29 @@ void cpu_exec_realizefn(CPUState *cpu, Error **errp)
  #endif
  }
  
+/* Resolve a "model[,features]" string to the QOM type name of the CPU class;
+ * the part after the first ',' is handed to the class's parse_features hook.
+ * Exits the process if no class matches the model name.
+ */
+const char *parse_cpu_model(const char *cpu_model)
+{
+    /* Split at most once: model name first, feature list second */
+    gchar **parts = g_strsplit(cpu_model, ",", 2);
+    ObjectClass *klass = cpu_class_by_name(CPU_RESOLVING_TYPE, parts[0]);
+    const char *type_name;
+
+    if (!klass) {
+        error_report("unable to find CPU model '%s'", parts[0]);
+        g_strfreev(parts);
+        exit(EXIT_FAILURE);
+    }
+
+    type_name = object_class_get_name(klass);
+    CPU_CLASS(klass)->parse_features(type_name, parts[1], &error_fatal);
+    g_strfreev(parts);
+    return type_name;
+}
+
  #if defined(CONFIG_USER_ONLY)
  static void breakpoint_invalidate(CPUState *cpu, target_ulong pc)
  {
@@ -1273,7 +1313,7 @@ static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end,
                               uint16_t section);
  static subpage_t *subpage_init(FlatView *fv, hwaddr base);
  
-static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
+static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) =
                                 qemu_anon_ram_alloc;
  
  /*
@@ -1281,7 +1321,7 @@ static void *(*phys_mem_alloc)(size_t size, uint64_t *align) =
   * Accelerators with unusual needs may need this.  Hopefully, we can
   * get rid of it eventually.
   */
-void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align))
+void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared))
  {
      phys_mem_alloc = alloc;
  }
@@ -1600,7 +1640,13 @@ static void *file_ram_alloc(RAMBlock *block,
      void *area;
  
      block->page_size = qemu_fd_getpagesize(fd);
-    block->mr->align = block->page_size;
+    if (block->mr->align % block->page_size) {
+        error_setg(errp, "alignment 0x%" PRIx64
+                   " must be multiples of page size 0x%zx",
+                   block->mr->align, block->page_size);
+        return NULL;
+    }
+    block->mr->align = MAX(block->page_size, block->mr->align);
  #if defined(__s390x__)
      if (kvm_enabled()) {
          block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
@@ -1655,7 +1701,10 @@ static void *file_ram_alloc(RAMBlock *block,
  }
  #endif
  
-/* Called with the ramlist lock held.  */
+/* Allocate space within the ram_addr_t space that governs the
+ * dirty bitmaps.
+ * Called with the ramlist lock held.
+ */
  static ram_addr_t find_ram_offset(ram_addr_t size)
  {
      RAMBlock *block, *next_block;
@@ -1668,19 +1717,33 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
      }
  
      RAMBLOCK_FOREACH(block) {
-        ram_addr_t end, next = RAM_ADDR_MAX;
+        ram_addr_t candidate, next = RAM_ADDR_MAX;
  
-        end = block->offset + block->max_length;
+        /* Align blocks to start on a 'long' in the bitmap
+         * which makes the bitmap sync'ing take the fast path.
+         */
+        candidate = block->offset + block->max_length;
+        candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
  
+        /* Search for the closest following block
+         * and find the gap.
+         */
          RAMBLOCK_FOREACH(next_block) {
-            if (next_block->offset >= end) {
+            if (next_block->offset >= candidate) {
                  next = MIN(next, next_block->offset);
              }
          }
-        if (next - end >= size && next - end < mingap) {
-            offset = end;
-            mingap = next - end;
+
+        /* If it fits, remember our place and the size of the gap,
+         * but keep going so that we might find a smaller gap to
+         * fill, thereby avoiding fragmentation.
+         */
+        if (next - candidate >= size && next - candidate < mingap) {
+            offset = candidate;
+            mingap = next - candidate;
          }
+
+        trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
      }
  
      if (offset == RAM_ADDR_MAX) {
@@ -1689,6 +1752,8 @@ static ram_addr_t find_ram_offset(ram_addr_t size)
          abort();
      }
  
+    trace_find_ram_offset(size, offset);
+
      return offset;
  }
  
@@ -1730,6 +1795,17 @@ bool qemu_ram_is_shared(RAMBlock *rb)
      return rb->flags & RAM_SHARED;
  }
  
+/* Tests RAM_UF_ZEROPAGE on @rb.  Note: only set at the start of postcopy */
+bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
+{
+    return (rb->flags & RAM_UF_ZEROPAGE) != 0;
+}
+
+void qemu_ram_set_uf_zeroable(RAMBlock *rb)
+{
+    rb->flags = rb->flags | RAM_UF_ZEROPAGE; /* other flag bits untouched */
+}
+
  /* Called with iothread lock held.  */
  void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
  {
@@ -1884,7 +1960,7 @@ static void dirty_memory_extend(ram_addr_t old_ram_size,
      }
  }
  
-static void ram_block_add(RAMBlock *new_block, Error **errp)
+static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
  {
      RAMBlock *block;
      RAMBlock *last_block = NULL;
@@ -1907,7 +1983,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp)
              }
          } else {
              new_block->host = phys_mem_alloc(new_block->max_length,
-                                             &new_block->mr->align);
+                                             &new_block->mr->align, shared);
              if (!new_block->host) {
                  error_setg_errno(errp, errno,
                                   "cannot set up guest memory '%s'",
@@ -2012,7 +2088,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
          return NULL;
      }
  
-    ram_block_add(new_block, &local_err);
+    ram_block_add(new_block, &local_err, share);
      if (local_err) {
          g_free(new_block);
          error_propagate(errp, local_err);
@@ -2054,7 +2130,7 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
                                    void (*resized)(const char*,
                                                    uint64_t length,
                                                    void *host),
-                                  void *host, bool resizeable,
+                                  void *host, bool resizeable, bool share,
                                    MemoryRegion *mr, Error **errp)
  {
      RAMBlock *new_block;
@@ -2077,7 +2153,7 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
      if (resizeable) {
          new_block->flags |= RAM_RESIZEABLE;
      }
-    ram_block_add(new_block, &local_err);
+    ram_block_add(new_block, &local_err, share);
      if (local_err) {
          g_free(new_block);
          error_propagate(errp, local_err);
@@ -2089,12 +2165,15 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
  RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                     MemoryRegion *mr, Error **errp)
  {
-    return qemu_ram_alloc_internal(size, size, NULL, host, false, mr, errp);
+    return qemu_ram_alloc_internal(size, size, NULL, host, false,
+                                   false, mr, errp);
  }
  
-RAMBlock *qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr, Error **errp)
+RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share,
+                         MemoryRegion *mr, Error **errp)
  {
-    return qemu_ram_alloc_internal(size, size, NULL, NULL, false, mr, errp);
+    return qemu_ram_alloc_internal(size, size, NULL, NULL, false,
+                                   share, mr, errp);
  }
  
  RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
@@ -2103,7 +2182,8 @@ RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                                       void *host),
                                       MemoryRegion *mr, Error **errp)
  {
-    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, mr, errp);
+    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true,
+                                   false, mr, errp);
  }
  
  static void reclaim_ramblock(RAMBlock *block)
@@ -2179,9 +2259,9 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
                                  flags, -1, 0);
                  }
                  if (area != vaddr) {
-                    fprintf(stderr, "Could not remap addr: "
-                            RAM_ADDR_FMT "@" RAM_ADDR_FMT "\n",
-                            length, addr);
+                    error_report("Could not remap addr: "
+                                 RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
+                                 length, addr);
                      exit(1);
                  }
                  memory_try_enable_merging(vaddr, length);
@@ -2256,6 +2336,16 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
      return ramblock_ptr(block, addr);
  }
  
+/* Byte offset of @host from the start of @rb's host mapping (asserted valid) */
+ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
+{
+    ram_addr_t offset = (uint8_t *)host - (uint8_t *)rb->host;
+
+    assert((uintptr_t)rb->host <= (uintptr_t)host);
+    assert(rb->max_length > offset);
+    return offset;
+}
+
  /*
   * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
   * in that RAMBlock.
@@ -2354,18 +2444,55 @@ ram_addr_t qemu_ram_addr_from_host(void *ptr)
      return block->offset + offset;
  }
  
-/* Called within RCU critical section.  */
-static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
-                               uint64_t val, unsigned size)
+/* Called within RCU critical section. */
+void memory_notdirty_write_prepare(NotDirtyInfo *ndi,
+                          CPUState *cpu,
+                          vaddr mem_vaddr,
+                          ram_addr_t ram_addr,
+                          unsigned size)
  {
-    bool locked = false;
+    ndi->cpu = cpu;
+    ndi->ram_addr = ram_addr;
+    ndi->mem_vaddr = mem_vaddr;
+    ndi->size = size;
+    ndi->locked = false;
  
      assert(tcg_enabled());
      if (!cpu_physical_memory_get_dirty_flag(ram_addr, DIRTY_MEMORY_CODE)) {
-        locked = true;
+        ndi->locked = true;
          tb_lock();
          tb_invalidate_phys_page_fast(ram_addr, size);
      }
+}
+
+/* Second half of memory_notdirty_write_prepare(); call within RCU section. */
+void memory_notdirty_write_complete(NotDirtyInfo *ndi)
+{
+    if (ndi->locked) {
+        tb_unlock(); /* pairs with the tb_lock() taken by the prepare step */
+    }
+
+    /* Set both VGA and migration bits for simplicity and to remove
+     * the notdirty callback faster.
+     */
+    cpu_physical_memory_set_dirty_range(ndi->ram_addr, ndi->size,
+                                        DIRTY_CLIENTS_NOCODE);
+    /* Remove the notdirty callback only once the code has been flushed,
+     * i.e. when the page is no longer reported as clean. */
+    if (!cpu_physical_memory_is_clean(ndi->ram_addr)) {
+        tlb_set_dirty(ndi->cpu, ndi->mem_vaddr);
+    }
+}
+
+/* Called within RCU critical section.  */
+static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
+                               uint64_t val, unsigned size)
+{
+    NotDirtyInfo ndi;
+
+    memory_notdirty_write_prepare(&ndi, current_cpu, current_cpu->mem_io_vaddr,
+                         ram_addr, size);
+
      switch (size) {
      case 1:
          stb_p(qemu_map_ram_ptr(NULL, ram_addr), val);
@@ -2382,21 +2509,7 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
      default:
          abort();
      }
-
-    if (locked) {
-        tb_unlock();
-    }
-
-    /* Set both VGA and migration bits for simplicity and to remove
-     * the notdirty callback faster.
-     */
-    cpu_physical_memory_set_dirty_range(ram_addr, size,
-                                        DIRTY_CLIENTS_NOCODE);
-    /* we remove the notdirty callback only if the code has been
-       flushed */
-    if (!cpu_physical_memory_is_clean(ram_addr)) {
-        tlb_set_dirty(current_cpu, current_cpu->mem_io_vaddr);
-    }
+    memory_notdirty_write_complete(&ndi);
  }
  
  static bool notdirty_mem_accepts(void *opaque, hwaddr addr,
@@ -2552,6 +2665,8 @@ static const MemoryRegionOps watch_mem_ops = {
      },
  };
  
+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+                                      MemTxAttrs attrs, uint8_t *buf, int len);
  static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                    const uint8_t *buf, int len);
  static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
@@ -2697,6 +2812,37 @@ static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
      return phys_section_add(map, &section);
  }
  
+static void readonly_mem_write(void *opaque, hwaddr addr,
+                               uint64_t val, unsigned size)
+{
+    /* Deliberately empty: any write to ROM is ignored, whatever its size. */
+}
+
+static bool readonly_mem_accepts(void *opaque, hwaddr addr,
+                                 unsigned size, bool is_write)
+{
+    return is_write; /* true for writes, false for reads; other args unused */
+}
+
+/* ROM ops: used only for writes (reads go straight to host ram).  .accepts
+ * sits inside the .valid braces: a later ".valid = {...}" may override it.
+ */
+static const MemoryRegionOps readonly_mem_ops = {
+    .write = readonly_mem_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .valid = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = false,
+        .accepts = readonly_mem_accepts,
+    },
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+        .unaligned = false,
+    },
+};
+
  MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
  {
      int asidx = cpu_asidx_from_attrs(cpu, attrs);
@@ -2709,7 +2855,8 @@ MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index, MemTxAttrs attrs)
  
  static void io_mem_init(void)
  {
-    memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX);
+    memory_region_init_io(&io_mem_rom, NULL, &readonly_mem_ops,
+                          NULL, NULL, UINT64_MAX);
      memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                            NULL, UINT64_MAX);
  
@@ -2982,6 +3129,7 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
      return result;
  }
  
+/* Called from RCU critical section.  */
  static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                    const uint8_t *buf, int len)
  {
@@ -2990,25 +3138,14 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
      MemoryRegion *mr;
      MemTxResult result = MEMTX_OK;
  
-    if (len > 0) {
-        rcu_read_lock();
-        l = len;
-        mr = flatview_translate(fv, addr, &addr1, &l, true);
-        result = flatview_write_continue(fv, addr, attrs, buf, len,
-                                         addr1, l, mr);
-        rcu_read_unlock();
-    }
+    l = len;
+    mr = flatview_translate(fv, addr, &addr1, &l, true);
+    result = flatview_write_continue(fv, addr, attrs, buf, len,
+                                     addr1, l, mr);
  
      return result;
  }
  
-MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
-                                              MemTxAttrs attrs,
-                                              const uint8_t *buf, int len)
-{
-    return flatview_write(address_space_to_flatview(as), addr, attrs, buf, len);
-}
-
  /* Called within RCU critical section.  */
  MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
                                     MemTxAttrs attrs, uint8_t *buf,
@@ -3079,42 +3216,61 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
      return result;
  }
  
-MemTxResult flatview_read_full(FlatView *fv, hwaddr addr,
-                               MemTxAttrs attrs, uint8_t *buf, int len)
+/* Called from RCU critical section.  */
+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+                                 MemTxAttrs attrs, uint8_t *buf, int len)
  {
      hwaddr l;
      hwaddr addr1;
      MemoryRegion *mr;
+
+    l = len;
+    mr = flatview_translate(fv, addr, &addr1, &l, false);
+    return flatview_read_continue(fv, addr, attrs, buf, len,
+                                  addr1, l, mr);
+}
+
+MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
+                                    MemTxAttrs attrs, uint8_t *buf, int len)
+{
      MemTxResult result = MEMTX_OK;
+    FlatView *fv;
  
      if (len > 0) {
          rcu_read_lock();
-        l = len;
-        mr = flatview_translate(fv, addr, &addr1, &l, false);
-        result = flatview_read_continue(fv, addr, attrs, buf, len,
-                                        addr1, l, mr);
+        fv = address_space_to_flatview(as);
+        result = flatview_read(fv, addr, attrs, buf, len);
          rcu_read_unlock();
      }
  
      return result;
  }
  
-static MemTxResult flatview_rw(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
-                               uint8_t *buf, int len, bool is_write)
+MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
+                                MemTxAttrs attrs,
+                                const uint8_t *buf, int len)
  {
-    if (is_write) {
-        return flatview_write(fv, addr, attrs, (uint8_t *)buf, len);
-    } else {
-        return flatview_read(fv, addr, attrs, (uint8_t *)buf, len);
+    MemTxResult result = MEMTX_OK;
+    FlatView *fv;
+
+    if (len > 0) {
+        rcu_read_lock();
+        fv = address_space_to_flatview(as);
+        result = flatview_write(fv, addr, attrs, buf, len);
+        rcu_read_unlock();
      }
+
+    return result;
  }
  
-MemTxResult address_space_rw(AddressSpace *as, hwaddr addr,
-                             MemTxAttrs attrs, uint8_t *buf,
-                             int len, bool is_write)
+MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
+                             uint8_t *buf, int len, bool is_write)
  {
-    return flatview_rw(address_space_to_flatview(as),
-                       addr, attrs, buf, len, is_write);
+    if (is_write) {
+        return address_space_write(as, addr, attrs, buf, len);
+    } else {
+        return address_space_read_full(as, addr, attrs, buf, len);
+    }
  }
  
  void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
@@ -3280,14 +3436,12 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
      MemoryRegion *mr;
      hwaddr l, xlat;
  
-    rcu_read_lock();
      while (len > 0) {
          l = len;
          mr = flatview_translate(fv, addr, &xlat, &l, is_write);
          if (!memory_access_is_direct(mr, is_write)) {
              l = memory_access_size(mr, l, addr);
              if (!memory_region_access_valid(mr, xlat, l, is_write)) {
-                rcu_read_unlock();
                  return false;
              }
          }
@@ -3295,15 +3449,20 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
          len -= l;
          addr += l;
      }
-    rcu_read_unlock();
      return true;
  }
  
  bool address_space_access_valid(AddressSpace *as, hwaddr addr,
                                  int len, bool is_write)
  {
-    return flatview_access_valid(address_space_to_flatview(as),
-                                 addr, len, is_write);
+    FlatView *fv;
+    bool result;
+
+    rcu_read_lock();
+    fv = address_space_to_flatview(as);
+    result = flatview_access_valid(fv, addr, len, is_write);
+    rcu_read_unlock();
+    return result;
  }
  
  static hwaddr
@@ -3349,7 +3508,7 @@ void *address_space_map(AddressSpace *as,
      hwaddr l, xlat;
      MemoryRegion *mr;
      void *ptr;
-    FlatView *fv = address_space_to_flatview(as);
+    FlatView *fv;
  
      if (len == 0) {
          return NULL;
@@ -3357,6 +3516,7 @@ void *address_space_map(AddressSpace *as,
  
      l = len;
      rcu_read_lock();
+    fv = address_space_to_flatview(as);
      mr = flatview_translate(fv, addr, &xlat, &l, is_write);
  
      if (!memory_access_is_direct(mr, is_write)) {
@@ -3610,6 +3770,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
      }
  
      if ((start + length) <= rb->used_length) {
+        bool need_madvise, need_fallocate;
          uint8_t *host_endaddr = host_startaddr + length;
          if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
              error_report("ram_block_discard_range: Unaligned end address: %p",
@@ -3619,29 +3780,60 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
  
          errno = ENOTSUP; /* If we are missing MADVISE etc */
  
-        if (rb->page_size == qemu_host_page_size) {
-#if defined(CONFIG_MADVISE)
-            /* Note: We need the madvise MADV_DONTNEED behaviour of definitely
-             * freeing the page.
-             */
-            ret = madvise(host_startaddr, length, MADV_DONTNEED);
-#endif
-        } else {
-            /* Huge page case  - unfortunately it can't do DONTNEED, but
-             * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
-             * huge page file.
+        /* The logic here is messy;
+         *    madvise DONTNEED fails for hugepages
+         *    fallocate works on hugepages and shmem
+         */
+        need_madvise = (rb->page_size == qemu_host_page_size);
+        need_fallocate = rb->fd != -1;
+        if (need_fallocate) {
+            /* For a file, this causes the area of the file to be zero'd
+             * if read, and for hugetlbfs also causes it to be unmapped
+             * so a userfault will trigger.
               */
  #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
              ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                              start, length);
+            if (ret) {
+                ret = -errno;
+                error_report("ram_block_discard_range: Failed to fallocate "
+                             "%s:%" PRIx64 " +%zx (%d)",
+                             rb->idstr, start, length, ret);
+                goto err;
+            }
+#else
+            ret = -ENOSYS;
+            error_report("ram_block_discard_range: fallocate not available/file"
+                         "%s:%" PRIx64 " +%zx (%d)",
+                         rb->idstr, start, length, ret);
+            goto err;
  #endif
          }
-        if (ret) {
-            ret = -errno;
-            error_report("ram_block_discard_range: Failed to discard range "
+        if (need_madvise) {
+            /* For normal RAM this causes it to be unmapped,
+             * for shared memory it causes the local mapping to disappear
+             * and to fall back on the file contents (which we just
+             * fallocate'd away).
+             */
+#if defined(CONFIG_MADVISE)
+            ret =  madvise(host_startaddr, length, MADV_DONTNEED);
+            if (ret) {
+                ret = -errno;
+                error_report("ram_block_discard_range: Failed to discard range "
+                             "%s:%" PRIx64 " +%zx (%d)",
+                             rb->idstr, start, length, ret);
+                goto err;
+            }
+#else
+            ret = -ENOSYS;
+            error_report("ram_block_discard_range: MADVISE not available"
                           "%s:%" PRIx64 " +%zx (%d)",
                           rb->idstr, start, length, ret);
+            goto err;
+#endif
          }
+        trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
+                                      need_madvise, need_fallocate, ret);
      } else {
          error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
                       "/%zx/" RAM_ADDR_FMT")",