accel/tcg: Merge io_writex into do_st_mmio_leN

[mirror_qemu.git] / accel / tcg / cputlb.c
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c

index ba44501a7c52dbc98770da9dd75ceac1f2836b35..1d56e3ec0c3955dc27f3a3c05b1c362c797c7cec 100644 (file)
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -74,8 +74,9 @@
      } while (0)
  
  /* run_on_cpu_data.target_ptr should always be big enough for a
- * target_ulong even on 32 bit builds */
-QEMU_BUILD_BUG_ON(sizeof(target_ulong) > sizeof(run_on_cpu_data));
+ * vaddr even on 32 bit builds
+ */
+QEMU_BUILD_BUG_ON(sizeof(vaddr) > sizeof(run_on_cpu_data));
  
  /* We currently can't handle more than 16 bits in the MMUIDX bitmask.
   */
@@ -1108,7 +1109,7 @@ static void tlb_add_large_page(CPUArchState *env, int mmu_idx,
  }
  
  static inline void tlb_set_compare(CPUTLBEntryFull *full, CPUTLBEntry *ent,
-                                   target_ulong address, int flags,
+                                   vaddr address, int flags,
                                     MMUAccessType access_type, bool enable)
  {
      if (enable) {
@@ -1192,6 +1193,7 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
      write_flags = read_flags;
      if (is_ram) {
          iotlb = memory_region_get_ram_addr(section->mr) + xlat;
+        assert(!(iotlb & ~TARGET_PAGE_MASK));
          /*
           * Computing is_clean is expensive; avoid all that unless
           * the page is actually writable.
@@ -1254,16 +1256,18 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
  
      /* refill the tlb */
      /*
-     * At this point iotlb contains a physical section number in the lower
-     * TARGET_PAGE_BITS, and either
-     *  + the ram_addr_t of the page base of the target RAM (RAM)
-     *  + the offset within section->mr of the page base (I/O, ROMD)
+     * When memory region is ram, iotlb contains a TARGET_PAGE_BITS
+     * aligned ram_addr_t of the page base of the target RAM.
+     * Otherwise, iotlb contains
+     *  - a physical section number in the lower TARGET_PAGE_BITS
+     *  - the offset within section->mr of the page base (I/O, ROMD) with the
+     *    TARGET_PAGE_BITS masked off.
       * We subtract addr_page (which is page aligned and thus won't
       * disturb the low bits) to give an offset which can be added to the
       * (non-page-aligned) vaddr of the eventual memory access to get
       * the MemoryRegion offset for the access. Note that the vaddr we
       * subtract here is that of the page base, and not the same as the
-     * vaddr we add back in io_readx()/io_writex()/get_page_addr_code().
+     * vaddr we add back in io_prepare()/get_page_addr_code().
       */
      desc->fulltlb[index] = *full;
      full = &desc->fulltlb[index];
@@ -1346,110 +1350,41 @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
                                            mmu_idx, retaddr);
  }
  
-static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
-                                          vaddr addr, unsigned size,
-                                          MMUAccessType access_type,
-                                          int mmu_idx, MemTxAttrs attrs,
-                                          MemTxResult response,
-                                          uintptr_t retaddr)
-{
-    CPUClass *cc = CPU_GET_CLASS(cpu);
-
-    if (!cpu->ignore_memory_transaction_failures &&
-        cc->tcg_ops->do_transaction_failed) {
-        cc->tcg_ops->do_transaction_failed(cpu, physaddr, addr, size,
-                                           access_type, mmu_idx, attrs,
-                                           response, retaddr);
-    }
-}
-
-static uint64_t io_readx(CPUArchState *env, CPUTLBEntryFull *full,
-                         int mmu_idx, vaddr addr, uintptr_t retaddr,
-                         MMUAccessType access_type, MemOp op)
+static MemoryRegionSection *
+io_prepare(hwaddr *out_offset, CPUArchState *env, hwaddr xlat,
+           MemTxAttrs attrs, vaddr addr, uintptr_t retaddr)
  {
      CPUState *cpu = env_cpu(env);
-    hwaddr mr_offset;
      MemoryRegionSection *section;
-    MemoryRegion *mr;
-    uint64_t val;
-    MemTxResult r;
+    hwaddr mr_offset;
  
-    section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
-    mr = section->mr;
-    mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
+    section = iotlb_to_section(cpu, xlat, attrs);
+    mr_offset = (xlat & TARGET_PAGE_MASK) + addr;
      cpu->mem_io_pc = retaddr;
      if (!cpu->can_do_io) {
          cpu_io_recompile(cpu, retaddr);
      }
  
-    {
-        QEMU_IOTHREAD_LOCK_GUARD();
-        r = memory_region_dispatch_read(mr, mr_offset, &val, op, full->attrs);
-    }
-
-    if (r != MEMTX_OK) {
-        hwaddr physaddr = mr_offset +
-            section->offset_within_address_space -
-            section->offset_within_region;
-
-        cpu_transaction_failed(cpu, physaddr, addr, memop_size(op), access_type,
-                               mmu_idx, full->attrs, r, retaddr);
-    }
-    return val;
-}
-
-/*
- * Save a potentially trashed CPUTLBEntryFull for later lookup by plugin.
- * This is read by tlb_plugin_lookup if the fulltlb entry doesn't match
- * because of the side effect of io_writex changing memory layout.
- */
-static void save_iotlb_data(CPUState *cs, MemoryRegionSection *section,
-                            hwaddr mr_offset)
-{
-#ifdef CONFIG_PLUGIN
-    SavedIOTLB *saved = &cs->saved_iotlb;
-    saved->section = section;
-    saved->mr_offset = mr_offset;
-#endif
+    *out_offset = mr_offset;
+    return section;
  }
  
-static void io_writex(CPUArchState *env, CPUTLBEntryFull *full,
-                      int mmu_idx, uint64_t val, vaddr addr,
-                      uintptr_t retaddr, MemOp op)
+static void io_failed(CPUArchState *env, CPUTLBEntryFull *full, vaddr addr,
+                      unsigned size, MMUAccessType access_type, int mmu_idx,
+                      MemTxResult response, uintptr_t retaddr)
  {
      CPUState *cpu = env_cpu(env);
-    hwaddr mr_offset;
-    MemoryRegionSection *section;
-    MemoryRegion *mr;
-    MemTxResult r;
-
-    section = iotlb_to_section(cpu, full->xlat_section, full->attrs);
-    mr = section->mr;
-    mr_offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
-    if (!cpu->can_do_io) {
-        cpu_io_recompile(cpu, retaddr);
-    }
-    cpu->mem_io_pc = retaddr;
  
-    /*
-     * The memory_region_dispatch may trigger a flush/resize
-     * so for plugins we save the iotlb_data just in case.
-     */
-    save_iotlb_data(cpu, section, mr_offset);
-
-    {
-        QEMU_IOTHREAD_LOCK_GUARD();
-        r = memory_region_dispatch_write(mr, mr_offset, val, op, full->attrs);
-    }
+    if (!cpu->ignore_memory_transaction_failures) {
+        CPUClass *cc = CPU_GET_CLASS(cpu);
  
-    if (r != MEMTX_OK) {
-        hwaddr physaddr = mr_offset +
-            section->offset_within_address_space -
-            section->offset_within_region;
+        if (cc->tcg_ops->do_transaction_failed) {
+            hwaddr physaddr = full->phys_addr | (addr & ~TARGET_PAGE_MASK);
  
-        cpu_transaction_failed(cpu, physaddr, addr, memop_size(op),
-                               MMU_DATA_STORE, mmu_idx, full->attrs, r,
-                               retaddr);
+            cc->tcg_ops->do_transaction_failed(cpu, physaddr, addr, size,
+                                               access_type, mmu_idx,
+                                               full->attrs, response, retaddr);
+        }
      }
  }
  
@@ -1719,45 +1654,41 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, vaddr addr,
   * in the softmmu lookup code (or helper). We don't handle re-fills or
   * checking the victim table. This is purely informational.
   *
- * This almost never fails as the memory access being instrumented
- * should have just filled the TLB. The one corner case is io_writex
- * which can cause TLB flushes and potential resizing of the TLBs
- * losing the information we need. In those cases we need to recover
- * data from a copy of the CPUTLBEntryFull. As long as this always occurs
- * from the same thread (which a mem callback will be) this is safe.
+ * The one corner case is i/o write, which can cause changes to the
+ * address space.  Those changes, and the corresponding tlb flush,
+ * should be delayed until the next TB, so even then this ought not fail.
+ * But check, Just in Case.
   */
-
  bool tlb_plugin_lookup(CPUState *cpu, vaddr addr, int mmu_idx,
                         bool is_store, struct qemu_plugin_hwaddr *data)
  {
      CPUArchState *env = cpu->env_ptr;
      CPUTLBEntry *tlbe = tlb_entry(env, mmu_idx, addr);
      uintptr_t index = tlb_index(env, mmu_idx, addr);
-    uint64_t tlb_addr = is_store ? tlb_addr_write(tlbe) : tlbe->addr_read;
-
-    if (likely(tlb_hit(tlb_addr, addr))) {
-        /* We must have an iotlb entry for MMIO */
-        if (tlb_addr & TLB_MMIO) {
-            CPUTLBEntryFull *full;
-            full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
-            data->is_io = true;
-            data->v.io.section =
-                iotlb_to_section(cpu, full->xlat_section, full->attrs);
-            data->v.io.offset = (full->xlat_section & TARGET_PAGE_MASK) + addr;
-        } else {
-            data->is_io = false;
-            data->v.ram.hostaddr = (void *)((uintptr_t)addr + tlbe->addend);
-        }
-        return true;
-    } else {
-        SavedIOTLB *saved = &cpu->saved_iotlb;
+    MMUAccessType access_type = is_store ? MMU_DATA_STORE : MMU_DATA_LOAD;
+    uint64_t tlb_addr = tlb_read_idx(tlbe, access_type);
+    CPUTLBEntryFull *full;
+
+    if (unlikely(!tlb_hit(tlb_addr, addr))) {
+        return false;
+    }
+
+    full = &env_tlb(env)->d[mmu_idx].fulltlb[index];
+    data->phys_addr = full->phys_addr | (addr & ~TARGET_PAGE_MASK);
+
+    /* We must have an iotlb entry for MMIO */
+    if (tlb_addr & TLB_MMIO) {
+        MemoryRegionSection *section =
+            iotlb_to_section(cpu, full->xlat_section & ~TARGET_PAGE_MASK,
+                             full->attrs);
          data->is_io = true;
-        data->v.io.section = saved->section;
-        data->v.io.offset = saved->mr_offset;
-        return true;
+        data->mr = section->mr;
+    } else {
+        data->is_io = false;
+        data->mr = NULL;
      }
+    return true;
  }
-
  #endif
  
  /*
@@ -2066,27 +1997,57 @@ static void *atomic_mmu_lookup(CPUArchState *env, vaddr addr, MemOpIdx oi,
  /**
   * do_ld_mmio_beN:
   * @env: cpu context
- * @p: translation parameters
+ * @full: page parameters
   * @ret_be: accumulated data
+ * @addr: virtual address
+ * @size: number of bytes
   * @mmu_idx: virtual address context
   * @ra: return address into tcg generated code, or 0
+ * Context: iothread lock held
   *
- * Load @p->size bytes from @p->addr, which is memory-mapped i/o.
+ * Load @size bytes from @addr, which is memory-mapped i/o.
   * The bytes are concatenated in big-endian order with @ret_be.
   */
-static uint64_t do_ld_mmio_beN(CPUArchState *env, MMULookupPageData *p,
-                               uint64_t ret_be, int mmu_idx,
-                               MMUAccessType type, uintptr_t ra)
+static uint64_t do_ld_mmio_beN(CPUArchState *env, CPUTLBEntryFull *full,
+                               uint64_t ret_be, vaddr addr, int size,
+                               int mmu_idx, MMUAccessType type, uintptr_t ra)
  {
-    CPUTLBEntryFull *full = p->full;
-    vaddr addr = p->addr;
-    int i, size = p->size;
+    MemoryRegionSection *section;
+    hwaddr mr_offset;
+    MemoryRegion *mr;
+    MemTxAttrs attrs;
+
+    tcg_debug_assert(size > 0 && size <= 8);
+
+    attrs = full->attrs;
+    section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
+    mr = section->mr;
+
+    do {
+        MemOp this_mop;
+        unsigned this_size;
+        uint64_t val;
+        MemTxResult r;
+
+        /* Read aligned pieces up to 8 bytes. */
+        this_mop = ctz32(size | (int)addr | 8);
+        this_size = 1 << this_mop;
+        this_mop |= MO_BE;
+
+        r = memory_region_dispatch_read(mr, mr_offset, &val, this_mop, attrs);
+        if (unlikely(r != MEMTX_OK)) {
+            io_failed(env, full, addr, this_size, type, mmu_idx, r, ra);
+        }
+        if (this_size == 8) {
+            return val;
+        }
+
+        ret_be = (ret_be << (this_size * 8)) | val;
+        addr += this_size;
+        mr_offset += this_size;
+        size -= this_size;
+    } while (size);
  
-    QEMU_IOTHREAD_LOCK_GUARD();
-    for (i = 0; i < size; i++) {
-        uint8_t x = io_readx(env, full, mmu_idx, addr + i, ra, type, MO_UB);
-        ret_be = (ret_be << 8) | x;
-    }
      return ret_be;
  }
  
@@ -2232,7 +2193,9 @@ static uint64_t do_ld_beN(CPUArchState *env, MMULookupPageData *p,
      unsigned tmp, half_size;
  
      if (unlikely(p->flags & TLB_MMIO)) {
-        return do_ld_mmio_beN(env, p, ret_be, mmu_idx, type, ra);
+        QEMU_IOTHREAD_LOCK_GUARD();
+        return do_ld_mmio_beN(env, p->full, ret_be, p->addr, p->size,
+                              mmu_idx, type, ra);
      }
  
      /*
@@ -2281,11 +2244,11 @@ static Int128 do_ld16_beN(CPUArchState *env, MMULookupPageData *p,
      MemOp atom;
  
      if (unlikely(p->flags & TLB_MMIO)) {
-        p->size = size - 8;
-        a = do_ld_mmio_beN(env, p, a, mmu_idx, MMU_DATA_LOAD, ra);
-        p->addr += p->size;
-        p->size = 8;
-        b = do_ld_mmio_beN(env, p, 0, mmu_idx, MMU_DATA_LOAD, ra);
+        QEMU_IOTHREAD_LOCK_GUARD();
+        a = do_ld_mmio_beN(env, p->full, a, p->addr, size - 8,
+                           mmu_idx, MMU_DATA_LOAD, ra);
+        b = do_ld_mmio_beN(env, p->full, 0, p->addr + 8, 8,
+                           mmu_idx, MMU_DATA_LOAD, ra);
          return int128_make128(b, a);
      }
  
@@ -2331,7 +2294,8 @@ static uint8_t do_ld_1(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
                         MMUAccessType type, uintptr_t ra)
  {
      if (unlikely(p->flags & TLB_MMIO)) {
-        return io_readx(env, p->full, mmu_idx, p->addr, ra, type, MO_UB);
+        QEMU_IOTHREAD_LOCK_GUARD();
+        return do_ld_mmio_beN(env, p->full, 0, p->addr, 1, mmu_idx, type, ra);
      } else {
          return *(uint8_t *)p->haddr;
      }
@@ -2340,16 +2304,20 @@ static uint8_t do_ld_1(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
  static uint16_t do_ld_2(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
                          MMUAccessType type, MemOp memop, uintptr_t ra)
  {
-    uint64_t ret;
+    uint16_t ret;
  
      if (unlikely(p->flags & TLB_MMIO)) {
-        return io_readx(env, p->full, mmu_idx, p->addr, ra, type, memop);
-    }
-
-    /* Perform the load host endian, then swap if necessary. */
-    ret = load_atom_2(env, ra, p->haddr, memop);
-    if (memop & MO_BSWAP) {
-        ret = bswap16(ret);
+        QEMU_IOTHREAD_LOCK_GUARD();
+        ret = do_ld_mmio_beN(env, p->full, 0, p->addr, 2, mmu_idx, type, ra);
+        if ((memop & MO_BSWAP) == MO_LE) {
+            ret = bswap16(ret);
+        }
+    } else {
+        /* Perform the load host endian, then swap if necessary. */
+        ret = load_atom_2(env, ra, p->haddr, memop);
+        if (memop & MO_BSWAP) {
+            ret = bswap16(ret);
+        }
      }
      return ret;
  }
@@ -2360,13 +2328,17 @@ static uint32_t do_ld_4(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
      uint32_t ret;
  
      if (unlikely(p->flags & TLB_MMIO)) {
-        return io_readx(env, p->full, mmu_idx, p->addr, ra, type, memop);
-    }
-
-    /* Perform the load host endian. */
-    ret = load_atom_4(env, ra, p->haddr, memop);
-    if (memop & MO_BSWAP) {
-        ret = bswap32(ret);
+        QEMU_IOTHREAD_LOCK_GUARD();
+        ret = do_ld_mmio_beN(env, p->full, 0, p->addr, 4, mmu_idx, type, ra);
+        if ((memop & MO_BSWAP) == MO_LE) {
+            ret = bswap32(ret);
+        }
+    } else {
+        /* Perform the load host endian. */
+        ret = load_atom_4(env, ra, p->haddr, memop);
+        if (memop & MO_BSWAP) {
+            ret = bswap32(ret);
+        }
      }
      return ret;
  }
@@ -2377,13 +2349,17 @@ static uint64_t do_ld_8(CPUArchState *env, MMULookupPageData *p, int mmu_idx,
      uint64_t ret;
  
      if (unlikely(p->flags & TLB_MMIO)) {
-        return io_readx(env, p->full, mmu_idx, p->addr, ra, type, memop);
-    }
-
-    /* Perform the load host endian. */
-    ret = load_atom_8(env, ra, p->haddr, memop);
-    if (memop & MO_BSWAP) {
-        ret = bswap64(ret);
+        QEMU_IOTHREAD_LOCK_GUARD();
+        ret = do_ld_mmio_beN(env, p->full, 0, p->addr, 8, mmu_idx, type, ra);
+        if ((memop & MO_BSWAP) == MO_LE) {
+            ret = bswap64(ret);
+        }
+    } else {
+        /* Perform the load host endian. */
+        ret = load_atom_8(env, ra, p->haddr, memop);
+        if (memop & MO_BSWAP) {
+            ret = bswap64(ret);
+        }
      }
      return ret;
  }
@@ -2531,20 +2507,22 @@ static Int128 do_ld16_mmu(CPUArchState *env, vaddr addr,
      cpu_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
      crosspage = mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD, &l);
      if (likely(!crosspage)) {
-        /* Perform the load host endian. */
          if (unlikely(l.page[0].flags & TLB_MMIO)) {
              QEMU_IOTHREAD_LOCK_GUARD();
-            a = io_readx(env, l.page[0].full, l.mmu_idx, addr,
-                         ra, MMU_DATA_LOAD, MO_64);
-            b = io_readx(env, l.page[0].full, l.mmu_idx, addr + 8,
-                         ra, MMU_DATA_LOAD, MO_64);
-            ret = int128_make128(HOST_BIG_ENDIAN ? b : a,
-                                 HOST_BIG_ENDIAN ? a : b);
+            a = do_ld_mmio_beN(env, l.page[0].full, 0, addr, 8,
+                               l.mmu_idx, MMU_DATA_LOAD, ra);
+            b = do_ld_mmio_beN(env, l.page[0].full, 0, addr + 8, 8,
+                               l.mmu_idx, MMU_DATA_LOAD, ra);
+            ret = int128_make128(b, a);
+            if ((l.memop & MO_BSWAP) == MO_LE) {
+                ret = bswap128(ret);
+            }
          } else {
+            /* Perform the load host endian. */
              ret = load_atom_16(env, ra, l.page[0].haddr, l.memop);
-        }
-        if (l.memop & MO_BSWAP) {
-            ret = bswap128(ret);
+            if (l.memop & MO_BSWAP) {
+                ret = bswap128(ret);
+            }
          }
          return ret;
      }
@@ -2664,26 +2642,59 @@ Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr,
  /**
   * do_st_mmio_leN:
   * @env: cpu context
- * @p: translation parameters
+ * @full: page parameters
   * @val_le: data to store
+ * @addr: virtual address
+ * @size: number of bytes
   * @mmu_idx: virtual address context
   * @ra: return address into tcg generated code, or 0
+ * Context: iothread lock held
   *
- * Store @p->size bytes at @p->addr, which is memory-mapped i/o.
+ * Store @size bytes at @addr, which is memory-mapped i/o.
   * The bytes to store are extracted in little-endian order from @val_le;
   * return the bytes of @val_le beyond @p->size that have not been stored.
   */
-static uint64_t do_st_mmio_leN(CPUArchState *env, MMULookupPageData *p,
-                               uint64_t val_le, int mmu_idx, uintptr_t ra)
+static uint64_t do_st_mmio_leN(CPUArchState *env, CPUTLBEntryFull *full,
+                               uint64_t val_le, vaddr addr, int size,
+                               int mmu_idx, uintptr_t ra)
  {
-    CPUTLBEntryFull *full = p->full;
-    vaddr addr = p->addr;
-    int i, size = p->size;
+    MemoryRegionSection *section;
+    hwaddr mr_offset;
+    MemoryRegion *mr;
+    MemTxAttrs attrs;
+
+    tcg_debug_assert(size > 0 && size <= 8);
+
+    attrs = full->attrs;
+    section = io_prepare(&mr_offset, env, full->xlat_section, attrs, addr, ra);
+    mr = section->mr;
+
+    do {
+        MemOp this_mop;
+        unsigned this_size;
+        MemTxResult r;
+
+        /* Store aligned pieces up to 8 bytes. */
+        this_mop = ctz32(size | (int)addr | 8);
+        this_size = 1 << this_mop;
+        this_mop |= MO_LE;
+
+        r = memory_region_dispatch_write(mr, mr_offset, val_le,
+                                         this_mop, attrs);
+        if (unlikely(r != MEMTX_OK)) {
+            io_failed(env, full, addr, this_size, MMU_DATA_STORE,
+                      mmu_idx, r, ra);
+        }
+        if (this_size == 8) {
+            return 0;
+        }
+
+        val_le >>= this_size * 8;
+        addr += this_size;
+        mr_offset += this_size;
+        size -= this_size;
+    } while (size);
  
-    QEMU_IOTHREAD_LOCK_GUARD();
-    for (i = 0; i < size; i++, val_le >>= 8) {
-        io_writex(env, full, mmu_idx, val_le, addr + i, ra, MO_UB);
-    }
      return val_le;
  }
  
@@ -2698,7 +2709,9 @@ static uint64_t do_st_leN(CPUArchState *env, MMULookupPageData *p,
      unsigned tmp, half_size;
  
      if (unlikely(p->flags & TLB_MMIO)) {
-        return do_st_mmio_leN(env, p, val_le, mmu_idx, ra);
+        QEMU_IOTHREAD_LOCK_GUARD();
+        return do_st_mmio_leN(env, p->full, val_le, p->addr,
+                              p->size, mmu_idx, ra);
      } else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
          return val_le >> (p->size * 8);
      }
@@ -2751,11 +2764,11 @@ static uint64_t do_st16_leN(CPUArchState *env, MMULookupPageData *p,
      MemOp atom;
  
      if (unlikely(p->flags & TLB_MMIO)) {
-        p->size = 8;
-        do_st_mmio_leN(env, p, int128_getlo(val_le), mmu_idx, ra);
-        p->size = size - 8;
-        p->addr += 8;
-        return do_st_mmio_leN(env, p, int128_gethi(val_le), mmu_idx, ra);
+        QEMU_IOTHREAD_LOCK_GUARD();
+        do_st_mmio_leN(env, p->full, int128_getlo(val_le),
+                       p->addr, 8, mmu_idx, ra);
+        return do_st_mmio_leN(env, p->full, int128_gethi(val_le),
+                              p->addr + 8, size - 8, mmu_idx, ra);
      } else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
          return int128_gethi(val_le) >> ((size - 8) * 8);
      }
@@ -2799,7 +2812,8 @@ static void do_st_1(CPUArchState *env, MMULookupPageData *p, uint8_t val,
                      int mmu_idx, uintptr_t ra)
  {
      if (unlikely(p->flags & TLB_MMIO)) {
-        io_writex(env, p->full, mmu_idx, val, p->addr, ra, MO_UB);
+        QEMU_IOTHREAD_LOCK_GUARD();
+        do_st_mmio_leN(env, p->full, val, p->addr, 1, mmu_idx, ra);
      } else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
          /* nothing */
      } else {
@@ -2811,7 +2825,11 @@ static void do_st_2(CPUArchState *env, MMULookupPageData *p, uint16_t val,
                      int mmu_idx, MemOp memop, uintptr_t ra)
  {
      if (unlikely(p->flags & TLB_MMIO)) {
-        io_writex(env, p->full, mmu_idx, val, p->addr, ra, memop);
+        if ((memop & MO_BSWAP) != MO_LE) {
+            val = bswap16(val);
+        }
+        QEMU_IOTHREAD_LOCK_GUARD();
+        do_st_mmio_leN(env, p->full, val, p->addr, 2, mmu_idx, ra);
      } else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
          /* nothing */
      } else {
@@ -2827,7 +2845,11 @@ static void do_st_4(CPUArchState *env, MMULookupPageData *p, uint32_t val,
                      int mmu_idx, MemOp memop, uintptr_t ra)
  {
      if (unlikely(p->flags & TLB_MMIO)) {
-        io_writex(env, p->full, mmu_idx, val, p->addr, ra, memop);
+        if ((memop & MO_BSWAP) != MO_LE) {
+            val = bswap32(val);
+        }
+        QEMU_IOTHREAD_LOCK_GUARD();
+        do_st_mmio_leN(env, p->full, val, p->addr, 4, mmu_idx, ra);
      } else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
          /* nothing */
      } else {
@@ -2843,7 +2865,11 @@ static void do_st_8(CPUArchState *env, MMULookupPageData *p, uint64_t val,
                      int mmu_idx, MemOp memop, uintptr_t ra)
  {
      if (unlikely(p->flags & TLB_MMIO)) {
-        io_writex(env, p->full, mmu_idx, val, p->addr, ra, memop);
+        if ((memop & MO_BSWAP) != MO_LE) {
+            val = bswap64(val);
+        }
+        QEMU_IOTHREAD_LOCK_GUARD();
+        do_st_mmio_leN(env, p->full, val, p->addr, 8, mmu_idx, ra);
      } else if (unlikely(p->flags & TLB_DISCARD_WRITE)) {
          /* nothing */
      } else {
@@ -2966,22 +2992,22 @@ static void do_st16_mmu(CPUArchState *env, vaddr addr, Int128 val,
      cpu_req_mo(TCG_MO_LD_ST | TCG_MO_ST_ST);
      crosspage = mmu_lookup(env, addr, oi, ra, MMU_DATA_STORE, &l);
      if (likely(!crosspage)) {
-        /* Swap to host endian if necessary, then store. */
-        if (l.memop & MO_BSWAP) {
-            val = bswap128(val);
-        }
          if (unlikely(l.page[0].flags & TLB_MMIO)) {
-            QEMU_IOTHREAD_LOCK_GUARD();
-            if (HOST_BIG_ENDIAN) {
-                b = int128_getlo(val), a = int128_gethi(val);
-            } else {
-                a = int128_getlo(val), b = int128_gethi(val);
+            if ((l.memop & MO_BSWAP) != MO_LE) {
+                val = bswap128(val);
              }
-            io_writex(env, l.page[0].full, l.mmu_idx, a, addr, ra, MO_64);
-            io_writex(env, l.page[0].full, l.mmu_idx, b, addr + 8, ra, MO_64);
+            a = int128_getlo(val);
+            b = int128_gethi(val);
+            QEMU_IOTHREAD_LOCK_GUARD();
+            do_st_mmio_leN(env, l.page[0].full, a, addr, 8, l.mmu_idx, ra);
+            do_st_mmio_leN(env, l.page[0].full, b, addr + 8, 8, l.mmu_idx, ra);
          } else if (unlikely(l.page[0].flags & TLB_DISCARD_WRITE)) {
              /* nothing */
          } else {
+            /* Swap to host endian if necessary, then store. */
+            if (l.memop & MO_BSWAP) {
+                val = bswap128(val);
+            }
              store_atom_16(env, ra, l.page[0].haddr, l.memop, val);
          }
          return;
@@ -3038,14 +3064,14 @@ static void plugin_store_cb(CPUArchState *env, abi_ptr addr, MemOpIdx oi)
      qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_W);
  }
  
-void cpu_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
+void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val,
                   MemOpIdx oi, uintptr_t retaddr)
  {
      helper_stb_mmu(env, addr, val, oi, retaddr);
      plugin_store_cb(env, addr, oi);
  }
  
-void cpu_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
+void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val,
                   MemOpIdx oi, uintptr_t retaddr)
  {
      tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16);
@@ -3053,7 +3079,7 @@ void cpu_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
      plugin_store_cb(env, addr, oi);
  }
  
-void cpu_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
+void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val,
                      MemOpIdx oi, uintptr_t retaddr)
  {
      tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32);
@@ -3061,7 +3087,7 @@ void cpu_stl_mmu(CPUArchState *env, target_ulong addr, uint32_t val,
      plugin_store_cb(env, addr, oi);
  }
  
-void cpu_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
+void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val,
                   MemOpIdx oi, uintptr_t retaddr)
  {
      tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64);
@@ -3069,7 +3095,7 @@ void cpu_stq_mmu(CPUArchState *env, target_ulong addr, uint64_t val,
      plugin_store_cb(env, addr, oi);
  }
  
-void cpu_st16_mmu(CPUArchState *env, target_ulong addr, Int128 val,
+void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
                    MemOpIdx oi, uintptr_t retaddr)
  {
      tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128);