9pfs: disable msize warning for synth driver

[mirror_qemu.git] / hw / 9pfs / 9p.c
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c

index 55821343e59475ee768f7c8ca029db1b69c039e9..741d222c3f2fe5769c6bf8959f257450fcc8f7e2 100644 (file)
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -17,6 +17,7 @@
  #include "qapi/error.h"
  #include "qemu/error-report.h"
  #include "qemu/iov.h"
+#include "qemu/main-loop.h"
  #include "qemu/sockets.h"
  #include "virtio-9p.h"
  #include "fsdev/qemu-fsdev.h"
@@ -25,6 +26,9 @@
  #include "trace.h"
  #include "migration/blocker.h"
  #include "sysemu/qtest.h"
+#include "qemu/xxhash.h"
+#include <math.h>
+#include <linux/limits.h>
  
  int open_fd_hw;
  int total_open_fd;
@@ -310,8 +314,8 @@ static V9fsFidState *alloc_fid(V9fsState *s, int32_t fid)
      f->next = s->fid_list;
      s->fid_list = f;
  
-    v9fs_readdir_init(&f->fs.dir);
-    v9fs_readdir_init(&f->fs_reclaim.dir);
+    v9fs_readdir_init(s->proto_version, &f->fs.dir);
+    v9fs_readdir_init(s->proto_version, &f->fs_reclaim.dir);
  
      return f;
  }
@@ -571,14 +575,374 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
                                  P9_STAT_MODE_NAMED_PIPE |   \
                                  P9_STAT_MODE_SOCKET)
  
-/* This is the algorithm from ufs in spfs */
-static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
+/* Mirrors all bits of a byte. So e.g. binary 10100000 would become 00000101. */
+static inline uint8_t mirror8bit(uint8_t byte)
  {
+    return (byte * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+}
+
+/* Same as mirror8bit() just for a 64 bit data type instead for a byte. */
+static inline uint64_t mirror64bit(uint64_t value)
+{
+    return ((uint64_t)mirror8bit(value         & 0xff) << 56) |
+           ((uint64_t)mirror8bit((value >> 8)  & 0xff) << 48) |
+           ((uint64_t)mirror8bit((value >> 16) & 0xff) << 40) |
+           ((uint64_t)mirror8bit((value >> 24) & 0xff) << 32) |
+           ((uint64_t)mirror8bit((value >> 32) & 0xff) << 24) |
+           ((uint64_t)mirror8bit((value >> 40) & 0xff) << 16) |
+           ((uint64_t)mirror8bit((value >> 48) & 0xff) << 8)  |
+           ((uint64_t)mirror8bit((value >> 56) & 0xff));
+}
+
+/**
+ * @brief Parameter k for the Exponential Golomb algorihm to be used.
+ *
+ * The smaller this value, the smaller the minimum bit count for the Exp.
+ * Golomb generated affixes will be (at lowest index) however for the
+ * price of having higher maximum bit count of generated affixes (at highest
+ * index). Likewise increasing this parameter yields in smaller maximum bit
+ * count for the price of having higher minimum bit count.
+ *
+ * In practice that means: a good value for k depends on the expected amount
+ * of devices to be exposed by one export. For a small amount of devices k
+ * should be small, for a large amount of devices k might be increased
+ * instead. The default of k=0 should be fine for most users though.
+ *
+ * @b IMPORTANT: In case this ever becomes a runtime parameter; the value of
+ * k should not change as long as guest is still running! Because that would
+ * cause completely different inode numbers to be generated on guest.
+ */
+#define EXP_GOLOMB_K    0
+
+/**
+ * @brief Exponential Golomb algorithm for arbitrary k (including k=0).
+ *
+ * The Exponential Golomb algorithm generates @b prefixes (@b not suffixes!)
+ * with growing length and with the mathematical property of being
+ * "prefix-free". The latter means the generated prefixes can be prepended
+ * in front of arbitrary numbers and the resulting concatenated numbers are
+ * guaranteed to be always unique.
+ *
+ * This is a minor adjustment to the original Exp. Golomb algorithm in the
+ * sense that lowest allowed index (@param n) starts with 1, not with zero.
+ *
+ * @param n - natural number (or index) of the prefix to be generated
+ *            (1, 2, 3, ...)
+ * @param k - parameter k of Exp. Golomb algorithm to be used
+ *            (see comment on EXP_GOLOMB_K macro for details about k)
+ */
+static VariLenAffix expGolombEncode(uint64_t n, int k)
+{
+    const uint64_t value = n + (1 << k) - 1;
+    const int bits = (int) log2(value) + 1;
+    return (VariLenAffix) {
+        .type = AffixType_Prefix,
+        .value = value,
+        .bits = bits + MAX((bits - 1 - k), 0)
+    };
+}
+
+/**
+ * @brief Converts a suffix into a prefix, or a prefix into a suffix.
+ *
+ * Simply mirror all bits of the affix value, for the purpose to preserve
+ * respectively the mathematical "prefix-free" or "suffix-free" property
+ * after the conversion.
+ *
+ * If a passed prefix is suitable to create unique numbers, then the
+ * returned suffix is suitable to create unique numbers as well (and vice
+ * versa).
+ */
+static VariLenAffix invertAffix(const VariLenAffix *affix)
+{
+    return (VariLenAffix) {
+        .type =
+            (affix->type == AffixType_Suffix) ?
+                AffixType_Prefix : AffixType_Suffix,
+        .value =
+            mirror64bit(affix->value) >>
+            ((sizeof(affix->value) * 8) - affix->bits),
+        .bits = affix->bits
+    };
+}
+
+/**
+ * @brief Generates suffix numbers with "suffix-free" property.
+ *
+ * This is just a wrapper function on top of the Exp. Golomb algorithm.
+ *
+ * Since the Exp. Golomb algorithm generates prefixes, but we need suffixes,
+ * this function converts the Exp. Golomb prefixes into appropriate suffixes
+ * which are still suitable for generating unique numbers.
+ *
+ * @param n - natural number (or index) of the suffix to be generated
+ *            (1, 2, 3, ...)
+ */
+static VariLenAffix affixForIndex(uint64_t index)
+{
+    VariLenAffix prefix;
+    prefix = expGolombEncode(index, EXP_GOLOMB_K);
+    return invertAffix(&prefix); /* convert prefix to suffix */
+}
+
+/* creative abuse of tb_hash_func7, which is based on xxhash */
+static uint32_t qpp_hash(QppEntry e)
+{
+    return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
+}
+
+static uint32_t qpf_hash(QpfEntry e)
+{
+    return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
+}
+
+static bool qpd_cmp_func(const void *obj, const void *userp)
+{
+    const QpdEntry *e1 = obj, *e2 = userp;
+    return e1->dev == e2->dev;
+}
+
+static bool qpp_cmp_func(const void *obj, const void *userp)
+{
+    const QppEntry *e1 = obj, *e2 = userp;
+    return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
+}
+
+static bool qpf_cmp_func(const void *obj, const void *userp)
+{
+    const QpfEntry *e1 = obj, *e2 = userp;
+    return e1->dev == e2->dev && e1->ino == e2->ino;
+}
+
+static void qp_table_remove(void *p, uint32_t h, void *up)
+{
+    g_free(p);
+}
+
+static void qp_table_destroy(struct qht *ht)
+{
+    if (!ht || !ht->map) {
+        return;
+    }
+    qht_iter(ht, qp_table_remove, NULL);
+    qht_destroy(ht);
+}
+
+static void qpd_table_init(struct qht *ht)
+{
+    qht_init(ht, qpd_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
+}
+
+static void qpp_table_init(struct qht *ht)
+{
+    qht_init(ht, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
+}
+
+static void qpf_table_init(struct qht *ht)
+{
+    qht_init(ht, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
+}
+
+/*
+ * Returns how many (high end) bits of inode numbers of the passed fs
+ * device shall be used (in combination with the device number) to
+ * generate hash values for qpp_table entries.
+ *
+ * This function is required if variable length suffixes are used for inode
+ * number mapping on guest level. Since a device may end up having multiple
+ * entries in qpp_table, each entry most probably with a different suffix
+ * length, we thus need this function in conjunction with qpd_table to
+ * "agree" about a fix amount of bits (per device) to be always used for
+ * generating hash values for the purpose of accessing qpp_table in order
+ * get consistent behaviour when accessing qpp_table.
+ */
+static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
+{
+    QpdEntry lookup = {
+        .dev = dev
+    }, *val;
+    uint32_t hash = dev;
+    VariLenAffix affix;
+
+    val = qht_lookup(&pdu->s->qpd_table, &lookup, hash);
+    if (!val) {
+        val = g_malloc0(sizeof(QpdEntry));
+        *val = lookup;
+        affix = affixForIndex(pdu->s->qp_affix_next);
+        val->prefix_bits = affix.bits;
+        qht_insert(&pdu->s->qpd_table, val, hash, NULL);
+        pdu->s->qp_ndevices++;
+    }
+    return val->prefix_bits;
+}
+
+/**
+ * @brief Slow / full mapping host inode nr -> guest inode nr.
+ *
+ * This function performs a slower and much more costly remapping of an
+ * original file inode number on host to an appropriate different inode
+ * number on guest. For every (dev, inode) combination on host a new
+ * sequential number is generated, cached and exposed as inode number on
+ * guest.
+ *
+ * This is just a "last resort" fallback solution if the much faster/cheaper
+ * qid_path_suffixmap() failed. In practice this slow / full mapping is not
+ * expected ever to be used at all though.
+ *
+ * @see qid_path_suffixmap() for details
+ *
+ */
+static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
+                            uint64_t *path)
+{
+    QpfEntry lookup = {
+        .dev = stbuf->st_dev,
+        .ino = stbuf->st_ino
+    }, *val;
+    uint32_t hash = qpf_hash(lookup);
+    VariLenAffix affix;
+
+    val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
+
+    if (!val) {
+        if (pdu->s->qp_fullpath_next == 0) {
+            /* no more files can be mapped :'( */
+            error_report_once(
+                "9p: No more prefixes available for remapping inodes from "
+                "host to guest."
+            );
+            return -ENFILE;
+        }
+
+        val = g_malloc0(sizeof(QppEntry));
+        *val = lookup;
+
+        /* new unique inode and device combo */
+        affix = affixForIndex(
+            1ULL << (sizeof(pdu->s->qp_affix_next) * 8)
+        );
+        val->path = (pdu->s->qp_fullpath_next++ << affix.bits) | affix.value;
+        pdu->s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1);
+        qht_insert(&pdu->s->qpf_table, val, hash, NULL);
+    }
+
+    *path = val->path;
+    return 0;
+}
+
+/**
+ * @brief Quick mapping host inode nr -> guest inode nr.
+ *
+ * This function performs quick remapping of an original file inode number
+ * on host to an appropriate different inode number on guest. This remapping
+ * of inodes is required to avoid inode nr collisions on guest which would
+ * happen if the 9p export contains more than 1 exported file system (or
+ * more than 1 file system data set), because unlike on host level where the
+ * files would have different device nrs, all files exported by 9p would
+ * share the same device nr on guest (the device nr of the virtual 9p device
+ * that is).
+ *
+ * Inode remapping is performed by chopping off high end bits of the original
+ * inode number from host, shifting the result upwards and then assigning a
+ * generated suffix number for the low end bits, where the same suffix number
+ * will be shared by all inodes with the same device id AND the same high end
+ * bits that have been chopped off. That approach utilizes the fact that inode
+ * numbers very likely share the same high end bits (i.e. due to their common
+ * sequential generation by file systems) and hence we only have to generate
+ * and track a very limited amount of suffixes in practice due to that.
+ *
+ * We generate variable size suffixes for that purpose. The 1st generated
+ * suffix will only have 1 bit and hence we only need to chop off 1 bit from
+ * the original inode number. The subsequent suffixes being generated will
+ * grow in (bit) size subsequently, i.e. the 2nd and 3rd suffix being
+ * generated will have 3 bits and hence we have to chop off 3 bits from their
+ * original inodes, and so on. That approach of using variable length suffixes
+ * (i.e. over fixed size ones) utilizes the fact that in practice only a very
+ * limited amount of devices are shared by the same export (e.g. typically
+ * less than 2 dozen devices per 9p export), so in practice we need to chop
+ * off less bits than with fixed size prefixes and yet are flexible to add
+ * new devices at runtime below host's export directory at any time without
+ * having to reboot guest nor requiring to reconfigure guest for that. And due
+ * to the very limited amount of original high end bits that we chop off that
+ * way, the total amount of suffixes we need to generate is less than by using
+ * fixed size prefixes and hence it also improves performance of the inode
+ * remapping algorithm, and finally has the nice side effect that the inode
+ * numbers on guest will be much smaller & human friendly. ;-)
+ */
+static int qid_path_suffixmap(V9fsPDU *pdu, const struct stat *stbuf,
+                              uint64_t *path)
+{
+    const int ino_hash_bits = qid_inode_prefix_hash_bits(pdu, stbuf->st_dev);
+    QppEntry lookup = {
+        .dev = stbuf->st_dev,
+        .ino_prefix = (uint16_t) (stbuf->st_ino >> (64 - ino_hash_bits))
+    }, *val;
+    uint32_t hash = qpp_hash(lookup);
+
+    val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
+
+    if (!val) {
+        if (pdu->s->qp_affix_next == 0) {
+            /* we ran out of affixes */
+            warn_report_once(
+                "9p: Potential degraded performance of inode remapping"
+            );
+            return -ENFILE;
+        }
+
+        val = g_malloc0(sizeof(QppEntry));
+        *val = lookup;
+
+        /* new unique inode affix and device combo */
+        val->qp_affix_index = pdu->s->qp_affix_next++;
+        val->qp_affix = affixForIndex(val->qp_affix_index);
+        qht_insert(&pdu->s->qpp_table, val, hash, NULL);
+    }
+    /* assuming generated affix to be suffix type, not prefix */
+    *path = (stbuf->st_ino << val->qp_affix.bits) | val->qp_affix.value;
+    return 0;
+}
+
+static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
+{
+    int err;
      size_t size;
  
-    memset(&qidp->path, 0, sizeof(qidp->path));
-    size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
-    memcpy(&qidp->path, &stbuf->st_ino, size);
+    if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
+        /* map inode+device to qid path (fast path) */
+        err = qid_path_suffixmap(pdu, stbuf, &qidp->path);
+        if (err == -ENFILE) {
+            /* fast path didn't work, fall back to full map */
+            err = qid_path_fullmap(pdu, stbuf, &qidp->path);
+        }
+        if (err) {
+            return err;
+        }
+    } else {
+        if (pdu->s->dev_id != stbuf->st_dev) {
+            if (pdu->s->ctx.export_flags & V9FS_FORBID_MULTIDEVS) {
+                error_report_once(
+                    "9p: Multiple devices detected in same VirtFS export. "
+                    "Access of guest to additional devices is (partly) "
+                    "denied due to virtfs option 'multidevs=forbid' being "
+                    "effective."
+                );
+                return -ENODEV;
+            } else {
+                warn_report_once(
+                    "9p: Multiple devices detected in same VirtFS export, "
+                    "which might lead to file ID collisions and severe "
+                    "misbehaviours on guest! You should either use a "
+                    "separate export for each device shared from host or "
+                    "use virtfs option 'multidevs=remap'!"
+                );
+            }
+        }
+        memset(&qidp->path, 0, sizeof(qidp->path));
+        size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
+        memcpy(&qidp->path, &stbuf->st_ino, size);
+    }
+
      qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
      qidp->type = 0;
      if (S_ISDIR(stbuf->st_mode)) {
@@ -587,6 +951,8 @@ static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
      if (S_ISLNK(stbuf->st_mode)) {
          qidp->type |= P9_QID_TYPE_SYMLINK;
      }
+
+    return 0;
  }
  
  static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
@@ -599,7 +965,10 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
      if (err < 0) {
          return err;
      }
-    stat_to_qid(&stbuf, qidp);
+    err = stat_to_qid(pdu, &stbuf, qidp);
+    if (err < 0) {
+        return err;
+    }
      return 0;
  }
  
@@ -743,9 +1112,9 @@ static int donttouch_stat(V9fsStat *stat)
  {
      if (stat->type == -1 &&
          stat->dev == -1 &&
-        stat->qid.type == -1 &&
-        stat->qid.version == -1 &&
-        stat->qid.path == -1 &&
+        stat->qid.type == 0xff &&
+        stat->qid.version == (uint32_t) -1 &&
+        stat->qid.path == (uint64_t) -1 &&
          stat->mode == -1 &&
          stat->atime == -1 &&
          stat->mtime == -1 &&
@@ -830,7 +1199,10 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
  
      memset(v9stat, 0, sizeof(*v9stat));
  
-    stat_to_qid(stbuf, &v9stat->qid);
+    err = stat_to_qid(pdu, stbuf, &v9stat->qid);
+    if (err < 0) {
+        return err;
+    }
      v9stat->mode = stat_to_v9mode(stbuf);
      v9stat->atime = stbuf->st_atime;
      v9stat->mtime = stbuf->st_mtime;
@@ -891,7 +1263,7 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
  #define P9_STATS_ALL           0x00003fffULL /* Mask for All fields above */
  
  
-static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
+static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf,
                                  V9fsStatDotl *v9lstat)
  {
      memset(v9lstat, 0, sizeof(*v9lstat));
@@ -913,7 +1285,7 @@ static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
      /* Currently we only support BASIC fields in stat */
      v9lstat->st_result_mask = P9_STATS_BASIC;
  
-    stat_to_qid(stbuf, &v9lstat->qid);
+    return stat_to_qid(pdu, stbuf, &v9lstat->qid);
  }
  
  static void print_sg(struct iovec *sg, int cnt)
@@ -968,8 +1340,29 @@ static void coroutine_fn v9fs_version(void *opaque)
          s->proto_version = V9FS_PROTO_2000L;
      } else {
          v9fs_string_sprintf(&version, "unknown");
+        /* skip min. msize check, reporting invalid version has priority */
+        goto marshal;
+    }
+
+    if (s->msize < P9_MIN_MSIZE) {
+        err = -EMSGSIZE;
+        error_report(
+            "9pfs: Client requested msize < minimum msize ("
+            stringify(P9_MIN_MSIZE) ") supported by this server."
+        );
+        goto out;
      }
  
+    /* 8192 is the default msize of Linux clients */
+    if (s->msize <= 8192 && !(s->ctx.export_flags & V9FS_NO_PERF_WARN)) {
+        warn_report_once(
+            "9p: degraded performance: a reasonable high msize should be "
+            "chosen on client/guest side (chosen msize is <= 8192). See "
+            "https://wiki.qemu.org/Documentation/9psetup#msize for details."
+        );
+    }
+
+marshal:
      err = pdu_marshal(pdu, offset, "ds", s->msize, &version);
      if (err < 0) {
          goto out;
@@ -991,7 +1384,6 @@ static void coroutine_fn v9fs_attach(void *opaque)
      size_t offset = 7;
      V9fsQID qid;
      ssize_t err;
-    Error *local_err = NULL;
  
      v9fs_string_init(&uname);
      v9fs_string_init(&aname);
@@ -1029,9 +1421,8 @@ static void coroutine_fn v9fs_attach(void *opaque)
          error_setg(&s->migration_blocker,
                     "Migration is disabled when VirtFS export path '%s' is mounted in the guest using mount_tag '%s'",
                     s->ctx.fs_root ? s->ctx.fs_root : "NULL", s->tag);
-        err = migrate_add_blocker(s->migration_blocker, &local_err);
-        if (local_err) {
-            error_free(local_err);
+        err = migrate_add_blocker(s->migration_blocker, NULL);
+        if (err < 0) {
              error_free(s->migration_blocker);
              s->migration_blocker = NULL;
              clunk_fid(s, fid);
@@ -1115,7 +1506,6 @@ static void coroutine_fn v9fs_getattr(void *opaque)
      uint64_t request_mask;
      V9fsStatDotl v9stat_dotl;
      V9fsPDU *pdu = opaque;
-    V9fsState *s = pdu->s;
  
      retval = pdu_unmarshal(pdu, offset, "dq", &fid, &request_mask);
      if (retval < 0) {
@@ -1136,7 +1526,10 @@ static void coroutine_fn v9fs_getattr(void *opaque)
      if (retval < 0) {
          goto out;
      }
-    stat_to_v9stat_dotl(s, &stbuf, &v9stat_dotl);
+    retval = stat_to_v9stat_dotl(pdu, &stbuf, &v9stat_dotl);
+    if (retval < 0) {
+        goto out;
+    }
  
      /*  fill st_gen if requested and supported by underlying fs */
      if (request_mask & P9_STATS_GEN) {
@@ -1381,7 +1774,10 @@ static void coroutine_fn v9fs_walk(void *opaque)
              if (err < 0) {
                  goto out;
              }
-            stat_to_qid(&stbuf, &qid);
+            err = stat_to_qid(pdu, &stbuf, &qid);
+            if (err < 0) {
+                goto out;
+            }
              v9fs_path_copy(&dpath, &path);
          }
          memcpy(&qids[name_idx], &qid, sizeof(qid));
@@ -1434,8 +1830,10 @@ static int32_t coroutine_fn get_iounit(V9fsPDU *pdu, V9fsPath *path)
       * and as well as less than (client msize - P9_IOHDRSZ))
       */
      if (!v9fs_co_statfs(pdu, path, &stbuf)) {
-        iounit = stbuf.f_bsize;
-        iounit *= (s->msize - P9_IOHDRSZ)/stbuf.f_bsize;
+        if (stbuf.f_bsize) {
+            iounit = stbuf.f_bsize;
+            iounit *= (s->msize - P9_IOHDRSZ) / stbuf.f_bsize;
+        }
      }
      if (!iounit) {
          iounit = s->msize - P9_IOHDRSZ;
@@ -1483,7 +1881,10 @@ static void coroutine_fn v9fs_open(void *opaque)
      if (err < 0) {
          goto out;
      }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
      if (S_ISDIR(stbuf.st_mode)) {
          err = v9fs_co_opendir(pdu, fidp);
          if (err < 0) {
@@ -1593,7 +1994,10 @@ static void coroutine_fn v9fs_lcreate(void *opaque)
          fidp->flags |= FID_NON_RECLAIMABLE;
      }
      iounit =  get_iounit(pdu, &fidp->path);
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
      err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
      if (err < 0) {
          goto out;
@@ -1833,7 +2237,14 @@ static void coroutine_fn v9fs_read(void *opaque)
          goto out_nofid;
      }
      if (fidp->fid_type == P9_FID_DIR) {
-
+        if (s->proto_version != V9FS_PROTO_2000U) {
+            warn_report_once(
+                "9p: bad client: T_read request on directory only expected "
+                "with 9P2000.u protocol version"
+            );
+            err = -EOPNOTSUPP;
+            goto out;
+        }
          if (off == 0) {
              v9fs_co_rewinddir(pdu, fidp);
          }
@@ -1894,7 +2305,13 @@ out_nofid:
      pdu_complete(pdu, err);
  }
  
-static size_t v9fs_readdir_data_size(V9fsString *name)
+/**
+ * Returns size required in Rreaddir response for the passed dirent @p name.
+ *
+ * @param name - directory entry's name (i.e. file name, directory name)
+ * @returns required size in bytes
+ */
+size_t v9fs_readdir_response_size(V9fsString *name)
  {
      /*
       * Size of each dirent on the wire: size of qid (13) + size of offset (8)
@@ -1903,70 +2320,106 @@ static size_t v9fs_readdir_data_size(V9fsString *name)
      return 24 + v9fs_string_size(name);
  }
  
+static void v9fs_free_dirents(struct V9fsDirEnt *e)
+{
+    struct V9fsDirEnt *next = NULL;
+
+    for (; e; e = next) {
+        next = e->next;
+        g_free(e->dent);
+        g_free(e->st);
+        g_free(e);
+    }
+}
+
  static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
-                                        int32_t max_count)
+                                        off_t offset, int32_t max_count)
  {
      size_t size;
      V9fsQID qid;
      V9fsString name;
      int len, err = 0;
      int32_t count = 0;
-    off_t saved_dir_pos;
      struct dirent *dent;
+    struct stat *st;
+    struct V9fsDirEnt *entries = NULL;
  
-    /* save the directory position */
-    saved_dir_pos = v9fs_co_telldir(pdu, fidp);
-    if (saved_dir_pos < 0) {
-        return saved_dir_pos;
+    /*
+     * inode remapping requires the device id, which in turn might be
+     * different for different directory entries, so if inode remapping is
+     * enabled we have to make a full stat for each directory entry
+     */
+    const bool dostat = pdu->s->ctx.export_flags & V9FS_REMAP_INODES;
+
+    /*
+     * Fetch all required directory entries altogether on a background IO
+     * thread from fs driver. We don't want to do that for each entry
+     * individually, because hopping between threads (this main IO thread
+     * and background IO driver thread) would sum up to huge latencies.
+     */
+    count = v9fs_co_readdir_many(pdu, fidp, &entries, offset, max_count,
+                                 dostat);
+    if (count < 0) {
+        err = count;
+        count = 0;
+        goto out;
      }
+    count = 0;
  
-    while (1) {
-        v9fs_readdir_lock(&fidp->fs.dir);
+    for (struct V9fsDirEnt *e = entries; e; e = e->next) {
+        dent = e->dent;
  
-        err = v9fs_co_readdir(pdu, fidp, &dent);
-        if (err || !dent) {
-            break;
+        if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
+            st = e->st;
+            /* e->st should never be NULL, but just to be sure */
+            if (!st) {
+                err = -1;
+                break;
+            }
+
+            /* remap inode */
+            err = stat_to_qid(pdu, st, &qid);
+            if (err < 0) {
+                break;
+            }
+        } else {
+            /*
+             * Fill up just the path field of qid because the client uses
+             * only that. To fill the entire qid structure we will have
+             * to stat each dirent found, which is expensive. For the
+             * latter reason we don't call stat_to_qid() here. Only drawback
+             * is that no multi-device export detection of stat_to_qid()
+             * would be done and provided as error to the user here. But
+             * user would get that error anyway when accessing those
+             * files/dirs through other ways.
+             */
+            size = MIN(sizeof(dent->d_ino), sizeof(qid.path));
+            memcpy(&qid.path, &dent->d_ino, size);
+            /* Fill the other fields with dummy values */
+            qid.type = 0;
+            qid.version = 0;
          }
+
          v9fs_string_init(&name);
          v9fs_string_sprintf(&name, "%s", dent->d_name);
-        if ((count + v9fs_readdir_data_size(&name)) > max_count) {
-            v9fs_readdir_unlock(&fidp->fs.dir);
-
-            /* Ran out of buffer. Set dir back to old position and return */
-            v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
-            v9fs_string_free(&name);
-            return count;
-        }
-        /*
-         * Fill up just the path field of qid because the client uses
-         * only that. To fill the entire qid structure we will have
-         * to stat each dirent found, which is expensive
-         */
-        size = MIN(sizeof(dent->d_ino), sizeof(qid.path));
-        memcpy(&qid.path, &dent->d_ino, size);
-        /* Fill the other fields with dummy values */
-        qid.type = 0;
-        qid.version = 0;
  
          /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */
          len = pdu_marshal(pdu, 11 + count, "Qqbs",
                            &qid, dent->d_off,
                            dent->d_type, &name);
  
-        v9fs_readdir_unlock(&fidp->fs.dir);
+        v9fs_string_free(&name);
  
          if (len < 0) {
-            v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
-            v9fs_string_free(&name);
-            return len;
+            err = len;
+            break;
          }
+
          count += len;
-        v9fs_string_free(&name);
-        saved_dir_pos = dent->d_off;
      }
  
-    v9fs_readdir_unlock(&fidp->fs.dir);
-
+out:
+    v9fs_free_dirents(entries);
      if (err < 0) {
          return err;
      }
@@ -1983,6 +2436,7 @@ static void coroutine_fn v9fs_readdir(void *opaque)
      int32_t count;
      uint32_t max_count;
      V9fsPDU *pdu = opaque;
+    V9fsState *s = pdu->s;
  
      retval = pdu_unmarshal(pdu, offset, "dqd", &fid,
                             &initial_offset, &max_count);
@@ -1991,6 +2445,14 @@ static void coroutine_fn v9fs_readdir(void *opaque)
      }
      trace_v9fs_readdir(pdu->tag, pdu->id, fid, initial_offset, max_count);
  
+    /* Enough space for a R_readdir header: size[4] Rreaddir tag[2] count[4] */
+    if (max_count > s->msize - 11) {
+        max_count = s->msize - 11;
+        warn_report_once(
+            "9p: bad client: T_readdir with count > msize - 11"
+        );
+    }
+
      fidp = get_fid(pdu, fid);
      if (fidp == NULL) {
          retval = -EINVAL;
@@ -2000,12 +2462,15 @@ static void coroutine_fn v9fs_readdir(void *opaque)
          retval = -EINVAL;
          goto out;
      }
-    if (initial_offset == 0) {
-        v9fs_co_rewinddir(pdu, fidp);
-    } else {
-        v9fs_co_seekdir(pdu, fidp, initial_offset);
+    if (s->proto_version != V9FS_PROTO_2000L) {
+        warn_report_once(
+            "9p: bad client: T_readdir request only expected with 9P2000.L "
+            "protocol version"
+        );
+        retval = -EOPNOTSUPP;
+        goto out;
      }
-    count = v9fs_do_readdir(pdu, fidp, max_count);
+    count = v9fs_do_readdir(pdu, fidp, (off_t) initial_offset, max_count);
      if (count < 0) {
          retval = count;
          goto out;
@@ -2033,8 +2498,7 @@ static int v9fs_xattr_write(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
  
  
      if (fidp->fs.xattr.len < off) {
-        err = -ENOSPC;
-        goto out;
+        return -ENOSPC;
      }
      write_count = fidp->fs.xattr.len - off;
      if (write_count > count) {
@@ -2060,7 +2524,7 @@ static int v9fs_xattr_write(V9fsState *s, V9fsPDU *pdu, V9fsFidState *fidp,
          off += to_copy;
          write_count -= to_copy;
      }
-out:
+
      return err;
  }
  
@@ -2327,7 +2791,10 @@ static void coroutine_fn v9fs_create(void *opaque)
          }
      }
      iounit = get_iounit(pdu, &fidp->path);
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
      err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
      if (err < 0) {
          goto out;
@@ -2384,7 +2851,10 @@ static void coroutine_fn v9fs_symlink(void *opaque)
      if (err < 0) {
          goto out;
      }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
      err =  pdu_marshal(pdu, offset, "Q", &qid);
      if (err < 0) {
          goto out;
@@ -2619,8 +3089,7 @@ static int coroutine_fn v9fs_complete_rename(V9fsPDU *pdu, V9fsFidState *fidp,
      if (newdirfid != -1) {
          dirfidp = get_fid(pdu, newdirfid);
          if (dirfidp == NULL) {
-            err = -ENOENT;
-            goto out_nofid;
+            return -ENOENT;
          }
          if (fidp->fid_type != P9_FID_NONE) {
              err = -EINVAL;
@@ -2663,7 +3132,6 @@ out:
          put_fid(pdu, dirfidp);
      }
      v9fs_path_free(&new_path);
-out_nofid:
      return err;
  }
  
@@ -3064,7 +3532,10 @@ static void coroutine_fn v9fs_mknod(void *opaque)
      if (err < 0) {
          goto out;
      }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
      err = pdu_marshal(pdu, offset, "Q", &qid);
      if (err < 0) {
          goto out;
@@ -3222,7 +3693,10 @@ static void coroutine_fn v9fs_mkdir(void *opaque)
      if (err < 0) {
          goto out;
      }
-    stat_to_qid(&stbuf, &qid);
+    err = stat_to_qid(pdu, &stbuf, &qid);
+    if (err < 0) {
+        goto out;
+    }
      err = pdu_marshal(pdu, offset, "Q", &qid);
      if (err < 0) {
          goto out;
@@ -3551,6 +4025,7 @@ void pdu_submit(V9fsPDU *pdu, P9MsgHeader *hdr)
  int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
                                 Error **errp)
  {
+    ERRP_GUARD();
      int i, len;
      struct stat stat;
      FsDriverEntry *fse;
@@ -3633,31 +4108,43 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
          goto out;
      }
  
+    s->dev_id = stat.st_dev;
+
+    /* init inode remapping : */
+    /* hash table for variable length inode suffixes */
+    qpd_table_init(&s->qpd_table);
+    /* hash table for slow/full inode remapping (most users won't need it) */
+    qpf_table_init(&s->qpf_table);
+    /* hash table for quick inode remapping */
+    qpp_table_init(&s->qpp_table);
+    s->qp_ndevices = 0;
+    s->qp_affix_next = 1; /* reserve 0 to detect overflow */
+    s->qp_fullpath_next = 1;
+
      s->ctx.fst = &fse->fst;
      fsdev_throttle_init(s->ctx.fst);
  
-    v9fs_path_free(&path);
-
      rc = 0;
  out:
      if (rc) {
-        if (s->ops && s->ops->cleanup && s->ctx.private) {
-            s->ops->cleanup(&s->ctx);
-        }
-        g_free(s->tag);
-        g_free(s->ctx.fs_root);
-        v9fs_path_free(&path);
+        v9fs_device_unrealize_common(s);
      }
+    v9fs_path_free(&path);
      return rc;
  }
  
-void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
+void v9fs_device_unrealize_common(V9fsState *s)
  {
-    if (s->ops->cleanup) {
+    if (s->ops && s->ops->cleanup) {
          s->ops->cleanup(&s->ctx);
      }
-    fsdev_throttle_cleanup(s->ctx.fst);
+    if (s->ctx.fst) {
+        fsdev_throttle_cleanup(s->ctx.fst);
+    }
      g_free(s->tag);
+    qp_table_destroy(&s->qpd_table);
+    qp_table_destroy(&s->qpp_table);
+    qp_table_destroy(&s->qpf_table);
      g_free(s->ctx.fs_root);
  }