Merge tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 30 Jul 2016 23:33:25 +0000 (16:33 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 30 Jul 2016 23:33:25 +0000 (16:33 -0700)
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable bugfixes:
   - nfs: don't create zero-length requests

   - several LAYOUTGET bugfixes

  Features:
   - several performance related features

   - more aggressive caching when we can rely on close-to-open
     cache consistency

   - remove serialisation of O_DIRECT reads and writes

   - optimise several code paths to not flush to disk unnecessarily.

     However, allow for the idiosyncrasies of pNFS for those layout
     types that need to issue a LAYOUTCOMMIT before the metadata can
     be updated on the server.

   - SUNRPC updates to the client data receive path

   - pNFS/SCSI support RH/Fedora dm-mpath device nodes

   - pNFS files/flexfiles can now use unprivileged ports when
     the generic NFS mount options allow it.

  Bugfixes:
   - Don't use RDMA direct data placement together with data
     integrity or privacy security flavours

   - Remove the RDMA ALLPHYSICAL memory registration mode as
     it has potential security holes.

   - Several layout recall fixes to improve NFSv4.1 protocol
     compliance.

   - Fix an Oops in the pNFS files and flexfiles connection
     setup to the DS

   - Allow retry of operations that used a returned delegation
     stateid

   - Don't mark the inode as revalidated if a LAYOUTCOMMIT is
     outstanding

   - Fix writeback races in nfs4_copy_range() and
     nfs42_proc_deallocate()"
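
The O_DIRECT de-serialisation above is backed by a new fs/nfs/io.c (listed as a new file below, but its contents are not among the hunks on this page). As a rough sketch of the idea only -- assuming the inode->i_rwsem and the NFS_INO_ODIRECT inode flag that the fs/nfs/file.c and fs/nfs/direct.c hunks rely on -- the helpers could look like this; the bodies are illustrative, not the merged source:

/*
 * Illustrative sketch only -- not the merged fs/nfs/io.c.
 * Same-style I/O (all buffered, or all O_DIRECT) shares inode->i_rwsem;
 * switching styles takes it exclusively and drains the other side.
 */
#include <linux/fs.h>
#include <linux/nfs_fs.h>

/* Caller holds inode->i_rwsem exclusively. */
static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
{
	if (test_and_clear_bit(NFS_INO_ODIRECT, &nfsi->flags))
		inode_dio_wait(inode);
}

void nfs_start_io_read(struct inode *inode)
{
	struct nfs_inode *nfsi = NFS_I(inode);

	/* Optimistic fast path: no O_DIRECT in flight. */
	down_read(&inode->i_rwsem);
	if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
		return;
	up_read(&inode->i_rwsem);

	/* Slow path: wait for O_DIRECT to drain, then share the lock. */
	down_write(&inode->i_rwsem);
	nfs_block_o_direct(nfsi, inode);
	downgrade_write(&inode->i_rwsem);
}

void nfs_end_io_read(struct inode *inode)
{
	up_read(&inode->i_rwsem);
}

void nfs_start_io_direct(struct inode *inode)
{
	struct nfs_inode *nfsi = NFS_I(inode);

	/* Exclude buffered I/O, flush dirty pages once on the transition,
	 * mark the inode as doing O_DIRECT, then downgrade so concurrent
	 * O_DIRECT requests can proceed under the shared lock. */
	down_write(&inode->i_rwsem);
	if (!test_and_set_bit(NFS_INO_ODIRECT, &nfsi->flags))
		nfs_sync_mapping(inode->i_mapping);
	downgrade_write(&inode->i_rwsem);
}

void nfs_end_io_direct(struct inode *inode)
{
	up_read(&inode->i_rwsem);
}

The callers visible in the hunks below (nfs_file_read, nfs_file_splice_read, nfs_file_write, nfs_file_direct_read, nfs_file_direct_write) bracket their I/O with these helpers instead of taking inode_lock(); the write-side helpers would follow the same pattern. That is what allows O_DIRECT requests to run in parallel while still excluding concurrent buffered I/O.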

* tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (104 commits)
  pNFS: Actively set attributes as invalid if LAYOUTCOMMIT is outstanding
  NFSv4: Clean up lookup of SECINFO_NO_NAME
  NFSv4.2: Fix warning "variable ‘stateids’ set but not used"
  NFSv4: Fix warning "no previous prototype for ‘nfs4_listxattr’"
  SUNRPC: Fix a compiler warning in fs/nfs/clnt.c
  pNFS: Remove redundant smp_mb() from pnfs_init_lseg()
  pNFS: Cleanup - do layout segment initialisation in one place
  pNFS: Remove redundant stateid invalidation
  pNFS: Remove redundant pnfs_mark_layout_returned_if_empty()
  pNFS: Clear the layout metadata if the server changed the layout stateid
  pNFS: Cleanup - don't open code pnfs_mark_layout_stateid_invalid()
  NFS: pnfs_mark_matching_lsegs_return() should match the layout sequence id
  pNFS: Do not set plh_return_seq for non-callback related layoutreturns
  pNFS: Ensure layoutreturn acts as a completion for layout callbacks
  pNFS: Fix CB_LAYOUTRECALL stateid verification
  pNFS: Always update the layout barrier seqid on LAYOUTGET
  pNFS: Always update the layout stateid if NFS_LAYOUT_INVALID_STID is set
  pNFS: Clear the layout return tracking on layout reinitialisation
  pNFS: LAYOUTRETURN should only update the stateid if the layout is valid
  nfs: don't create zero-length requests
  ...

55 files changed:
fs/nfs/Makefile
fs/nfs/blocklayout/dev.c
fs/nfs/blocklayout/extent_tree.c
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/client.c
fs/nfs/dir.c
fs/nfs/direct.c
fs/nfs/file.c
fs/nfs/filelayout/filelayout.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/io.c [new file with mode: 0644]
fs/nfs/nfs3client.c
fs/nfs/nfs42proc.c
fs/nfs/nfs42xdr.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4client.c
fs/nfs/nfs4file.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4xdr.c
fs/nfs/nfstrace.h
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/pnfs_nfs.c
fs/nfs/super.c
fs/nfs/write.c
include/linux/nfs_fs.h
include/linux/nfs_xdr.h
include/linux/sunrpc/auth.h
include/linux/sunrpc/gss_api.h
include/linux/sunrpc/sched.h
include/linux/sunrpc/xprtsock.h
net/sunrpc/auth.c
net/sunrpc/auth_generic.c
net/sunrpc/auth_gss/auth_gss.c
net/sunrpc/auth_gss/gss_krb5_mech.c
net/sunrpc/auth_gss/gss_mech_switch.c
net/sunrpc/auth_null.c
net/sunrpc/auth_unix.c
net/sunrpc/clnt.c
net/sunrpc/sched.c
net/sunrpc/svc.c
net/sunrpc/xprt.c
net/sunrpc/xprtmultipath.c
net/sunrpc/xprtrdma/Makefile
net/sunrpc/xprtrdma/fmr_ops.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/physical_ops.c [deleted file]
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c

index 8664417955a2730b0813a2984b2fe4689a38e3ce..6abdda209642e70d917308282d82ec2cfa0f353b 100644 (file)
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
 
 CFLAGS_nfstrace.o += -I$(src)
 nfs-y                  := client.o dir.o file.o getroot.o inode.o super.o \
-                          direct.o pagelist.o read.o symlink.o unlink.o \
+                          io.o direct.o pagelist.o read.o symlink.o unlink.o \
                           write.o namespace.o mount_clnt.o nfstrace.o
 nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
 nfs-$(CONFIG_SYSCTL)   += sysctl.o
index e5b89675263efffc0629dba595782248ade72715..a69ef4e9c24c7ed411cf54bd9b1ba78a371010f2 100644 (file)
@@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
                if (!p)
                        return -EIO;
                b->simple.nr_sigs = be32_to_cpup(p++);
-               if (!b->simple.nr_sigs) {
-                       dprintk("no signature\n");
+               if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
+                       dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
                        return -EIO;
                }
 
@@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
                        memcpy(&b->simple.sigs[i].sig, p,
                                b->simple.sigs[i].sig_len);
 
-                       b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+                       b->simple.len += 8 + 4 + \
+                               (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
                }
                break;
        case PNFS_BLOCK_VOLUME_SLICE:
@@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
                p = xdr_inline_decode(xdr, 4);
                if (!p)
                        return -EIO;
+
                b->concat.volumes_count = be32_to_cpup(p++);
+               if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+                       dprintk("Too many volumes: %d\n", b->concat.volumes_count);
+                       return -EIO;
+               }
 
                p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
                if (!p)
@@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
                p = xdr_inline_decode(xdr, 8 + 4);
                if (!p)
                        return -EIO;
+
                p = xdr_decode_hyper(p, &b->stripe.chunk_size);
                b->stripe.volumes_count = be32_to_cpup(p++);
+               if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+                       dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
+                       return -EIO;
+               }
 
                p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
                if (!p)
@@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
                struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
        struct pnfs_block_volume *v = &volumes[idx];
+       struct block_device *bdev;
        dev_t dev;
 
        dev = bl_resolve_deviceid(server, v, gfp_mask);
        if (!dev)
                return -EIO;
 
-       d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
-       if (IS_ERR(d->bdev)) {
+       bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
+       if (IS_ERR(bdev)) {
                printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
-                       MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
-               return PTR_ERR(d->bdev);
+                       MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
+               return PTR_ERR(bdev);
        }
+       d->bdev = bdev;
 
 
        d->len = i_size_read(d->bdev->bd_inode);
@@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v)
        }
 }
 
+/*
+ * Try to open the udev path for the WWN.  At least on Debian the udev
+ * by-id path will always point to the dm-multipath device if one exists.
+ */
+static struct block_device *
+bl_open_udev_path(struct pnfs_block_volume *v)
+{
+       struct block_device *bdev;
+       const char *devname;
+
+       devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
+                               v->scsi.designator_len, v->scsi.designator);
+       if (!devname)
+               return ERR_PTR(-ENOMEM);
+
+       bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
+       if (IS_ERR(bdev)) {
+               pr_warn("pNFS: failed to open device %s (%ld)\n",
+                       devname, PTR_ERR(bdev));
+       }
+
+       kfree(devname);
+       return bdev;
+}
+
+/*
+ * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the
+ * wwn- links will only point to the first discovered SCSI device there.
+ */
+static struct block_device *
+bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
+{
+       struct block_device *bdev;
+       const char *devname;
+
+       devname = kasprintf(GFP_KERNEL,
+                       "/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
+                       v->scsi.designator_type,
+                       v->scsi.designator_len, v->scsi.designator);
+       if (!devname)
+               return ERR_PTR(-ENOMEM);
+
+       bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
+       kfree(devname);
+       return bdev;
+}
+
 static int
 bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
                struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
        struct pnfs_block_volume *v = &volumes[idx];
+       struct block_device *bdev;
        const struct pr_ops *ops;
-       const char *devname;
        int error;
 
        if (!bl_validate_designator(v))
                return -EINVAL;
 
-       switch (v->scsi.designator_len) {
-       case 8:
-               devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
-                               v->scsi.designator);
-               break;
-       case 12:
-               devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
-                               v->scsi.designator);
-               break;
-       case 16:
-               devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
-                               v->scsi.designator);
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
-       if (IS_ERR(d->bdev)) {
-               pr_warn("pNFS: failed to open device %s (%ld)\n",
-                       devname, PTR_ERR(d->bdev));
-               kfree(devname);
-               return PTR_ERR(d->bdev);
-       }
-
-       kfree(devname);
+       bdev = bl_open_dm_mpath_udev_path(v);
+       if (IS_ERR(bdev))
+               bdev = bl_open_udev_path(v);
+       if (IS_ERR(bdev))
+               return PTR_ERR(bdev);
+       d->bdev = bdev;
 
        d->len = i_size_read(d->bdev->bd_inode);
        d->map = bl_map_simple;
@@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
        return 0;
 
 out_blkdev_put:
-       blkdev_put(d->bdev, FMODE_READ);
+       blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
        return error;
 }
 
index 720b3ff55fa9b31502c42120f721e3365e735cf3..992bcb19c11e744bed4c390e133ea3ebe7cf0916 100644 (file)
@@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
        return be;
 }
 
+static void __ext_put_deviceids(struct list_head *head)
+{
+       struct pnfs_block_extent *be, *tmp;
+
+       list_for_each_entry_safe(be, tmp, head, be_list) {
+               nfs4_put_deviceid_node(be->be_device);
+               kfree(be);
+       }
+}
+
 static void
 __ext_tree_insert(struct rb_root *root,
                struct pnfs_block_extent *new, bool merge_ok)
@@ -163,7 +173,8 @@ free_new:
 }
 
 static int
-__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+__ext_tree_remove(struct rb_root *root,
+               sector_t start, sector_t end, struct list_head *tmp)
 {
        struct pnfs_block_extent *be;
        sector_t len1 = 0, len2 = 0;
@@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
                        struct pnfs_block_extent *next = ext_tree_next(be);
 
                        rb_erase(&be->be_node, root);
-                       nfs4_put_deviceid_node(be->be_device);
-                       kfree(be);
+                       list_add_tail(&be->be_list, tmp);
                        be = next;
                }
 
@@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
                sector_t start, sector_t end)
 {
        int err, err2;
+       LIST_HEAD(tmp);
 
        spin_lock(&bl->bl_ext_lock);
-       err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+       err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
        if (rw) {
-               err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+               err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
                if (!err)
                        err = err2;
        }
        spin_unlock(&bl->bl_ext_lock);
 
+       __ext_put_deviceids(&tmp);
        return err;
 }
 
@@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
        sector_t end = start + len;
        struct pnfs_block_extent *be;
        int err = 0;
+       LIST_HEAD(tmp);
 
        spin_lock(&bl->bl_ext_lock);
        /*
         * First remove all COW extents or holes from written to range.
         */
-       err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+       err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
        if (err)
                goto out;
 
@@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
        }
 out:
        spin_unlock(&bl->bl_ext_lock);
+
+       __ext_put_deviceids(&tmp);
        return err;
 }
 
index aaa2e8d3df6f214d92e8ecd24a48c6e4eb029e69..c92a75e066a6f75828079f422c2546ea73a90415 100644 (file)
@@ -119,27 +119,30 @@ out:
  * hashed by filehandle.
  */
 static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
-               struct nfs_fh *fh, nfs4_stateid *stateid)
+               struct nfs_fh *fh)
 {
        struct nfs_server *server;
+       struct nfs_inode *nfsi;
        struct inode *ino;
        struct pnfs_layout_hdr *lo;
 
+restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                list_for_each_entry(lo, &server->layouts, plh_layouts) {
-                       if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+                       nfsi = NFS_I(lo->plh_inode);
+                       if (nfs_compare_fh(fh, &nfsi->fh))
                                continue;
-                       if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
+                       if (nfsi->layout != lo)
                                continue;
                        ino = igrab(lo->plh_inode);
                        if (!ino)
                                break;
                        spin_lock(&ino->i_lock);
                        /* Is this layout in the process of being freed? */
-                       if (NFS_I(ino)->layout != lo) {
+                       if (nfsi->layout != lo) {
                                spin_unlock(&ino->i_lock);
                                iput(ino);
-                               break;
+                               goto restart;
                        }
                        pnfs_get_layout_hdr(lo);
                        spin_unlock(&ino->i_lock);
@@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
 }
 
 static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
-               struct nfs_fh *fh, nfs4_stateid *stateid)
+               struct nfs_fh *fh)
 {
        struct pnfs_layout_hdr *lo;
 
        spin_lock(&clp->cl_lock);
        rcu_read_lock();
-       lo = get_layout_by_fh_locked(clp, fh, stateid);
+       lo = get_layout_by_fh_locked(clp, fh);
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);
 
@@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
 /*
  * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
  */
-static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
                                        const nfs4_stateid *new)
 {
        u32 oldseq, newseq;
 
-       oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+       /* Is the stateid still not initialised? */
+       if (!pnfs_layout_is_valid(lo))
+               return NFS4ERR_DELAY;
+
+       /* Mismatched stateid? */
+       if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
+               return NFS4ERR_BAD_STATEID;
+
        newseq = be32_to_cpu(new->seqid);
+       /* Are we already in a layout recall situation? */
+       if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+           lo->plh_return_seq != 0) {
+               if (newseq < lo->plh_return_seq)
+                       return NFS4ERR_OLD_STATEID;
+               if (newseq > lo->plh_return_seq)
+                       return NFS4ERR_DELAY;
+               goto out;
+       }
 
+       /* Check that the stateid matches what we think it should be. */
+       oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        if (newseq > oldseq + 1)
-               return false;
-       return true;
+               return NFS4ERR_DELAY;
+       /* Crazy server! */
+       if (newseq <= oldseq)
+               return NFS4ERR_OLD_STATEID;
+out:
+       return NFS_OK;
 }
 
 static u32 initiate_file_draining(struct nfs_client *clp,
@@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
        u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
        LIST_HEAD(free_me_list);
 
-       lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
+       lo = get_layout_by_fh(clp, &args->cbl_fh);
        if (!lo) {
                trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
                                &args->cbl_stateid, -rv);
@@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
        }
 
        ino = lo->plh_inode;
+       pnfs_layoutcommit_inode(ino, false);
+
 
        spin_lock(&ino->i_lock);
-       if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
-               rv = NFS4ERR_DELAY;
+       rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
+       if (rv != NFS_OK)
                goto unlock;
-       }
        pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
-       spin_unlock(&ino->i_lock);
-
-       pnfs_layoutcommit_inode(ino, false);
 
-       spin_lock(&ino->i_lock);
        /*
         * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
         */
@@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp,
                goto unlock;
        }
 
+       /* Embrace your forgetfulness! */
+       rv = NFS4ERR_NOMATCHING_LAYOUT;
+
        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
                        &args->cbl_range);
        }
-       pnfs_mark_layout_returned_if_empty(lo);
 unlock:
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&free_me_list);
index d81f96aacd51e71b1da710b477e7d44ff9a90b24..656f68f7fe53e110a33f8bb876028e6083c3db20 100644 (file)
@@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        if (hdr_arg.minorversion == 0) {
                cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
                if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
-                       return rpc_drop_reply;
+                       goto out_invalidcred;
        }
 
        cps.minorversion = hdr_arg.minorversion;
@@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        nfs_put_client(cps.clp);
        dprintk("%s: done, status = %u\n", __func__, ntohl(status));
        return rpc_success;
+
+out_invalidcred:
+       pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
+       return rpc_autherr_badcred;
 }
 
 /*
index 487c5607d52f4c5c853a1f13cc8fbb545e9e79a2..003ebce4bbc49fa0e5508816027ae65798119e4f 100644 (file)
@@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
  */
 struct nfs_client *
 nfs_get_client(const struct nfs_client_initdata *cl_init,
-              const struct rpc_timeout *timeparms,
-              const char *ip_addr,
               rpc_authflavor_t authflavour)
 {
        struct nfs_client *clp, *new = NULL;
@@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
                                        &nn->nfs_client_list);
                        spin_unlock(&nn->nfs_client_lock);
                        new->cl_flags = cl_init->init_flags;
-                       return rpc_ops->init_client(new, timeparms, ip_addr);
+                       return rpc_ops->init_client(new, cl_init);
                }
 
                spin_unlock(&nn->nfs_client_lock);
@@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
  * Create an RPC client handle
  */
 int nfs_create_rpc_client(struct nfs_client *clp,
-                         const struct rpc_timeout *timeparms,
+                         const struct nfs_client_initdata *cl_init,
                          rpc_authflavor_t flavor)
 {
        struct rpc_clnt         *clnt = NULL;
@@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp,
                .protocol       = clp->cl_proto,
                .address        = (struct sockaddr *)&clp->cl_addr,
                .addrsize       = clp->cl_addrlen,
-               .timeout        = timeparms,
+               .timeout        = cl_init->timeparms,
                .servername     = clp->cl_hostname,
+               .nodename       = cl_init->nodename,
                .program        = &nfs_program,
                .version        = clp->rpc_ops->version,
                .authflavor     = flavor,
@@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
  * nfs_init_client - Initialise an NFS2 or NFS3 client
  *
  * @clp: nfs_client to initialise
- * @timeparms: timeout parameters for underlying RPC transport
- * @ip_addr: IP presentation address (not used)
+ * @cl_init: Initialisation parameters
  *
  * Returns pointer to an NFS client, or an ERR_PTR value.
  */
 struct nfs_client *nfs_init_client(struct nfs_client *clp,
-                   const struct rpc_timeout *timeparms,
-                   const char *ip_addr)
+                                  const struct nfs_client_initdata *cl_init)
 {
        int error;
 
@@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
         * Create a client RPC handle for doing FSSTAT with UNIX auth only
         * - RFC 2623, sec 2.3.2
         */
-       error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+       error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
        if (error < 0)
                goto error;
        nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server,
                           const struct nfs_parsed_mount_data *data,
                           struct nfs_subversion *nfs_mod)
 {
+       struct rpc_timeout timeparms;
        struct nfs_client_initdata cl_init = {
                .hostname = data->nfs_server.hostname,
                .addr = (const struct sockaddr *)&data->nfs_server.address,
@@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server,
                .nfs_mod = nfs_mod,
                .proto = data->nfs_server.protocol,
                .net = data->net,
+               .timeparms = &timeparms,
        };
-       struct rpc_timeout timeparms;
        struct nfs_client *clp;
        int error;
 
@@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server,
                set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 
        /* Allocate or find a client reference we can use */
-       clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
+       clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX);
        if (IS_ERR(clp)) {
                dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
                return PTR_ERR(clp);
index baaa38859899eb6b97e234f2f94f92d83fee1520..177fefb26c18d344a30dfb85f75db61c7a6e44ec 100644 (file)
@@ -2252,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
        return NULL;
 }
 
-static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs_access_entry *cache;
-       int err = -ENOENT;
+       bool retry = true;
+       int err;
 
        spin_lock(&inode->i_lock);
-       if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
-               goto out_zap;
-       cache = nfs_access_search_rbtree(inode, cred);
-       if (cache == NULL)
-               goto out;
-       if (!nfs_have_delegated_attributes(inode) &&
-           !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
-               goto out_stale;
+       for(;;) {
+               if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+                       goto out_zap;
+               cache = nfs_access_search_rbtree(inode, cred);
+               err = -ENOENT;
+               if (cache == NULL)
+                       goto out;
+               /* Found an entry, is our attribute cache valid? */
+               if (!nfs_attribute_cache_expired(inode) &&
+                   !(nfsi->cache_validity & NFS_INO_INVALID_ATTR))
+                       break;
+               err = -ECHILD;
+               if (!may_block)
+                       goto out;
+               if (!retry)
+                       goto out_zap;
+               spin_unlock(&inode->i_lock);
+               err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+               if (err)
+                       return err;
+               spin_lock(&inode->i_lock);
+               retry = false;
+       }
        res->jiffies = cache->jiffies;
        res->cred = cache->cred;
        res->mask = cache->mask;
@@ -2275,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
 out:
        spin_unlock(&inode->i_lock);
        return err;
-out_stale:
-       rb_erase(&cache->rb_node, &nfsi->access_cache);
-       list_del(&cache->lru);
-       spin_unlock(&inode->i_lock);
-       nfs_access_free_entry(cache);
-       return -ENOENT;
 out_zap:
        spin_unlock(&inode->i_lock);
        nfs_access_zap_cache(inode);
@@ -2307,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
                cache = NULL;
        if (cache == NULL)
                goto out;
-       if (!nfs_have_delegated_attributes(inode) &&
-           !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+       err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode);
+       if (err)
                goto out;
        res->jiffies = cache->jiffies;
        res->cred = cache->cred;
        res->mask = cache->mask;
-       err = 0;
 out:
        rcu_read_unlock();
        return err;
@@ -2402,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask);
 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 {
        struct nfs_access_entry cache;
+       bool may_block = (mask & MAY_NOT_BLOCK) == 0;
        int status;
 
        trace_nfs_access_enter(inode);
 
        status = nfs_access_get_cached_rcu(inode, cred, &cache);
        if (status != 0)
-               status = nfs_access_get_cached(inode, cred, &cache);
+               status = nfs_access_get_cached(inode, cred, &cache, may_block);
        if (status == 0)
                goto out_cached;
 
        status = -ECHILD;
-       if (mask & MAY_NOT_BLOCK)
+       if (!may_block)
                goto out;
 
        /* Be clever: ask server to check for all possible rights */
index e6210ead71d06d941d7f7083899eec321b8cc7ae..72b7d13ee3c6a14e489f6a2b94c2caf66ef8598f 100644 (file)
@@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
        WARN_ON_ONCE(verfp->committed < 0);
 }
 
+static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
+               const struct nfs_writeverf *v2)
+{
+       return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
+}
+
 /*
  * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
  * @dreq - direct request possibly spanning multiple servers
@@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
                nfs_direct_set_hdr_verf(dreq, hdr);
                return 0;
        }
-       return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+       return nfs_direct_cmp_verf(verfp, &hdr->verf);
 }
 
 /*
@@ -238,7 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
        if (verfp->committed < 0)
                return 1;
 
-       return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
+       return nfs_direct_cmp_verf(verfp, &data->verf);
 }
 
 /**
@@ -366,22 +372,10 @@ out:
  * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
  * the iocb is still valid here if this is a synchronous request.
  */
-static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
+static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
        struct inode *inode = dreq->inode;
 
-       if (dreq->iocb && write) {
-               loff_t pos = dreq->iocb->ki_pos + dreq->count;
-
-               spin_lock(&inode->i_lock);
-               if (i_size_read(inode) < pos)
-                       i_size_write(inode, pos);
-               spin_unlock(&inode->i_lock);
-       }
-
-       if (write)
-               nfs_zap_mapping(inode, inode->i_mapping);
-
        inode_dio_end(inode);
 
        if (dreq->iocb) {
@@ -436,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
        }
 out_put:
        if (put_dreq(dreq))
-               nfs_direct_complete(dreq, false);
+               nfs_direct_complete(dreq);
        hdr->release(hdr);
 }
 
@@ -542,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
        }
 
        if (put_dreq(dreq))
-               nfs_direct_complete(dreq, false);
+               nfs_direct_complete(dreq);
        return 0;
 }
 
@@ -583,17 +577,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
        if (!count)
                goto out;
 
-       inode_lock(inode);
-       result = nfs_sync_mapping(mapping);
-       if (result)
-               goto out_unlock;
-
        task_io_account_read(count);
 
        result = -ENOMEM;
        dreq = nfs_direct_req_alloc();
        if (dreq == NULL)
-               goto out_unlock;
+               goto out;
 
        dreq->inode = inode;
        dreq->bytes_left = dreq->max_count = count;
@@ -608,10 +597,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
 
+       nfs_start_io_direct(inode);
+
        NFS_I(inode)->read_io += count;
        result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
 
-       inode_unlock(inode);
+       nfs_end_io_direct(inode);
 
        if (!result) {
                result = nfs_direct_wait(dreq);
@@ -619,13 +610,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
                        iocb->ki_pos += result;
        }
 
-       nfs_direct_req_release(dreq);
-       return result;
-
 out_release:
        nfs_direct_req_release(dreq);
-out_unlock:
-       inode_unlock(inode);
 out:
        return result;
 }
@@ -657,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
        nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
 
        dreq->count = 0;
+       dreq->verf.committed = NFS_INVALID_STABLE_HOW;
+       nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
        for (i = 0; i < dreq->mirror_count; i++)
                dreq->mirrors[i].count = 0;
        get_dreq(dreq);
@@ -775,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
                        nfs_direct_write_reschedule(dreq);
                        break;
                default:
-                       nfs_direct_complete(dreq, true);
+                       nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
+                       nfs_direct_complete(dreq);
        }
 }
 
@@ -991,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 {
        ssize_t result = -EINVAL;
+       size_t count;
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
@@ -1001,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
        dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
                file, iov_iter_count(iter), (long long) iocb->ki_pos);
 
-       nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES,
-                     iov_iter_count(iter));
+       result = generic_write_checks(iocb, iter);
+       if (result <= 0)
+               return result;
+       count = result;
+       nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
 
        pos = iocb->ki_pos;
        end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
 
-       inode_lock(inode);
-
-       result = nfs_sync_mapping(mapping);
-       if (result)
-               goto out_unlock;
-
-       if (mapping->nrpages) {
-               result = invalidate_inode_pages2_range(mapping,
-                                       pos >> PAGE_SHIFT, end);
-               if (result)
-                       goto out_unlock;
-       }
-
-       task_io_account_write(iov_iter_count(iter));
+       task_io_account_write(count);
 
        result = -ENOMEM;
        dreq = nfs_direct_req_alloc();
        if (!dreq)
-               goto out_unlock;
+               goto out;
 
        dreq->inode = inode;
-       dreq->bytes_left = dreq->max_count = iov_iter_count(iter);
+       dreq->bytes_left = dreq->max_count = count;
        dreq->io_start = pos;
        dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
        l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -1040,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
        if (!is_sync_kiocb(iocb))
                dreq->iocb = iocb;
 
+       nfs_start_io_direct(inode);
+
        result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
 
        if (mapping->nrpages) {
@@ -1047,30 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
                                              pos >> PAGE_SHIFT, end);
        }
 
-       inode_unlock(inode);
+       nfs_end_io_direct(inode);
 
        if (!result) {
                result = nfs_direct_wait(dreq);
                if (result > 0) {
-                       struct inode *inode = mapping->host;
-
                        iocb->ki_pos = pos + result;
-                       spin_lock(&inode->i_lock);
-                       if (i_size_read(inode) < iocb->ki_pos)
-                               i_size_write(inode, iocb->ki_pos);
-                       spin_unlock(&inode->i_lock);
-
                        /* XXX: should check the generic_write_sync retval */
                        generic_write_sync(iocb, result);
                }
        }
-       nfs_direct_req_release(dreq);
-       return result;
-
 out_release:
        nfs_direct_req_release(dreq);
-out_unlock:
-       inode_unlock(inode);
+out:
        return result;
 }
 
index 717a8d6af52df59064ddf7d882cd899fa99115d8..7d620970f2e1addfbd447dc7b4a0bd1233bc9ec6 100644 (file)
@@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
                iocb->ki_filp,
                iov_iter_count(to), (unsigned long) iocb->ki_pos);
 
-       result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
+       nfs_start_io_read(inode);
+       result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
        if (!result) {
                result = generic_file_read_iter(iocb, to);
                if (result > 0)
                        nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
        }
+       nfs_end_io_read(inode);
        return result;
 }
 EXPORT_SYMBOL_GPL(nfs_file_read);
@@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
        dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
                filp, (unsigned long) count, (unsigned long long) *ppos);
 
-       res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
+       nfs_start_io_read(inode);
+       res = nfs_revalidate_mapping(inode, filp->f_mapping);
        if (!res) {
                res = generic_file_splice_read(filp, ppos, pipe, count, flags);
                if (res > 0)
                        nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
        }
+       nfs_end_io_read(inode);
        return res;
 }
 EXPORT_SYMBOL_GPL(nfs_file_splice_read);
@@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
        trace_nfs_fsync_enter(inode);
 
-       inode_dio_wait(inode);
        do {
                ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
                if (ret != 0)
                        break;
-               inode_lock(inode);
                ret = nfs_file_fsync_commit(file, start, end, datasync);
                if (!ret)
                        ret = pnfs_sync_inode(inode, !!datasync);
-               inode_unlock(inode);
                /*
                 * If nfs_file_fsync_commit detected a server reboot, then
                 * resend all dirty pages that might have been covered by
@@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
                file, mapping->host->i_ino, len, (long long) pos);
 
 start:
-       /*
-        * Prevent starvation issues if someone is doing a consistency
-        * sync-to-disk
-        */
-       ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
-                                nfs_wait_bit_killable, TASK_KILLABLE);
-       if (ret)
-               return ret;
-       /*
-        * Wait for O_DIRECT to complete
-        */
-       inode_dio_wait(mapping->host);
-
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
@@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
                return status;
        NFS_I(mapping->host)->write_io += copied;
 
-       if (nfs_ctx_key_to_expire(ctx)) {
+       if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
                status = nfs_wb_all(mapping->host);
                if (status < 0)
                        return status;
@@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
  */
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
-       struct address_space *mapping = page->mapping;
-
        dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
-       /* Always try to initiate a 'commit' if relevant, but only
-        * wait for it if the caller allows blocking.  Even then,
-        * only wait 1 second and only if the 'bdi' is not congested.
-        * Waiting indefinitely can cause deadlocks when the NFS
-        * server is on this machine, when a new TCP connection is
-        * needed and in other rare cases.  There is no particular
-        * need to wait extensively here.  A short wait has the
-        * benefit that someone else can worry about the freezer.
-        */
-       if (mapping) {
-               struct nfs_server *nfss = NFS_SERVER(mapping->host);
-               nfs_commit_inode(mapping->host, 0);
-               if (gfpflags_allow_blocking(gfp) &&
-                   !bdi_write_congested(&nfss->backing_dev_info)) {
-                       wait_on_page_bit_killable_timeout(page, PG_private,
-                                                         HZ);
-                       if (PagePrivate(page))
-                               set_bdi_congested(&nfss->backing_dev_info,
-                                                 BLK_RW_ASYNC);
-               }
-       }
        /* If PagePrivate() is set, then the page is not freeable */
        if (PagePrivate(page))
                return 0;
@@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                filp, filp->f_mapping->host->i_ino,
                (long long)page_offset(page));
 
+       sb_start_pagefault(inode->i_sb);
+
        /* make sure the cache has finished storing the page */
        nfs_fscache_wait_on_page_write(NFS_I(inode), page);
 
@@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 out_unlock:
        unlock_page(page);
 out:
+       sb_end_pagefault(inode->i_sb);
        return ret;
 }
 
@@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode)
 
        ctx = nfs_file_open_context(filp);
        if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
-           nfs_ctx_key_to_expire(ctx))
+           nfs_ctx_key_to_expire(ctx, inode))
                return 1;
        return 0;
 }
@@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
        struct inode *inode = file_inode(file);
        unsigned long written = 0;
        ssize_t result;
-       size_t count = iov_iter_count(from);
 
        result = nfs_key_timeout_notify(file, inode);
        if (result)
                return result;
 
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               result = generic_write_checks(iocb, from);
-               if (result <= 0)
-                       return result;
+       if (iocb->ki_flags & IOCB_DIRECT)
                return nfs_file_direct_write(iocb, from);
-       }
 
        dprintk("NFS: write(%pD2, %zu@%Ld)\n",
-               file, count, (long long) iocb->ki_pos);
+               file, iov_iter_count(from), (long long) iocb->ki_pos);
 
-       result = -EBUSY;
        if (IS_SWAPFILE(inode))
                goto out_swapfile;
        /*
@@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
                        goto out;
        }
 
-       result = count;
-       if (!count)
+       nfs_start_io_write(inode);
+       result = generic_write_checks(iocb, from);
+       if (result > 0) {
+               current->backing_dev_info = inode_to_bdi(inode);
+               result = generic_perform_write(file, from, iocb->ki_pos);
+               current->backing_dev_info = NULL;
+       }
+       nfs_end_io_write(inode);
+       if (result <= 0)
                goto out;
 
-       result = generic_file_write_iter(iocb, from);
-       if (result > 0)
-               written = result;
+       written = generic_write_sync(iocb, result);
+       iocb->ki_pos += written;
 
        /* Return error values */
-       if (result >= 0 && nfs_need_check_write(file, inode)) {
+       if (nfs_need_check_write(file, inode)) {
                int err = vfs_fsync(file, 0);
                if (err < 0)
                        result = err;
        }
-       if (result > 0)
-               nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+       nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
 out:
        return result;
 
 out_swapfile:
        printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
-       goto out;
+       return -EBUSY;
 }
 EXPORT_SYMBOL_GPL(nfs_file_write);
 
@@ -779,11 +746,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
        return status;
 }
 
-static int
-is_time_granular(struct timespec *ts) {
-       return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
-}
-
 static int
 do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
@@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
         * This makes locking act as a cache coherency point.
         */
        nfs_sync_mapping(filp->f_mapping);
-       if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
-               if (is_time_granular(&NFS_SERVER(inode)->time_delta))
-                       __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-               else
-                       nfs_zap_caches(inode);
-       }
+       if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+               nfs_zap_mapping(inode, filp->f_mapping);
 out:
        return status;
 }
index aa59757389dc8b38887b4cce4dcc5a287a2ba42a..a3fc48ba4931d7a25c56077ac114ef279eaa4f3a 100644 (file)
@@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task,
 static void
 filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
+       loff_t end_offs = 0;
 
        if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
-           hdr->res.verf->committed != NFS_DATA_SYNC)
+           hdr->res.verf->committed == NFS_FILE_SYNC)
                return;
+       if (hdr->res.verf->committed == NFS_DATA_SYNC)
+               end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
 
-       pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
-                       hdr->mds_offset + hdr->res.count);
+       /* Note: if the write is unstable, don't set end_offs until commit */
+       pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
        dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
                (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
 }
@@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task,
        }
 
        filelayout_set_layoutcommit(hdr);
+
+       /* zero out the fattr */
+       hdr->fattr.valid = 0;
+       if (task->tk_status >= 0)
+               nfs_writeback_update_inode(hdr);
+
        return 0;
 }
 
@@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
                return -EAGAIN;
        }
 
-       if (data->verf.committed == NFS_UNSTABLE)
-               pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+       pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
        return 0;
 }
index 0e8018bc98808bf29828e4861aee0c1ed9df3b31..e6206eaf2bdf34e4aa11d23dbf68b6e7c0c4684e 100644 (file)
@@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
  * we always send layoutcommit after DS writes.
  */
 static void
-ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
+ff_layout_set_layoutcommit(struct inode *inode,
+               struct pnfs_layout_segment *lseg,
+               loff_t end_offset)
 {
-       if (!ff_layout_need_layoutcommit(hdr->lseg))
+       if (!ff_layout_need_layoutcommit(lseg))
                return;
 
-       pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
-                       hdr->mds_offset + hdr->res.count);
-       dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
-               (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+       pnfs_set_layoutcommit(inode, lseg, end_offset);
+       dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
+               (unsigned long long) NFS_I(inode)->layout->plh_lwb);
 }
 
 static bool
@@ -1469,6 +1470,7 @@ static void ff_layout_read_release(void *data)
 static int ff_layout_write_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
 {
+       loff_t end_offs = 0;
        int err;
 
        trace_nfs4_pnfs_write(hdr, task->tk_status);
@@ -1494,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 
        if (hdr->res.verf->committed == NFS_FILE_SYNC ||
            hdr->res.verf->committed == NFS_DATA_SYNC)
-               ff_layout_set_layoutcommit(hdr);
+               end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
+
+       /* Note: if the write is unstable, don't set end_offs until commit */
+       ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
 
        /* zero out fattr since we don't care DS attr at all */
        hdr->fattr.valid = 0;
@@ -1530,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
                return -EAGAIN;
        }
 
-       if (data->verf.committed == NFS_UNSTABLE
-           && ff_layout_need_layoutcommit(data->lseg))
-               pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+       ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
        return 0;
 }
index dda689d7a8a706862e21737cb5189ab175d976e1..bf4ec5ecc97e4571e3f971222c71f0519874682a 100644 (file)
@@ -662,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        trace_nfs_getattr_enter(inode);
        /* Flush out writes to the server in order to update c/mtime.  */
        if (S_ISREG(inode->i_mode)) {
-               inode_lock(inode);
-               err = nfs_sync_inode(inode);
-               inode_unlock(inode);
+               err = filemap_write_and_wait(inode->i_mapping);
                if (err)
                        goto out;
        }
@@ -879,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
        struct nfs_inode *nfsi = NFS_I(inode);
 
        spin_lock(&inode->i_lock);
-       list_add(&ctx->list, &nfsi->open_files);
+       if (ctx->mode & FMODE_WRITE)
+               list_add(&ctx->list, &nfsi->open_files);
+       else
+               list_add_tail(&ctx->list, &nfsi->open_files);
        spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@@ -972,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
        if (NFS_STALE(inode))
                goto out;
 
+       /* pNFS: Attributes aren't updated until we layoutcommit */
+       if (S_ISREG(inode->i_mode)) {
+               status = pnfs_sync_inode(inode, false);
+               if (status)
+                       goto out;
+       }
+
        status = -ENOMEM;
        fattr = nfs_alloc_fattr();
        if (fattr == NULL)
@@ -1122,14 +1130,12 @@ out:
 }
 
 /**
- * __nfs_revalidate_mapping - Revalidate the pagecache
+ * nfs_revalidate_mapping - Revalidate the pagecache
  * @inode - pointer to host inode
  * @mapping - pointer to mapping
- * @may_lock - take inode->i_mutex?
  */
-static int __nfs_revalidate_mapping(struct inode *inode,
-               struct address_space *mapping,
-               bool may_lock)
+int nfs_revalidate_mapping(struct inode *inode,
+               struct address_space *mapping)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
        unsigned long *bitlock = &nfsi->flags;
@@ -1178,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode,
        nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
        spin_unlock(&inode->i_lock);
        trace_nfs_invalidate_mapping_enter(inode);
-       if (may_lock) {
-               inode_lock(inode);
-               ret = nfs_invalidate_mapping(inode, mapping);
-               inode_unlock(inode);
-       } else
-               ret = nfs_invalidate_mapping(inode, mapping);
+       ret = nfs_invalidate_mapping(inode, mapping);
        trace_nfs_invalidate_mapping_exit(inode, ret);
 
        clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@@ -1193,27 +1194,28 @@ out:
        return ret;
 }
 
-/**
- * nfs_revalidate_mapping - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- */
-int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+static bool nfs_file_has_writers(struct nfs_inode *nfsi)
 {
-       return __nfs_revalidate_mapping(inode, mapping, false);
+       struct inode *inode = &nfsi->vfs_inode;
+
+       assert_spin_locked(&inode->i_lock);
+
+       if (!S_ISREG(inode->i_mode))
+               return false;
+       if (list_empty(&nfsi->open_files))
+               return false;
+       /* Note: This relies on nfsi->open_files being ordered with writers
+        *       being placed at the head of the list.
+        *       See nfs_inode_attach_open_context()
+        */
+       return (list_first_entry(&nfsi->open_files,
+                       struct nfs_open_context,
+                       list)->mode & FMODE_WRITE) == FMODE_WRITE;
 }
 
-/**
- * nfs_revalidate_mapping_protected - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- *
- * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
- * while invalidating the mapping.
- */
-int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
+static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
 {
-       return __nfs_revalidate_mapping(inode, mapping, true);
+       return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi);
 }
 
 static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
@@ -1280,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
        if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
                return -EIO;
 
-       if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
-                       inode->i_version != fattr->change_attr)
-               invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+       if (!nfs_file_has_buffered_writers(nfsi)) {
+               /* Verify a few of the more important attributes */
+               if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr)
+                       invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE;
 
-       /* Verify a few of the more important attributes */
-       if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
-               invalid |= NFS_INO_INVALID_ATTR;
+               if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
+                       invalid |= NFS_INO_INVALID_ATTR;
 
-       if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
-               cur_size = i_size_read(inode);
-               new_isize = nfs_size_to_loff_t(fattr->size);
-               if (cur_size != new_isize)
-                       invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+               if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime))
+                       invalid |= NFS_INO_INVALID_ATTR;
+
+               if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+                       cur_size = i_size_read(inode);
+                       new_isize = nfs_size_to_loff_t(fattr->size);
+                       if (cur_size != new_isize)
+                               invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+               }
        }
-       if (nfsi->nrequests != 0)
-               invalid &= ~NFS_INO_REVAL_PAGECACHE;
 
        /* Have any file permissions changed? */
        if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
@@ -1470,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
                ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
 }
 
-/*
- * Don't trust the change_attribute, mtime, ctime or size if
- * a pnfs LAYOUTCOMMIT is outstanding
- */
-static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
-               struct nfs_fattr *fattr)
-{
-       if (pnfs_layoutcommit_outstanding(inode))
-               fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
-                               NFS_ATTR_FATTR_MTIME |
-                               NFS_ATTR_FATTR_CTIME |
-                               NFS_ATTR_FATTR_SIZE);
-}
-
 static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
        int ret;
 
        trace_nfs_refresh_inode_enter(inode);
 
-       nfs_inode_attrs_handle_layoutcommit(inode, fattr);
-
        if (nfs_inode_attrs_need_update(inode, fattr))
                ret = nfs_update_inode(inode, fattr);
        else
@@ -1527,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
 
 static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
-       unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+       unsigned long invalid = NFS_INO_INVALID_ATTR;
 
        /*
         * Don't revalidate the pagecache if we hold a delegation, but do
@@ -1676,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        unsigned long invalid = 0;
        unsigned long now = jiffies;
        unsigned long save_cache_validity;
+       bool have_writers = nfs_file_has_buffered_writers(nfsi);
        bool cache_revalidated = true;
 
        dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
@@ -1725,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
        /* Do atomic weak cache consistency updates */
        invalid |= nfs_wcc_update_inode(inode, fattr);
 
+       if (pnfs_layoutcommit_outstanding(inode)) {
+               nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
+               cache_revalidated = false;
+       }
+
        /* More cache consistency checks */
        if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
                if (inode->i_version != fattr->change_attr) {
                        dprintk("NFS: change_attr change on server for file %s/%ld\n",
                                        inode->i_sb->s_id, inode->i_ino);
-                       invalid |= NFS_INO_INVALID_ATTR
-                               | NFS_INO_INVALID_DATA
-                               | NFS_INO_INVALID_ACCESS
-                               | NFS_INO_INVALID_ACL;
-                       if (S_ISDIR(inode->i_mode))
-                               nfs_force_lookup_revalidate(inode);
+                       /* Could it be a race with writeback? */
+                       if (!have_writers) {
+                               invalid |= NFS_INO_INVALID_ATTR
+                                       | NFS_INO_INVALID_DATA
+                                       | NFS_INO_INVALID_ACCESS
+                                       | NFS_INO_INVALID_ACL;
+                               if (S_ISDIR(inode->i_mode))
+                                       nfs_force_lookup_revalidate(inode);
+                       }
                        inode->i_version = fattr->change_attr;
                }
        } else {
@@ -1768,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                if (new_isize != cur_isize) {
                        /* Do we perhaps have any outstanding writes, or has
                         * the file grown beyond our last write? */
-                       if ((nfsi->nrequests == 0) || new_isize > cur_isize) {
+                       if (nfsi->nrequests == 0 || new_isize > cur_isize) {
                                i_size_write(inode, new_isize);
-                               invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+                               if (!have_writers)
+                                       invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
                        }
                        dprintk("NFS: isize change on server for file %s/%ld "
                                        "(%Ld to %Ld)\n",
index 5ea04d87fc653db7bf578853b1e97b165812864e..7ce5e023c3c3cd36056d1272cc16c9ebb1d1198a 100644 (file)
@@ -66,13 +66,16 @@ struct nfs_clone_mount {
 
 struct nfs_client_initdata {
        unsigned long init_flags;
-       const char *hostname;
-       const struct sockaddr *addr;
+       const char *hostname;                   /* Hostname of the server */
+       const struct sockaddr *addr;            /* Address of the server */
+       const char *nodename;                   /* Hostname of the client */
+       const char *ip_addr;                    /* IP address of the client */
        size_t addrlen;
        struct nfs_subversion *nfs_mod;
        int proto;
        u32 minorversion;
        struct net *net;
+       const struct rpc_timeout *timeparms;
 };
 
 /*
@@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info);
 extern const struct rpc_program nfs_program;
 extern void nfs_clients_init(struct net *net);
 extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
-int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t);
+int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
 struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
-                                 const struct rpc_timeout *, const char *,
                                  rpc_authflavor_t);
 int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
 void nfs_server_insert_lists(struct nfs_server *);
@@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
                                           rpc_authflavor_t);
 extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
 extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
-extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
                                             const struct sockaddr *ds_addr,
                                             int ds_addrlen, int ds_proto,
                                             unsigned int ds_timeo,
@@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
                                             rpc_authflavor_t au_flavor);
 extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
                                                struct inode *);
-extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
                        const struct sockaddr *ds_addr, int ds_addrlen,
                        int ds_proto, unsigned int ds_timeo,
                        unsigned int ds_retrans, rpc_authflavor_t au_flavor);
@@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
 /* proc.c */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
-                          const struct rpc_timeout *timeparms,
-                          const char *ip_addr);
+                          const struct nfs_client_initdata *);
 
 /* dir.c */
 extern void nfs_force_use_readdirplus(struct inode *dir);
@@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void);
 extern bool nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
 
+/* io.c */
+extern void nfs_start_io_read(struct inode *inode);
+extern void nfs_end_io_read(struct inode *inode);
+extern void nfs_start_io_write(struct inode *inode);
+extern void nfs_end_io_write(struct inode *inode);
+extern void nfs_start_io_direct(struct inode *inode);
+extern void nfs_end_io_direct(struct inode *inode);
+
+static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
+{
+       return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
+}
+
 /* namespace.c */
 #define NFS_PATH_CANONICAL 1
 extern char *nfs_path(char **p, struct dentry *dentry,
@@ -496,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
                    struct inode *inode,
                    struct nfs_direct_req *dreq);
 int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
 void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
 
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+               loff_t lstart, loff_t lend);
+
+#ifdef CONFIG_NFS_V4_1
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+       int i;
+
+       for (i = 0; i < cinfo->nbuckets; i++)
+               cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
+#else
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+}
+#endif
+
+
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
                struct page *, struct page *, enum migrate_mode);
@@ -506,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *,
 #define nfs_migrate_page NULL
 #endif
 
+static inline int
+nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
+               const struct nfs_write_verifier *v2)
+{
+       return memcmp(v1->data, v2->data, sizeof(v1->data));
+}
+
 /* unlink.c */
 extern struct rpc_task *
 nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
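
nfs_write_verifier_cmp() is introduced here without a caller in this header; as a usage hint, a hedged sketch of the check it is meant for (names illustrative): a mismatch between the verifier recorded at WRITE time and the one returned by COMMIT indicates the server lost the uncommitted data (for example, it rebooted), so the pages must be resent.

/* Sketch only: compare WRITE-time and COMMIT-time verifiers. */
static bool sketch_commit_verifier_changed(const struct nfs_writeverf *written,
					   const struct nfs_writeverf *committed)
{
	return nfs_write_verifier_cmp(&written->verifier,
				      &committed->verifier) != 0;
}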
@@ -521,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
-                           const struct rpc_timeout *timeparms,
-                           const char *ip_addr);
+                           const struct nfs_client_initdata *);
 extern int nfs40_walk_client_list(struct nfs_client *clp,
                                struct nfs_client **result,
                                struct rpc_cred *cred);
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
new file mode 100644 (file)
index 0000000..1fc5d1c
--- /dev/null
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ *
+ * I/O and data path helper functionality.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
+{
+       if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+               clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+               inode_dio_wait(inode);
+       }
+}
+
+/**
+ * nfs_start_io_read - declare the file is being used for buffered reads
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+nfs_start_io_read(struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+       /* Be an optimist! */
+       down_read(&inode->i_rwsem);
+       if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
+               return;
+       up_read(&inode->i_rwsem);
+       /* Slow path.... */
+       down_write(&inode->i_rwsem);
+       nfs_block_o_direct(nfsi, inode);
+       downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_read - declare that the buffered read operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_read(struct inode *inode)
+{
+       up_read(&inode->i_rwsem);
+}
+
+/**
+ * nfs_start_io_write - declare the file is being used for buffered writes
+ * @inode - file inode
+ *
+ * Declare that a buffered write operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+void
+nfs_start_io_write(struct inode *inode)
+{
+       down_write(&inode->i_rwsem);
+       nfs_block_o_direct(NFS_I(inode), inode);
+}
+
+/**
+ * nfs_end_io_write - declare that the buffered write operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_write(struct inode *inode)
+{
+       up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
+{
+       if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+               set_bit(NFS_INO_ODIRECT, &nfsi->flags);
+               nfs_wb_all(inode);
+       }
+}
+
+/**
+ * nfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+nfs_start_io_direct(struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+       /* Be an optimist! */
+       down_read(&inode->i_rwsem);
+       if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
+               return;
+       up_read(&inode->i_rwsem);
+       /* Slow path.... */
+       down_write(&inode->i_rwsem);
+       nfs_block_buffered(nfsi, inode);
+       downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_direct(struct inode *inode)
+{
+       up_read(&inode->i_rwsem);
+}
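
To see how the new helpers are meant to be used, here is a hedged sketch of a buffered read path and a direct write path bracketing their work with them (illustrative only; the real callers live in fs/nfs/file.c and fs/nfs/direct.c). Buffered readers share the lock with each other, direct I/O shares it with other direct I/O, and switching between the two modes takes the exclusive lock once.

static ssize_t sketch_buffered_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	nfs_start_io_read(inode);		/* blocks O_DIRECT, shared with reads */
	ret = generic_file_read_iter(iocb, to);
	nfs_end_io_read(inode);
	return ret;
}

static ssize_t sketch_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	nfs_start_io_direct(inode);		/* flushes pagecache, shared with O_DIRECT */
	ret = nfs_file_direct_write(iocb, from);
	nfs_end_io_direct(inode);
	return ret;
}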
index 9e9fa347a948698dabd2ea53580d7538658fc327..ee753547fb0a3fc13d3b3d9b364816c83b9521db 100644 (file)
@@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
  * low timeout interval so that if a connection is lost, we retry through
  * the MDS.
  */
-struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
                const struct sockaddr *ds_addr, int ds_addrlen,
                int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
                rpc_authflavor_t au_flavor)
 {
+       struct rpc_timeout ds_timeout;
+       struct nfs_client *mds_clp = mds_srv->nfs_client;
        struct nfs_client_initdata cl_init = {
                .addr = ds_addr,
                .addrlen = ds_addrlen,
+               .nodename = mds_clp->cl_rpcclient->cl_nodename,
+               .ip_addr = mds_clp->cl_ipaddr,
                .nfs_mod = &nfs_v3,
                .proto = ds_proto,
                .net = mds_clp->cl_net,
+               .timeparms = &ds_timeout,
        };
-       struct rpc_timeout ds_timeout;
        struct nfs_client *clp;
        char buf[INET6_ADDRSTRLEN + 1];
 
@@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
                return ERR_PTR(-EINVAL);
        cl_init.hostname = buf;
 
+       if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+               set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
        /* Use the MDS nfs_client cl_ipaddr. */
        nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
-       clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
-                            au_flavor);
+       clp = nfs_get_client(&cl_init, au_flavor);
 
        return clp;
 }
index aa03ed09ba06f79763146b0a6f8cb171c9778b0c..33da841a21bb2871f753fb38a72dd76ce2725ded 100644 (file)
@@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
        if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
                return -EOPNOTSUPP;
 
-       nfs_wb_all(inode);
        inode_lock(inode);
+       err = nfs_sync_inode(inode);
+       if (err)
+               goto out_unlock;
 
        err = nfs42_proc_fallocate(&msg, filep, offset, len);
        if (err == 0)
                truncate_pagecache_range(inode, offset, (offset + len) -1);
        if (err == -EOPNOTSUPP)
                NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
-
+out_unlock:
        inode_unlock(inode);
        return err;
 }
@@ -154,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
        if (status)
                return status;
 
+       status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
+                       pos_src, pos_src + (loff_t)count - 1);
+       if (status)
+               return status;
+
        status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
                                     dst_lock, FMODE_WRITE);
        if (status)
                return status;
 
+       status = nfs_sync_inode(dst_inode);
+       if (status)
+               return status;
+
        status = nfs4_call_sync(server->client, server, &msg,
                                &args.seq_args, &res.seq_res, 0);
        if (status == -ENOTSUPP)
@@ -258,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
        if (status)
                return status;
 
-       nfs_wb_all(inode);
+       status = nfs_filemap_write_and_wait_range(inode->i_mapping,
+                       offset, LLONG_MAX);
+       if (status)
+               return status;
+
        status = nfs4_call_sync(server->client, server, &msg,
                                &args.seq_args, &res.seq_res, 0);
        if (status == -ENOTSUPP)
@@ -336,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
                         * Mark the bad layout state as invalid, then retry
                         * with the current stateid.
                         */
-                       set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
+                       pnfs_mark_layout_stateid_invalid(lo, &head);
                        spin_unlock(&inode->i_lock);
                        pnfs_free_lseg_list(&head);
                } else
index 6dc6f2aea0d6c5380d42369ae44f48cfbdcf7622..8b2605882a2016e603f7405304bb91c85b344fc5 100644 (file)
@@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr,
                                 struct nfs42_write_res *res)
 {
        __be32 *p;
-       int stateids;
 
        p = xdr_inline_decode(xdr, 4 + 8 + 4);
        if (unlikely(!p))
                goto out_overflow;
 
-       stateids = be32_to_cpup(p++);
+       /*
+        * We never use asynchronous mode, so warn if a server returns
+        * a stateid.
+        */
+       if (unlikely(*p != 0)) {
+               pr_err_once("%s: server has set unrequested "
+                               "asynchronous mode\n", __func__);
+               return -EREMOTEIO;
+       }
+       p++;
        p = xdr_decode_hyper(p, &res->count);
        res->verifier.committed = be32_to_cpup(p);
        return decode_verifier(xdr, &res->verifier.verifier);
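
The word being tested above is the array length of the optional write-response stateid; since this client only issues synchronous COPY, any non-zero count means the server switched to a mode we cannot handle. For orientation, a rough sketch of the decoded layout (field names follow RFC 7862, quoted from memory rather than kernel types):

/*
 * write_response4 on the wire, as consumed by decode_write_response():
 *
 *   uint32  wr_callback_id<>   -- array length, must be 0 for sync COPY
 *   uint64  wr_count           -- bytes copied
 *   uint32  wr_committed       -- stable_how4
 *   opaque  wr_writeverf[8]    -- write verifier
 */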
index 768456fa1b177a2f3b35c9352d1949c9fc44a03b..4be567a549585387f5c439338eb197db81983bfb 100644 (file)
@@ -185,6 +185,7 @@ struct nfs4_state {
 struct nfs4_exception {
        struct nfs4_state *state;
        struct inode *inode;
+       nfs4_stateid *stateid;
        long timeout;
        unsigned char delay : 1,
                      recovering : 1,
index 10410e8b58530389d7efb18352e8a6253d12b267..8d7d08d4f95f17e09d53bda53da364ef02ea05b8 100644 (file)
@@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
  * Returns pointer to an NFS client, or an ERR_PTR value.
  */
 struct nfs_client *nfs4_init_client(struct nfs_client *clp,
-                                   const struct rpc_timeout *timeparms,
-                                   const char *ip_addr)
+                                   const struct nfs_client_initdata *cl_init)
 {
        char buf[INET6_ADDRSTRLEN + 1];
+       const char *ip_addr = cl_init->ip_addr;
        struct nfs_client *old;
        int error;
 
@@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
        __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
        __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
 
-       error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
+       error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
        if (error == -EINVAL)
-               error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+               error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
        if (error < 0)
                goto error;
 
@@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server,
                .hostname = hostname,
                .addr = addr,
                .addrlen = addrlen,
+               .ip_addr = ip_addr,
                .nfs_mod = &nfs_v4,
                .proto = proto,
                .minorversion = minorversion,
                .net = net,
+               .timeparms = timeparms,
        };
        struct nfs_client *clp;
        int error;
@@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server,
                set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
 
        /* Allocate or find a client reference we can use */
-       clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
+       clp = nfs_get_client(&cl_init, authflavour);
        if (IS_ERR(clp)) {
                error = PTR_ERR(clp);
                goto error;
@@ -842,20 +844,24 @@ error:
  * low timeout interval so that if a connection is lost, we retry through
  * the MDS.
  */
-struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
                const struct sockaddr *ds_addr, int ds_addrlen,
                int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
                u32 minor_version, rpc_authflavor_t au_flavor)
 {
+       struct rpc_timeout ds_timeout;
+       struct nfs_client *mds_clp = mds_srv->nfs_client;
        struct nfs_client_initdata cl_init = {
                .addr = ds_addr,
                .addrlen = ds_addrlen,
+               .nodename = mds_clp->cl_rpcclient->cl_nodename,
+               .ip_addr = mds_clp->cl_ipaddr,
                .nfs_mod = &nfs_v4,
                .proto = ds_proto,
                .minorversion = minor_version,
                .net = mds_clp->cl_net,
+               .timeparms = &ds_timeout,
        };
-       struct rpc_timeout ds_timeout;
        struct nfs_client *clp;
        char buf[INET6_ADDRSTRLEN + 1];
 
@@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
                return ERR_PTR(-EINVAL);
        cl_init.hostname = buf;
 
+       if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+               __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
        /*
         * Set an authflavor equal to the MDS value. Use the MDS nfs_client
         * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
         * (section 13.1 RFC 5661).
         */
        nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
-       clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
-                            au_flavor);
+       clp = nfs_get_client(&cl_init, au_flavor);
 
        dprintk("<-- %s %p\n", __func__, clp);
        return clp;
index 014b0e41ace5a8f5b17510fd32d9403faa0c4084..d085ad7948844cb0931125241bf549c915badd93 100644 (file)
@@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
        if (openflags & O_TRUNC) {
                attr.ia_valid |= ATTR_SIZE;
                attr.ia_size = 0;
-               nfs_sync_inode(inode);
+               filemap_write_and_wait(inode->i_mapping);
        }
 
        inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
@@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    size_t count, unsigned int flags)
 {
-       struct inode *in_inode = file_inode(file_in);
-       struct inode *out_inode = file_inode(file_out);
-       int ret;
-
-       if (in_inode == out_inode)
+       if (file_inode(file_in) == file_inode(file_out))
                return -EINVAL;
 
-       /* flush any pending writes */
-       ret = nfs_sync_inode(in_inode);
-       if (ret)
-               return ret;
-       ret = nfs_sync_inode(out_inode);
-       if (ret)
-               return ret;
-
        return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
 }
 
index ff416d0e24bc25215a991c282e8ae19a65d31832..da5c9e58e9075bfb74a682c89e8ccc8961383ec9 100644 (file)
@@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
 {
        struct nfs_client *clp = server->nfs_client;
        struct nfs4_state *state = exception->state;
+       const nfs4_stateid *stateid = exception->stateid;
        struct inode *inode = exception->inode;
        int ret = errorcode;
 
@@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
                case -NFS4ERR_DELEG_REVOKED:
                case -NFS4ERR_ADMIN_REVOKED:
                case -NFS4ERR_BAD_STATEID:
-                       if (inode && nfs_async_inode_return_delegation(inode,
-                                               NULL) == 0)
-                               goto wait_on_recovery;
+                       if (inode) {
+                               int err;
+
+                               err = nfs_async_inode_return_delegation(inode,
+                                               stateid);
+                               if (err == 0)
+                                       goto wait_on_recovery;
+                               if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) {
+                                       exception->retry = 1;
+                                       break;
+                               }
+                       }
                        if (state == NULL)
                                break;
                        ret = nfs4_schedule_stateid_recovery(server, state);
@@ -427,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
                case -NFS4ERR_DELAY:
                        nfs_inc_server_stats(server, NFSIOS_DELAY);
                case -NFS4ERR_GRACE:
+               case -NFS4ERR_LAYOUTTRYLATER:
                case -NFS4ERR_RECALLCONFLICT:
                        exception->delay = 1;
                        return 0;
@@ -2669,28 +2680,17 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
        return res;
 }
 
-static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
-                           struct nfs_fattr *fattr, struct iattr *sattr,
-                           struct nfs4_state *state, struct nfs4_label *ilabel,
-                           struct nfs4_label *olabel)
+static int _nfs4_do_setattr(struct inode *inode,
+                           struct nfs_setattrargs *arg,
+                           struct nfs_setattrres *res,
+                           struct rpc_cred *cred,
+                           struct nfs4_state *state)
 {
        struct nfs_server *server = NFS_SERVER(inode);
-        struct nfs_setattrargs  arg = {
-                .fh             = NFS_FH(inode),
-                .iap            = sattr,
-               .server         = server,
-               .bitmask = server->attr_bitmask,
-               .label          = ilabel,
-        };
-        struct nfs_setattrres  res = {
-               .fattr          = fattr,
-               .label          = olabel,
-               .server         = server,
-        };
         struct rpc_message msg = {
                .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
-               .rpc_argp       = &arg,
-               .rpc_resp       = &res,
+               .rpc_argp       = arg,
+               .rpc_resp       = res,
                .rpc_cred       = cred,
         };
        struct rpc_cred *delegation_cred = NULL;
@@ -2699,17 +2699,13 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
        bool truncate;
        int status;
 
-       arg.bitmask = nfs4_bitmask(server, ilabel);
-       if (ilabel)
-               arg.bitmask = nfs4_bitmask(server, olabel);
-
-       nfs_fattr_init(fattr);
+       nfs_fattr_init(res->fattr);
 
        /* Servers should only apply open mode checks for file size changes */
-       truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
+       truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
        fmode = truncate ? FMODE_WRITE : FMODE_READ;
 
-       if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) {
+       if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
                /* Use that stateid */
        } else if (truncate && state != NULL) {
                struct nfs_lockowner lockowner = {
@@ -2719,19 +2715,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                if (!nfs4_valid_open_stateid(state))
                        return -EBADF;
                if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
-                               &arg.stateid, &delegation_cred) == -EIO)
+                               &arg->stateid, &delegation_cred) == -EIO)
                        return -EBADF;
        } else
-               nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+               nfs4_stateid_copy(&arg->stateid, &zero_stateid);
        if (delegation_cred)
                msg.rpc_cred = delegation_cred;
 
-       status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+       status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
 
        put_rpccred(delegation_cred);
        if (status == 0 && state != NULL)
                renew_lease(server, timestamp);
-       trace_nfs4_setattr(inode, &arg.stateid, status);
+       trace_nfs4_setattr(inode, &arg->stateid, status);
        return status;
 }
 
@@ -2741,13 +2737,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                           struct nfs4_label *olabel)
 {
        struct nfs_server *server = NFS_SERVER(inode);
+        struct nfs_setattrargs  arg = {
+                .fh             = NFS_FH(inode),
+                .iap            = sattr,
+               .server         = server,
+               .bitmask = server->attr_bitmask,
+               .label          = ilabel,
+        };
+        struct nfs_setattrres  res = {
+               .fattr          = fattr,
+               .label          = olabel,
+               .server         = server,
+        };
        struct nfs4_exception exception = {
                .state = state,
                .inode = inode,
+               .stateid = &arg.stateid,
        };
        int err;
+
+       arg.bitmask = nfs4_bitmask(server, ilabel);
+       if (ilabel)
+               arg.bitmask = nfs4_bitmask(server, olabel);
+
        do {
-               err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
+               err = _nfs4_do_setattr(inode, &arg, &res, cred, state);
                switch (err) {
                case -NFS4ERR_OPENMODE:
                        if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -3267,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
        return status;
 }
 
-static int nfs4_do_find_root_sec(struct nfs_server *server,
-               struct nfs_fh *fhandle, struct nfs_fsinfo *info)
-{
-       int mv = server->nfs_client->cl_minorversion;
-       return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
-}
-
 /**
  * nfs4_proc_get_rootfh - get file handle for server's pseudoroot
  * @server: initialized nfs_server handle
@@ -3293,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
                status = nfs4_lookup_root(server, fhandle, info);
 
        if (auth_probe || status == NFS4ERR_WRONGSEC)
-               status = nfs4_do_find_root_sec(server, fhandle, info);
+               status = server->nfs_client->cl_mvops->find_root_sec(server,
+                               fhandle, info);
 
        if (status == 0)
                status = nfs4_server_capabilities(server, fhandle);
@@ -4392,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
                                 struct rpc_message *msg)
 {
        hdr->timestamp   = jiffies;
-       hdr->pgio_done_cb = nfs4_read_done_cb;
+       if (!hdr->pgio_done_cb)
+               hdr->pgio_done_cb = nfs4_read_done_cb;
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
        nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
 }
@@ -7869,11 +7878,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
        struct inode *inode = lgp->args.inode;
        struct nfs_server *server = NFS_SERVER(inode);
        struct pnfs_layout_hdr *lo;
-       int status = task->tk_status;
+       int nfs4err = task->tk_status;
+       int err, status = 0;
+       LIST_HEAD(head);
 
        dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
 
-       switch (status) {
+       switch (nfs4err) {
        case 0:
                goto out;
 
@@ -7905,45 +7916,42 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
                        status = -EOVERFLOW;
                        goto out;
                }
-               /* Fallthrough */
+               status = -EBUSY;
+               break;
        case -NFS4ERR_RECALLCONFLICT:
-               nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
-                                       exception);
                status = -ERECALLCONFLICT;
-               goto out;
+               break;
        case -NFS4ERR_EXPIRED:
        case -NFS4ERR_BAD_STATEID:
                exception->timeout = 0;
                spin_lock(&inode->i_lock);
-               if (nfs4_stateid_match(&lgp->args.stateid,
+               lo = NFS_I(inode)->layout;
+               /* If the open stateid was bad, then recover it. */
+               if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+                   nfs4_stateid_match_other(&lgp->args.stateid,
                                        &lgp->args.ctx->state->stateid)) {
                        spin_unlock(&inode->i_lock);
-                       /* If the open stateid was bad, then recover it. */
                        exception->state = lgp->args.ctx->state;
                        break;
                }
-               lo = NFS_I(inode)->layout;
-               if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) &&
-                   nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
-                       LIST_HEAD(head);
-
-                       /*
-                        * Mark the bad layout state as invalid, then retry
-                        * with the current stateid.
-                        */
-                       set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-                       pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
-                       spin_unlock(&inode->i_lock);
-                       pnfs_free_lseg_list(&head);
-                       status = -EAGAIN;
-                       goto out;
-               } else
-                       spin_unlock(&inode->i_lock);
-       }
 
-       status = nfs4_handle_exception(server, status, exception);
-       if (exception->retry)
+               /*
+                * Mark the bad layout state as invalid, then retry
+                */
+               pnfs_mark_layout_stateid_invalid(lo, &head);
+               spin_unlock(&inode->i_lock);
+               pnfs_free_lseg_list(&head);
                status = -EAGAIN;
+               goto out;
+       }
+
+       err = nfs4_handle_exception(server, nfs4err, exception);
+       if (!status) {
+               if (exception->retry)
+                       status = -EAGAIN;
+               else
+                       status = err;
+       }
 out:
        dprintk("<-- %s\n", __func__);
        return status;
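
Read together with the pnfs_update_layout() hunk further down, the rewritten handler boils down to a small mapping from NFS4ERR codes to the local status the caller retries on; summarised here for orientation (informal, derived from the hunks above and below, not kernel code):

/*
 *   NFS4ERR_LAYOUTTRYLATER      -> -EBUSY            retried until a
 *                                                    lease-based timeout
 *   NFS4ERR_RECALLCONFLICT      -> -ERECALLCONFLICT  retried; the layout is
 *                                                    destroyed if the
 *                                                    conflict persists
 *   NFS4ERR_EXPIRED/BAD_STATEID -> -EAGAIN           after invalidating the
 *                                                    layout stateid, or open
 *                                                    stateid recovery
 *   anything else               -> nfs4_handle_exception() decides, with
 *                                  exception->retry mapping to -EAGAIN
 */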
@@ -8129,8 +8137,7 @@ static void nfs4_layoutreturn_release(void *calldata)
        spin_lock(&lo->plh_inode->i_lock);
        pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
                        be32_to_cpu(lrp->args.stateid.seqid));
-       pnfs_mark_layout_returned_if_empty(lo);
-       if (lrp->res.lrs_present)
+       if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
                pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
        pnfs_clear_layoutreturn_waitbit(lo);
        spin_unlock(&lo->plh_inode->i_lock);
@@ -8835,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
 #endif
 };
 
-ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
 {
        ssize_t error, error2;
 
index 661e753fe1c93d0c6be59fd26c0339316d50ee12..7bd3a5c09d3185856e85bbe8c5484fccfee3651f 100644 (file)
@@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
        p = xdr_encode_hyper(p, args->lastbytewritten + 1);     /* length */
        *p = cpu_to_be32(0); /* reclaim */
        encode_nfs4_stateid(xdr, &args->stateid);
-       p = reserve_space(xdr, 20);
-       *p++ = cpu_to_be32(1); /* newoffset = TRUE */
-       p = xdr_encode_hyper(p, args->lastbytewritten);
+       if (args->lastbytewritten != U64_MAX) {
+               p = reserve_space(xdr, 20);
+               *p++ = cpu_to_be32(1); /* newoffset = TRUE */
+               p = xdr_encode_hyper(p, args->lastbytewritten);
+       } else {
+               p = reserve_space(xdr, 12);
+               *p++ = cpu_to_be32(0); /* newoffset = FALSE */
+       }
        *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
        *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
 
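
The 20- and 12-byte reservations above cover the loca_last_write_offset union plus the two fixed words that follow it; the paired pnfs.c change later in this series passes lastbytewritten == U64_MAX when end_pos is 0 so that no new offset is sent. A small sketch of that byte accounting (illustrative, plain C rather than the xdr_stream helpers):

#include <stdbool.h>
#include <stddef.h>

/* Bytes reserved after the stateid in LAYOUTCOMMIT4args. */
static size_t layoutcommit_tail_len(bool have_last_write_offset)
{
	size_t len = 4;			/* newoffset discriminant */

	if (have_last_write_offset)
		len += 8;		/* no_offset (offset4) */
	len += 4;			/* time_modify_changed = FALSE */
	len += 4;			/* layout type for loca_layoutupdate */
	return len;			/* 20 with an offset, 12 without */
}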
index 31c7763b94d58803c832e08f2d046f27de7c9c2f..2ca9167bc97d0468d7dddc284efbbae56ebd8bbf 100644 (file)
@@ -37,7 +37,6 @@
                        { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
                        { 1 << NFS_INO_STALE, "STALE" }, \
                        { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
-                       { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
                        { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
                        { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
                        { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
index 0fbe734cc38cb8d27ba2c8efaf985a676f4d0cd8..70806cae0d36bf71ca704d5d1ab7bc128d1455e9 100644 (file)
@@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
  * is required.
  * Note that caller must hold inode->i_lock.
  */
-static int
+int
 pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
                struct list_head *lseg_list)
 {
@@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
 }
 
 static void
-init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
+               const struct pnfs_layout_range *range,
+               const nfs4_stateid *stateid)
 {
        INIT_LIST_HEAD(&lseg->pls_list);
        INIT_LIST_HEAD(&lseg->pls_lc_list);
        atomic_set(&lseg->pls_refcount, 1);
-       smp_mb();
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
+       lseg->pls_range = *range;
+       lseg->pls_seq = be32_to_cpu(stateid->seqid);
 }
 
 static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
@@ -486,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
               (end2 == NFS4_MAX_UINT64 || end2 > start1);
 }
 
-static bool
-should_free_lseg(const struct pnfs_layout_range *lseg_range,
-                const struct pnfs_layout_range *recall_range)
-{
-       return (recall_range->iomode == IOMODE_ANY ||
-               lseg_range->iomode == recall_range->iomode) &&
-              pnfs_lseg_range_intersecting(lseg_range, recall_range);
-}
-
 static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
                struct list_head *tmp_list)
 {
@@ -533,6 +527,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
        return (s32)(s1 - s2) > 0;
 }
 
+static bool
+pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
+                const struct pnfs_layout_range *recall_range)
+{
+       return (recall_range->iomode == IOMODE_ANY ||
+               lseg_range->iomode == recall_range->iomode) &&
+              pnfs_lseg_range_intersecting(lseg_range, recall_range);
+}
+
+static bool
+pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
+               const struct pnfs_layout_range *recall_range,
+               u32 seq)
+{
+       if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
+               return false;
+       if (recall_range == NULL)
+               return true;
+       return pnfs_should_free_range(&lseg->pls_range, recall_range);
+}
+
 /**
  * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
  * @lo: layout header containing the lsegs
@@ -562,10 +577,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
        if (list_empty(&lo->plh_segs))
                return 0;
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
-               if (!recall_range ||
-                   should_free_lseg(&lseg->pls_range, recall_range)) {
-                       if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
-                               continue;
+               if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
                        dprintk("%s: freeing lseg %p iomode %d seq %u"
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_seq,
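
Both pnfs_match_lseg_recall() above and the barrier logic in the next hunk depend on pnfs_seqid_is_newer(), whose signed-difference trick stays correct across seqid wraparound. A standalone illustration (userspace C, purely for demonstration):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same comparison as pnfs_seqid_is_newer(): s1 is "newer" when the
 * difference, reduced modulo 2^32 and reinterpreted as signed, is positive. */
static bool seqid_is_newer(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
	printf("%d\n", seqid_is_newer(5, 3));           /* 1: plainly newer */
	printf("%d\n", seqid_is_newer(3, 5));           /* 0: plainly older */
	printf("%d\n", seqid_is_newer(2, 0xfffffffeu)); /* 1: newer across the wrap */
	return 0;
}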
@@ -761,24 +773,25 @@ void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        bool update_barrier)
 {
-       u32 oldseq, newseq, new_barrier;
-       int empty = list_empty(&lo->plh_segs);
+       u32 oldseq, newseq, new_barrier = 0;
+       bool invalid = !pnfs_layout_is_valid(lo);
 
        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);
-       if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
+       if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
-               if (update_barrier) {
-                       new_barrier = be32_to_cpu(new->seqid);
-               } else {
-                       /* Because of wraparound, we want to keep the barrier
-                        * "close" to the current seqids.
-                        */
-                       new_barrier = newseq - atomic_read(&lo->plh_outstanding);
-               }
-               if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
-                       lo->plh_barrier = new_barrier;
+               /*
+                * Because of wraparound, we want to keep the barrier
+                * "close" to the current seqids.
+                */
+               new_barrier = newseq - atomic_read(&lo->plh_outstanding);
        }
+       if (update_barrier)
+               new_barrier = be32_to_cpu(new->seqid);
+       else if (new_barrier == 0)
+               return;
+       if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+               lo->plh_barrier = new_barrier;
 }
 
 static bool
@@ -873,15 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
        rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 }
 
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+       lo->plh_return_iomode = 0;
+       lo->plh_return_seq = 0;
+       clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
 static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
+               nfs4_stateid *stateid,
+               enum pnfs_iomode *iomode)
 {
        if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
                return false;
-       lo->plh_return_iomode = 0;
-       lo->plh_return_seq = 0;
        pnfs_get_layout_hdr(lo);
-       clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+       if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
+               if (stateid != NULL) {
+                       nfs4_stateid_copy(stateid, &lo->plh_stateid);
+                       if (lo->plh_return_seq != 0)
+                               stateid->seqid = cpu_to_be32(lo->plh_return_seq);
+               }
+               if (iomode != NULL)
+                       *iomode = lo->plh_return_iomode;
+               pnfs_clear_layoutreturn_info(lo);
+               return true;
+       }
+       if (stateid != NULL)
+               nfs4_stateid_copy(stateid, &lo->plh_stateid);
+       if (iomode != NULL)
+               *iomode = IOMODE_ANY;
        return true;
 }
 
@@ -949,10 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
                enum pnfs_iomode iomode;
                bool send;
 
-               nfs4_stateid_copy(&stateid, &lo->plh_stateid);
-               stateid.seqid = cpu_to_be32(lo->plh_return_seq);
-               iomode = lo->plh_return_iomode;
-               send = pnfs_prepare_layoutreturn(lo);
+               send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
                spin_unlock(&inode->i_lock);
                if (send) {
                        /* Send an async layoutreturn so we dont deadlock */
@@ -989,7 +1021,6 @@ _pnfs_return_layout(struct inode *ino)
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
-       nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
        /* Reference matched in nfs4_layoutreturn_release */
        pnfs_get_layout_hdr(lo);
        empty = list_empty(&lo->plh_segs);
@@ -1012,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino)
                goto out_put_layout_hdr;
        }
 
-       set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-       send = pnfs_prepare_layoutreturn(lo);
+       send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        if (send)
@@ -1080,11 +1110,10 @@ bool pnfs_roc(struct inode *ino)
                        goto out_noroc;
        }
 
-       nfs4_stateid_copy(&stateid, &lo->plh_stateid);
        /* always send layoutreturn if being marked so */
-       if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
-                                  &lo->plh_flags))
-               layoutreturn = pnfs_prepare_layoutreturn(lo);
+       if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+               layoutreturn = pnfs_prepare_layoutreturn(lo,
+                               &stateid, NULL);
 
        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
                /* If we are sending layoutreturn, invalidate all valid lsegs */
@@ -1132,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 
        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
-       pnfs_mark_layout_returned_if_empty(lo);
        if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
                lo->plh_barrier = barrier;
        spin_unlock(&ino->i_lock);
@@ -1505,7 +1533,7 @@ pnfs_update_layout(struct inode *ino,
        struct pnfs_layout_segment *lseg = NULL;
        nfs4_stateid stateid;
        long timeout = 0;
-       unsigned long giveup = jiffies + rpc_get_timeout(server->client);
+       unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
        bool first;
 
        if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
@@ -1645,33 +1673,44 @@ lookup_again:
        lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
        trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
                                 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+       atomic_dec(&lo->plh_outstanding);
        if (IS_ERR(lseg)) {
                switch(PTR_ERR(lseg)) {
-               case -ERECALLCONFLICT:
+               case -EBUSY:
                        if (time_after(jiffies, giveup))
                                lseg = NULL;
-                       /* Fallthrough */
-               case -EAGAIN:
-                       pnfs_put_layout_hdr(lo);
-                       if (first)
-                               pnfs_clear_first_layoutget(lo);
-                       if (lseg) {
-                               trace_pnfs_update_layout(ino, pos, count,
-                                       iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
-                               goto lookup_again;
+                       break;
+               case -ERECALLCONFLICT:
+                       /* Huh? We hold no layouts, how is there a recall? */
+                       if (first) {
+                               lseg = NULL;
+                               break;
                        }
+                       /* Destroy the existing layout and start over */
+                       if (time_after(jiffies, giveup))
+                               pnfs_destroy_layout(NFS_I(ino));
                        /* Fallthrough */
+               case -EAGAIN:
+                       break;
                default:
                        if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
                                pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
                                lseg = NULL;
                        }
+                       goto out_put_layout_hdr;
+               }
+               if (lseg) {
+                       if (first)
+                               pnfs_clear_first_layoutget(lo);
+                       trace_pnfs_update_layout(ino, pos, count,
+                               iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+                       pnfs_put_layout_hdr(lo);
+                       goto lookup_again;
                }
        } else {
                pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
        }
 
-       atomic_dec(&lo->plh_outstanding);
 out_put_layout_hdr:
        if (first)
                pnfs_clear_first_layoutget(lo);
@@ -1735,9 +1774,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
                return lseg;
        }
 
-       init_lseg(lo, lseg);
-       lseg->pls_range = res->range;
-       lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
+       pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
 
        spin_lock(&ino->i_lock);
        if (pnfs_layoutgets_blocked(lo)) {
@@ -1758,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
                 * inode invalid, and don't bother validating the stateid
                 * sequence number.
                 */
-               pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0);
+               pnfs_mark_layout_stateid_invalid(lo, &free_me);
 
                nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
                lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
        }
 
-       clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-
        pnfs_get_lseg(lseg);
        pnfs_layout_insert_lseg(lo, lseg, &free_me);
+       if (!pnfs_layout_is_valid(lo)) {
+               pnfs_clear_layoutreturn_info(lo);
+               clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+       }
+
 
        if (res->return_on_close)
                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
@@ -1787,14 +1827,14 @@ static void
 pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
                         u32 seq)
 {
-       if (lo->plh_return_iomode == iomode)
-               return;
-       if (lo->plh_return_iomode != 0)
+       if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
                iomode = IOMODE_ANY;
        lo->plh_return_iomode = iomode;
        set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
-       if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
+       if (seq != 0) {
+               WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
                lo->plh_return_seq = seq;
+       }
 }
 
 /**
@@ -1824,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
        assert_spin_locked(&lo->plh_inode->i_lock);
 
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
-               if (should_free_lseg(&lseg->pls_range, return_range)) {
+               if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
                        dprintk("%s: marking lseg %p iomode %d "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode,
@@ -1855,19 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
        bool return_now = false;
 
        spin_lock(&inode->i_lock);
-       pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq);
+       pnfs_set_plh_return_info(lo, range.iomode, 0);
        /*
         * mark all matching lsegs so that we are sure to have no live
         * segments at hand when sending layoutreturn. See pnfs_put_lseg()
         * for how it works.
         */
-       if (!pnfs_mark_matching_lsegs_return(lo, &free_me,
-                                               &range, lseg->pls_seq)) {
+       if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
                nfs4_stateid stateid;
-               enum pnfs_iomode iomode = lo->plh_return_iomode;
+               enum pnfs_iomode iomode;
 
-               nfs4_stateid_copy(&stateid, &lo->plh_stateid);
-               return_now = pnfs_prepare_layoutreturn(lo);
+               return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
                spin_unlock(&inode->i_lock);
                if (return_now)
                        pnfs_send_layoutreturn(lo, &stateid, iomode, false);
@@ -2382,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
        nfs_fattr_init(&data->fattr);
        data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
        data->res.fattr = &data->fattr;
-       data->args.lastbytewritten = end_pos - 1;
+       if (end_pos != 0)
+               data->args.lastbytewritten = end_pos - 1;
+       else
+               data->args.lastbytewritten = U64_MAX;
        data->res.server = NFS_SERVER(inode);
 
        if (ld->prepare_layoutcommit) {
index b21bd0bee784391b0b88483d9b9d3ee7cce91bde..31d99b2927b000a7d95afebc4df1ad4478c25096 100644 (file)
@@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
                                const struct pnfs_layout_range *recall_range,
                                u32 seq);
+int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+               struct list_head *lseg_list);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -375,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode)
        return NFS_I(inode)->layout != NULL;
 }
 
+static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
+{
+       return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0;
+}
+
 static inline struct nfs4_deviceid_node *
 nfs4_get_deviceid(struct nfs4_deviceid_node *d)
 {
@@ -545,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end)
        return 1 + end - offset;
 }
 
-/**
- * pnfs_mark_layout_returned_if_empty - marks the layout as returned
- * @lo: layout header
- *
- * Note: Caller must hold inode->i_lock
- */
-static inline void
-pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
-{
-       if (list_empty(&lo->plh_segs))
-               set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-}
-
 static inline void
 pnfs_copy_range(struct pnfs_layout_range *dst,
                const struct pnfs_layout_range *src)
@@ -628,6 +622,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync)
        return 0;
 }
 
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+       return false;
+}
+
+
 static inline bool
 pnfs_roc(struct inode *ino)
 {
@@ -716,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
        return false;
 }
 
-static inline bool
-pnfs_layoutcommit_outstanding(struct inode *inode)
-{
-       return false;
-}
-
-
 static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 {
        return NULL;
index b38e3c0dc7908566acd1c8bc9d51402e8069ef6a..f3468b57a32a32c71d7aa55fbf835ae064c16bcc 100644 (file)
@@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
 }
 
 static struct nfs_client *(*get_v3_ds_connect)(
-                       struct nfs_client *mds_clp,
+                       struct nfs_server *mds_srv,
                        const struct sockaddr *ds_addr,
                        int ds_addrlen,
                        int ds_proto,
@@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
                        rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
                                        rpc_clnt_test_and_add_xprt, NULL);
                } else
-                       clp = get_v3_ds_connect(mds_srv->nfs_client,
+                       clp = get_v3_ds_connect(mds_srv,
                                        (struct sockaddr *)&da->da_addr,
                                        da->da_addrlen, IPPROTO_TCP,
                                        timeo, retrans, au_flavor);
@@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
                dprintk("%s: DS %s: trying address %s\n",
                        __func__, ds->ds_remotestr, da->da_remotestr);
 
-               clp = nfs4_set_ds_client(mds_srv->nfs_client,
+               clp = nfs4_set_ds_client(mds_srv,
                                        (struct sockaddr *)&da->da_addr,
                                        da->da_addrlen, IPPROTO_TCP,
                                        timeo, retrans, minor_version,
@@ -940,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
 int
 pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
 {
+       int ret;
+
+       if (!pnfs_layoutcommit_outstanding(inode))
+               return 0;
+       ret = nfs_commit_inode(inode, FLUSH_SYNC);
+       if (ret < 0)
+               return ret;
        if (datasync)
                return 0;
        return pnfs_layoutcommit_inode(inode, true);
index 2137e0202f2560d39dc383a01ecf33fcfcf4958d..18d446e1a82bbb5b558fca8deb3869849f91e58c 100644 (file)
@@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
 {
        rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
        unsigned int i;
+       int use_auth_null = false;
 
        /*
         * If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
         *
         * AUTH_NULL has a special meaning when it's in the server list - it
         * means that the server will ignore the rpc creds, so any flavor
-        * can be used.
+        * can be used but still use the sec= that was specified.
         */
        for (i = 0; i < count; i++) {
                flavor = server_authlist[i];
 
-               if (nfs_auth_info_match(&args->auth_info, flavor) ||
-                   flavor == RPC_AUTH_NULL)
+               if (nfs_auth_info_match(&args->auth_info, flavor))
                        goto out;
+
+               if (flavor == RPC_AUTH_NULL)
+                       use_auth_null = true;
+       }
+
+       if (use_auth_null) {
+               flavor = RPC_AUTH_NULL;
+               goto out;
        }
 
        dfprintk(MOUNT,
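The nfs_verify_authflavors() hunk above changes the scan order: an exact match with the requested sec= flavor is preferred, and AUTH_NULL in the server's list only acts as a fallback once the whole list has been walked, rather than short-circuiting the search as before. A standalone sketch of that ordering (the helper, return values and flavor numbers are illustrative, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

#define AUTH_NULL 0

static int pick_flavor(int wanted, const int *server_list, int count)
{
        bool saw_auth_null = false;
        int i;

        for (i = 0; i < count; i++) {
                if (server_list[i] == wanted)
                        return wanted;          /* exact match wins */
                if (server_list[i] == AUTH_NULL)
                        saw_auth_null = true;
        }
        /* only after the full scan does AUTH_NULL act as a fallback */
        if (saw_auth_null)
                return AUTH_NULL;
        return -1;                              /* no usable flavor */
}

int main(void)
{
        int server_list[] = { AUTH_NULL, 390003 /* e.g. krb5 */ };

        printf("%d\n", pick_flavor(390003, server_list, 2)); /* 390003: exact match */
        printf("%d\n", pick_flavor(1, server_list, 2));       /* 0: AUTH_NULL fallback */
        return 0;
}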
index 593fa21a02c07a9dca3f45ce0ef8f87edad61a5f..3a6724c6eb5ffbd6e83e45354cb2d4d068577527 100644 (file)
@@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page,
        int err;
 
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
-       nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
+       nfs_pageio_init_write(&pgio, inode, 0,
                                false, &nfs_async_write_completion_ops);
        err = nfs_do_writepage(page, wbc, &pgio, launder);
        nfs_pageio_complete(&pgio);
@@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
        struct inode *inode = mapping->host;
-       unsigned long *bitlock = &NFS_I(inode)->flags;
        struct nfs_pageio_descriptor pgio;
        int err;
 
-       /* Stop dirtying of new pages while we sync */
-       err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
-                       nfs_wait_bit_killable, TASK_KILLABLE);
-       if (err)
-               goto out_err;
-
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
        nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
@@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
        err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
        nfs_pageio_complete(&pgio);
 
-       clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
-       smp_mb__after_atomic();
-       wake_up_bit(bitlock, NFS_INO_FLUSHING);
-
        if (err < 0)
                goto out_err;
        err = pgio.pg_error;
@@ -1195,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
 /*
  * Test if the open context credential key is marked to expire soon.
  */
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
 {
-       return rpcauth_cred_key_to_expire(ctx->cred);
+       struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+
+       return rpcauth_cred_key_to_expire(auth, ctx->cred);
 }
 
 /*
@@ -1289,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page,
        dprintk("NFS:       nfs_updatepage(%pD2 %d@%lld)\n",
                file, count, (long long)(page_file_offset(page) + offset));
 
+       if (!count)
+               goto out;
+
        if (nfs_can_extend_write(file, page, inode)) {
                count = max(count + offset, nfs_page_length(page));
                offset = 0;
@@ -1299,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page,
                nfs_set_pageerror(page);
        else
                __set_page_dirty_nobuffers(page);
-
+out:
        dprintk("NFS:       nfs_updatepage returns %d (isize %lld)\n",
                        status, (long long)i_size_read(inode));
        return status;
@@ -1800,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 
                /* Okay, COMMIT succeeded, apparently. Check the verifier
                 * returned by the server against all stored verfs. */
-               if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) {
+               if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
                        /* We have a match */
                        nfs_inode_remove_request(req);
                        dprintk(" OK\n");
@@ -1923,6 +1917,24 @@ out_mark_dirty:
 }
 EXPORT_SYMBOL_GPL(nfs_write_inode);
 
+/*
+ * Wrapper for filemap_write_and_wait_range()
+ *
+ * Needed for pNFS in order to ensure data becomes visible to the
+ * client.
+ */
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+               loff_t lstart, loff_t lend)
+{
+       int ret;
+
+       ret = filemap_write_and_wait_range(mapping, lstart, lend);
+       if (ret == 0)
+               ret = pnfs_sync_inode(mapping->host, true);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
+
 /*
  * flush the inode to disk.
  */
index d71278c3c5bdda030e3edbc03946a1b4282cd8ad..810124b33327c5db6b13b180c0a8d0f19b79295c 100644 (file)
@@ -205,12 +205,12 @@ struct nfs_inode {
 #define NFS_INO_STALE          (1)             /* possible stale inode */
 #define NFS_INO_ACL_LRU_SET    (2)             /* Inode is on the LRU list */
 #define NFS_INO_INVALIDATING   (3)             /* inode is being invalidated */
-#define NFS_INO_FLUSHING       (4)             /* inode is flushing out data */
 #define NFS_INO_FSCACHE                (5)             /* inode can be cached by FS-Cache */
 #define NFS_INO_FSCACHE_LOCK   (6)             /* FS-Cache cookie management lock */
 #define NFS_INO_LAYOUTCOMMIT   (9)             /* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)          /* layoutcommit inflight */
 #define NFS_INO_LAYOUTSTATS    (11)            /* layoutstats inflight */
+#define NFS_INO_ODIRECT                (12)            /* I/O setting is O_DIRECT */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
@@ -351,7 +351,6 @@ extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *ino
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_revalidate_mapping_rcu(struct inode *inode);
-extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *);
 extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
index c304a11b5b1aae93d5bb5a18ad995534f157b028..82b81a1c24382740366dfeea40ec11dd611372a6 100644 (file)
@@ -1596,9 +1596,8 @@ struct nfs_rpc_ops {
        int (*have_delegation)(struct inode *, fmode_t);
        int (*return_delegation)(struct inode *);
        struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *);
-       struct nfs_client *
-               (*init_client) (struct nfs_client *, const struct rpc_timeout *,
-                               const char *);
+       struct nfs_client *(*init_client) (struct nfs_client *,
+                               const struct nfs_client_initdata *);
        void    (*free_client) (struct nfs_client *);
        struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *);
        struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *,
index 899791573a403ba8434c12a05b4594f23ac6c33b..4ccf184e971f3572a1fe5aa105535e17d75e7989 100644 (file)
@@ -37,7 +37,6 @@ struct rpcsec_gss_info;
 
 /* auth_cred ac_flags bits */
 enum {
-       RPC_CRED_NO_CRKEY_TIMEOUT = 0, /* underlying cred has no key timeout */
        RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */
        RPC_CRED_NOTIFY_TIMEOUT = 2,   /* nofity generic cred when underlying
                                        key will expire soon */
@@ -82,6 +81,9 @@ struct rpc_cred {
 
 #define RPCAUTH_CRED_MAGIC     0x0f4aa4f0
 
+/* rpc_auth au_flags */
+#define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT  0x0001 /* underlying cred has no key timeout */
+
 /*
  * Client authentication handle
  */
@@ -107,6 +109,9 @@ struct rpc_auth {
        /* per-flavor data */
 };
 
+/* rpc_auth au_flags */
+#define RPCAUTH_AUTH_DATATOUCH 0x00000002
+
 struct rpc_auth_create_args {
        rpc_authflavor_t pseudoflavor;
        const char *target_name;
@@ -196,7 +201,7 @@ void                        rpcauth_destroy_credcache(struct rpc_auth *);
 void                   rpcauth_clear_credcache(struct rpc_cred_cache *);
 int                    rpcauth_key_timeout_notify(struct rpc_auth *,
                                                struct rpc_cred *);
-bool                   rpcauth_cred_key_to_expire(struct rpc_cred *);
+bool                   rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *);
 char *                 rpcauth_stringify_acceptor(struct rpc_cred *);
 
 static inline
index 1f911ccb2a75655bee357d27076b455776d4d377..68ec78c1aa48e1ec3e4b285420a69732a2b05c6a 100644 (file)
@@ -73,6 +73,7 @@ u32 gss_delete_sec_context(
 rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop,
                                        u32 service);
 u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor);
+bool gss_pseudoflavor_to_datatouch(struct gss_api_mech *, u32 pseudoflavor);
 char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service);
 
 struct pf_desc {
@@ -81,6 +82,7 @@ struct pf_desc {
        u32     service;
        char    *name;
        char    *auth_domain_name;
+       bool    datatouch;
 };
 
 /* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and
index 05a1809c44d99e59813576a1e2a6daf242e2a4be..817af0b4385ea384026b7f35e296e94211796974 100644 (file)
@@ -230,6 +230,10 @@ void               rpc_wake_up_queued_task(struct rpc_wait_queue *,
                                        struct rpc_task *);
 void           rpc_wake_up(struct rpc_wait_queue *);
 struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
+struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
+                                       struct rpc_wait_queue *,
+                                       bool (*)(struct rpc_task *, void *),
+                                       void *);
 struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *,
                                        bool (*)(struct rpc_task *, void *),
                                        void *);
@@ -247,6 +251,7 @@ void                rpc_show_tasks(struct net *);
 int            rpc_init_mempool(void);
 void           rpc_destroy_mempool(void);
 extern struct workqueue_struct *rpciod_workqueue;
+extern struct workqueue_struct *xprtiod_workqueue;
 void           rpc_prepare_task(struct rpc_task *task);
 
 static inline int rpc_wait_for_completion_task(struct rpc_task *task)
index 0ece4ba06f060aeb8e35927982efc07cbf285b66..bef3fb0abb8f4a6fa713ff7fe774630779c88803 100644 (file)
@@ -80,6 +80,7 @@ struct sock_xprt {
 #define TCP_RPC_REPLY          (1UL << 6)
 
 #define XPRT_SOCK_CONNECTING   1U
+#define XPRT_SOCK_DATA_READY   (2)
 
 #endif /* __KERNEL__ */
 
index 040ff627c18a52f463e6a27e06b3794f4b0b5b10..a7e42f9a405c1c6fde97fc3cc34628a45016b03c 100644 (file)
@@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
        ret = kstrtoul(val, 0, &num);
        if (ret == -EINVAL)
                goto out_inval;
-       nbits = fls(num);
-       if (num > (1U << nbits))
-               nbits++;
+       nbits = fls(num - 1);
        if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
                goto out_inval;
        *(unsigned int *)kp->arg = nbits;
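The param_set_hashtbl_sz hunk above replaces the two-step rounding with fls(num - 1), i.e. the number of bits needed to index at least num hash buckets (ceil(log2(num))). A quick userspace check of that identity, with fls() approximated by a GCC/Clang builtin rather than the kernel helper:

#include <stdio.h>

static int fls_approx(unsigned long x)
{
        /* highest set bit, 1-based, like the kernel's fls(); 0 for x == 0 */
        return x ? (int)(sizeof(unsigned long) * 8 - __builtin_clzl(x)) : 0;
}

int main(void)
{
        unsigned long sizes[] = { 15, 16, 17, 1024 };
        int i;

        for (i = 0; i < 4; i++)
                printf("num=%lu -> nbits=%d\n", sizes[i],
                       fls_approx(sizes[i] - 1));   /* 4, 4, 5, 10 */
        return 0;
}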
@@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
 EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
 
 bool
-rpcauth_cred_key_to_expire(struct rpc_cred *cred)
+rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred)
 {
+       if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
+               return false;
        if (!cred->cr_ops->crkey_to_expire)
                return false;
        return cred->cr_ops->crkey_to_expire(cred);
index 54dd3fdead54b8813bb6cbdf769f96a11d8f49ea..168219535a341056c22679273771dcf7d7656f28 100644 (file)
@@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
 
 
        /* Fast track for non crkey_timeout (no key) underlying credentials */
-       if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags))
+       if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
                return 0;
 
        /* Fast track for the normal case */
@@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
        if (IS_ERR(tcred))
                return -EACCES;
 
-       if (!tcred->cr_ops->crkey_timeout) {
-               set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags);
-               ret = 0;
-               goto out_put;
-       }
-
        /* Test for the almost error case */
        ret = tcred->cr_ops->crkey_timeout(tcred);
        if (ret != 0) {
@@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
                set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
        }
 
-out_put:
        put_rpccred(tcred);
        return ret;
 }
index e64ae93d5b4f618e216f2c55070ceef60bd737d5..23c8e7c3965651ad5ee03ee617ad92d06646802f 100644 (file)
@@ -1015,8 +1015,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
        auth = &gss_auth->rpc_auth;
        auth->au_cslack = GSS_CRED_SLACK >> 2;
        auth->au_rslack = GSS_VERF_SLACK >> 2;
+       auth->au_flags = 0;
        auth->au_ops = &authgss_ops;
        auth->au_flavor = flavor;
+       if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
+               auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
        atomic_set(&auth->au_count, 1);
        kref_init(&gss_auth->kref);
 
index 65427492b1c95f681f79345e7ce29c62d941e147..60595835317afbdd1c2ccc657cd727ed11477a6b 100644 (file)
@@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = {
                .qop = GSS_C_QOP_DEFAULT,
                .service = RPC_GSS_SVC_INTEGRITY,
                .name = "krb5i",
+               .datatouch = true,
        },
        [2] = {
                .pseudoflavor = RPC_AUTH_GSS_KRB5P,
                .qop = GSS_C_QOP_DEFAULT,
                .service = RPC_GSS_SVC_PRIVACY,
                .name = "krb5p",
+               .datatouch = true,
        },
 };
 
index 7063d856a598e3c4f1201c5df26ce86c6158538d..5fec3abbe19bb640de31bf85bfc9becdeab5f359 100644 (file)
@@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
 }
 EXPORT_SYMBOL(gss_pseudoflavor_to_service);
 
+bool
+gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+       int i;
+
+       for (i = 0; i < gm->gm_pf_num; i++) {
+               if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
+                       return gm->gm_pfs[i].datatouch;
+       }
+       return false;
+}
+
 char *
 gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
 {
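The gss_mech_switch hunk above adds a simple table lookup, gss_pseudoflavor_to_datatouch(), which gss_create_new() uses (earlier in this diff) to set RPCAUTH_AUTH_DATATOUCH for krb5i and krb5p, since those services touch the RPC data payload. A standalone sketch of the lookup over a pf_desc-style table; the table contents here are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct pf_desc {
        unsigned int pseudoflavor;
        const char *name;
        bool datatouch;
};

static const struct pf_desc gss_kerberos_pfs[] = {
        { 390003, "krb5",  false },
        { 390004, "krb5i", true  },
        { 390005, "krb5p", true  },
};

static bool pseudoflavor_to_datatouch(unsigned int pseudoflavor)
{
        unsigned int i;

        for (i = 0; i < sizeof(gss_kerberos_pfs) / sizeof(gss_kerberos_pfs[0]); i++)
                if (gss_kerberos_pfs[i].pseudoflavor == pseudoflavor)
                        return gss_kerberos_pfs[i].datatouch;
        return false;
}

int main(void)
{
        printf("krb5i datatouch: %d\n", pseudoflavor_to_datatouch(390004)); /* 1 */
        printf("krb5 datatouch:  %d\n", pseudoflavor_to_datatouch(390003)); /* 0 */
        return 0;
}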
index 8d9eb4d5ddd8fd861a70a30f9922a3203ad8bb6f..4d17376b2acb118fc1f17f2981f22eea676e3667 100644 (file)
@@ -115,6 +115,7 @@ static
 struct rpc_auth null_auth = {
        .au_cslack      = NUL_CALLSLACK,
        .au_rslack      = NUL_REPLYSLACK,
+       .au_flags       = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
        .au_ops         = &authnull_ops,
        .au_flavor      = RPC_AUTH_NULL,
        .au_count       = ATOMIC_INIT(0),
index 9f65452b7cbcbe380a36c0941176b5562a434f88..a99278c984e82a2156c20e3a4a7fa174783b9297 100644 (file)
@@ -228,6 +228,7 @@ static
 struct rpc_auth                unix_auth = {
        .au_cslack      = UNX_CALLSLACK,
        .au_rslack      = NUL_REPLYSLACK,
+       .au_flags       = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
        .au_ops         = &authunix_ops,
        .au_flavor      = RPC_AUTH_UNIX,
        .au_count       = ATOMIC_INIT(0),
index 2808d550d2730ff8e36b6d8c072c65e1631064c4..cb49898a5a58aacfadceda27a07ceb45eb88a8d3 100644 (file)
@@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata)
        kfree(data);
 }
 
-const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
+static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
        .rpc_call_done = rpc_cb_add_xprt_done,
        .rpc_release = rpc_cb_add_xprt_release,
 };
index fcfd48d263f64f1f52ef317a9a8974a8a457196e..9ae588511aafd9470736a4ff24a498c941b7ece3 100644 (file)
@@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue;
 /*
  * rpciod-related stuff
  */
-struct workqueue_struct *rpciod_workqueue;
+struct workqueue_struct *rpciod_workqueue __read_mostly;
+struct workqueue_struct *xprtiod_workqueue __read_mostly;
 
 /*
  * Disable the timer for a given RPC task. Should be called with
@@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
  * lockless RPC_IS_QUEUED() test) before we've had a chance to test
  * the RPC_TASK_RUNNING flag.
  */
-static void rpc_make_runnable(struct rpc_task *task)
+static void rpc_make_runnable(struct workqueue_struct *wq,
+               struct rpc_task *task)
 {
        bool need_wakeup = !rpc_test_and_set_running(task);
 
@@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task)
                return;
        if (RPC_IS_ASYNC(task)) {
                INIT_WORK(&task->u.tk_work, rpc_async_schedule);
-               queue_work(rpciod_workqueue, &task->u.tk_work);
+               queue_work(wq, &task->u.tk_work);
        } else
                wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
 }
@@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
 EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
 
 /**
- * __rpc_do_wake_up_task - wake up a single rpc_task
+ * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task
+ * @wq: workqueue on which to run task
  * @queue: wait queue
  * @task: task to be woken up
  *
  * Caller must hold queue->lock, and have cleared the task queued flag.
  */
-static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
+               struct rpc_wait_queue *queue,
+               struct rpc_task *task)
 {
        dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
                        task->tk_pid, jiffies);
@@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
 
        __rpc_remove_wait_queue(queue, task);
 
-       rpc_make_runnable(task);
+       rpc_make_runnable(wq, task);
 
        dprintk("RPC:       __rpc_wake_up_task done\n");
 }
@@ -436,15 +441,24 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
 /*
  * Wake up a queued task while the queue lock is being held
  */
-static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
+               struct rpc_wait_queue *queue, struct rpc_task *task)
 {
        if (RPC_IS_QUEUED(task)) {
                smp_rmb();
                if (task->tk_waitqueue == queue)
-                       __rpc_do_wake_up_task(queue, task);
+                       __rpc_do_wake_up_task_on_wq(wq, queue, task);
        }
 }
 
+/*
+ * Wake up a queued task while the queue lock is being held
+ */
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+       rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
+}
+
 /*
  * Wake up a task on a specific queue
  */
@@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue)
 /*
  * Wake up the first task on the wait queue.
  */
-struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
+               struct rpc_wait_queue *queue,
                bool (*func)(struct rpc_task *, void *), void *data)
 {
        struct rpc_task *task = NULL;
@@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
        task = __rpc_find_next_queued(queue);
        if (task != NULL) {
                if (func(task, data))
-                       rpc_wake_up_task_queue_locked(queue, task);
+                       rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
                else
                        task = NULL;
        }
@@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
 
        return task;
 }
+
+/*
+ * Wake up the first task on the wait queue.
+ */
+struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+               bool (*func)(struct rpc_task *, void *), void *data)
+{
+       return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data);
+}
 EXPORT_SYMBOL_GPL(rpc_wake_up_first);
 
 static bool rpc_wake_up_next_func(struct rpc_task *task, void *data)
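The scheduler hunks above generalise the wake-up path to take an explicit workqueue, keeping rpc_wake_up_first() and rpc_wake_up_task_queue_locked() as thin wrappers bound to rpciod, so that the transport code can direct its work to the new xprtiod queue instead. A minimal sketch of that wrapper pattern; the types and names below are stand-ins, not the SUNRPC API:

#include <stdio.h>

struct workqueue { const char *name; };

static struct workqueue rpciod  = { "rpciod" };
static struct workqueue xprtiod = { "xprtiod" };

/* new, generalised entry point takes the workqueue explicitly */
static void wake_first_on_wq(struct workqueue *wq, const char *task)
{
        printf("queueing %s on %s\n", task, wq->name);
}

/* old API preserved as a thin wrapper with the historical default */
static void wake_first(const char *task)
{
        wake_first_on_wq(&rpciod, task);
}

int main(void)
{
        wake_first("nfs4_renewd");                     /* old callers: rpciod */
        wake_first_on_wq(&xprtiod, "xprt_autoclose");  /* transport: xprtiod */
        return 0;
}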
@@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task)
        bool is_async = RPC_IS_ASYNC(task);
 
        rpc_set_active(task);
-       rpc_make_runnable(task);
+       rpc_make_runnable(rpciod_workqueue, task);
        if (!is_async)
                __rpc_execute(task);
 }
@@ -1071,10 +1095,22 @@ static int rpciod_start(void)
         * Create the rpciod thread and wait for it to start.
         */
        dprintk("RPC:       creating workqueue rpciod\n");
-       /* Note: highpri because network receive is latency sensitive */
-       wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+       wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
+       if (!wq)
+               goto out_failed;
        rpciod_workqueue = wq;
-       return rpciod_workqueue != NULL;
+       /* Note: highpri because network receive is latency sensitive */
+       wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+       if (!wq)
+               goto free_rpciod;
+       xprtiod_workqueue = wq;
+       return 1;
+free_rpciod:
+       wq = rpciod_workqueue;
+       rpciod_workqueue = NULL;
+       destroy_workqueue(wq);
+out_failed:
+       return 0;
 }
 
 static void rpciod_stop(void)
@@ -1088,6 +1124,9 @@ static void rpciod_stop(void)
        wq = rpciod_workqueue;
        rpciod_workqueue = NULL;
        destroy_workqueue(wq);
+       wq = xprtiod_workqueue;
+       xprtiod_workqueue = NULL;
+       destroy_workqueue(wq);
 }
 
 void
index cc9852897395c6f6db42653c773788446a9c5d3e..c5b0cb4f4056c4da0a0adc556cf46bebb48d2e7a 100644 (file)
@@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
                *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
 
                /* Encode reply */
-               if (test_bit(RQ_DROPME, &rqstp->rq_flags)) {
+               if (*statp == rpc_drop_reply ||
+                   test_bit(RQ_DROPME, &rqstp->rq_flags)) {
                        if (procp->pc_release)
                                procp->pc_release(rqstp, NULL, rqstp->rq_resp);
                        goto dropit;
                }
+               if (*statp == rpc_autherr_badcred) {
+                       if (procp->pc_release)
+                               procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+                       goto err_bad_auth;
+               }
                if (*statp == rpc_success &&
                    (xdr = procp->pc_encode) &&
                    !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
index 216a1385718a27e9f86516720d28d61b9aaac609..8313960cac524dd36d220f9b55d124435400f25a 100644 (file)
@@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
                clear_bit(XPRT_LOCKED, &xprt->state);
                smp_mb__after_atomic();
        } else
-               queue_work(rpciod_workqueue, &xprt->task_cleanup);
+               queue_work(xprtiod_workqueue, &xprt->task_cleanup);
 }
 
 /*
@@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
        if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
                return;
 
-       if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt))
+       if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
+                               __xprt_lock_write_func, xprt))
                return;
        xprt_clear_locked(xprt);
 }
@@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
                return;
        if (RPCXPRT_CONGESTED(xprt))
                goto out_unlock;
-       if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt))
+       if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
+                               __xprt_lock_write_cong_func, xprt))
                return;
 out_unlock:
        xprt_clear_locked(xprt);
@@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
        set_bit(XPRT_CLOSE_WAIT, &xprt->state);
        /* Try to schedule an autoclose RPC call */
        if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
-               queue_work(rpciod_workqueue, &xprt->task_cleanup);
+               queue_work(xprtiod_workqueue, &xprt->task_cleanup);
        xprt_wake_pending_tasks(xprt, -EAGAIN);
        spin_unlock_bh(&xprt->transport_lock);
 }
@@ -672,7 +674,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
        set_bit(XPRT_CLOSE_WAIT, &xprt->state);
        /* Try to schedule an autoclose RPC call */
        if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
-               queue_work(rpciod_workqueue, &xprt->task_cleanup);
+               queue_work(xprtiod_workqueue, &xprt->task_cleanup);
        xprt_wake_pending_tasks(xprt, -EAGAIN);
 out:
        spin_unlock_bh(&xprt->transport_lock);
@@ -689,7 +691,7 @@ xprt_init_autodisconnect(unsigned long data)
        if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
                goto out_abort;
        spin_unlock(&xprt->transport_lock);
-       queue_work(rpciod_workqueue, &xprt->task_cleanup);
+       queue_work(xprtiod_workqueue, &xprt->task_cleanup);
        return;
 out_abort:
        spin_unlock(&xprt->transport_lock);
index e7fd76975d86ff433912040bee03ba8416fc7a2d..66c9d63f4797bbd9a9ffd655925797dce9503952 100644 (file)
@@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi,
                xprt_switch_find_xprt_t find_next)
 {
        struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
-       struct list_head *head;
 
        if (xps == NULL)
                return NULL;
-       head = &xps->xps_xprt_list;
-       if (xps->xps_nxprts < 2)
-               return xprt_switch_find_first_entry(head);
-       return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next);
+       return xprt_switch_set_next_cursor(&xps->xps_xprt_list,
+                       &xpi->xpi_cursor,
+                       find_next);
 }
 
 static
index dc9f3b513a05f3a2fc9027359f9e1e587f9d243b..ef19fa42c50ff2e15ecdee7d87f1207ffc3c445c 100644 (file)
@@ -1,7 +1,7 @@
 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
 
 rpcrdma-y := transport.o rpc_rdma.o verbs.o \
-       fmr_ops.o frwr_ops.o physical_ops.o \
+       fmr_ops.o frwr_ops.o \
        svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
        svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
        module.o
index 6326ebe8b5951a95c3e488bee905f6fa6f00a62b..21cb3b150b371dc5e8aab5e02bc4af99a6a3d856 100644 (file)
  * verb (fmr_op_unmap).
  */
 
-/* Transport recovery
- *
- * After a transport reconnect, fmr_op_map re-uses the MR already
- * allocated for the RPC, but generates a fresh rkey then maps the
- * MR again. This process is synchronous.
- */
-
 #include "xprt_rdma.h"
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 /* Maximum scatter/gather per FMR */
 #define RPCRDMA_MAX_FMR_SGES   (64)
 
-static struct workqueue_struct *fmr_recovery_wq;
-
-#define FMR_RECOVERY_WQ_FLAGS          (WQ_UNBOUND)
+/* Access mode of externally registered pages */
+enum {
+       RPCRDMA_FMR_ACCESS_FLAGS        = IB_ACCESS_REMOTE_WRITE |
+                                         IB_ACCESS_REMOTE_READ,
+};
 
-int
-fmr_alloc_recovery_wq(void)
+bool
+fmr_is_supported(struct rpcrdma_ia *ia)
 {
-       fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
-       return !fmr_recovery_wq ? -ENOMEM : 0;
+       if (!ia->ri_device->alloc_fmr) {
+               pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
+                       ia->ri_device->name);
+               return false;
+       }
+       return true;
 }
 
-void
-fmr_destroy_recovery_wq(void)
+static int
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
 {
-       struct workqueue_struct *wq;
+       static struct ib_fmr_attr fmr_attr = {
+               .max_pages      = RPCRDMA_MAX_FMR_SGES,
+               .max_maps       = 1,
+               .page_shift     = PAGE_SHIFT
+       };
 
-       if (!fmr_recovery_wq)
-               return;
+       mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
+                                      sizeof(u64), GFP_KERNEL);
+       if (!mw->fmr.fm_physaddrs)
+               goto out_free;
 
-       wq = fmr_recovery_wq;
-       fmr_recovery_wq = NULL;
-       destroy_workqueue(wq);
+       mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
+                           sizeof(*mw->mw_sg), GFP_KERNEL);
+       if (!mw->mw_sg)
+               goto out_free;
+
+       sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
+
+       mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
+                                    &fmr_attr);
+       if (IS_ERR(mw->fmr.fm_mr))
+               goto out_fmr_err;
+
+       return 0;
+
+out_fmr_err:
+       dprintk("RPC:       %s: ib_alloc_fmr returned %ld\n", __func__,
+               PTR_ERR(mw->fmr.fm_mr));
+
+out_free:
+       kfree(mw->mw_sg);
+       kfree(mw->fmr.fm_physaddrs);
+       return -ENOMEM;
 }
 
 static int
 __fmr_unmap(struct rpcrdma_mw *mw)
 {
        LIST_HEAD(l);
+       int rc;
 
-       list_add(&mw->fmr.fmr->list, &l);
-       return ib_unmap_fmr(&l);
+       list_add(&mw->fmr.fm_mr->list, &l);
+       rc = ib_unmap_fmr(&l);
+       list_del_init(&mw->fmr.fm_mr->list);
+       return rc;
 }
 
-/* Deferred reset of a single FMR. Generate a fresh rkey by
- * replacing the MR. There's no recovery if this fails.
- */
 static void
-__fmr_recovery_worker(struct work_struct *work)
+fmr_op_release_mr(struct rpcrdma_mw *r)
 {
-       struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
-                                           mw_work);
-       struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+       LIST_HEAD(unmap_list);
+       int rc;
 
-       __fmr_unmap(mw);
-       rpcrdma_put_mw(r_xprt, mw);
-       return;
+       /* Ensure MW is not on any rl_registered list */
+       if (!list_empty(&r->mw_list))
+               list_del(&r->mw_list);
+
+       kfree(r->fmr.fm_physaddrs);
+       kfree(r->mw_sg);
+
+       /* In case this one was left mapped, try to unmap it
+        * to prevent dealloc_fmr from failing with EBUSY
+        */
+       rc = __fmr_unmap(r);
+       if (rc)
+               pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
+                      r, rc);
+
+       rc = ib_dealloc_fmr(r->fmr.fm_mr);
+       if (rc)
+               pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
+                      r, rc);
+
+       kfree(r);
 }
 
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
+/* Reset of a single FMR.
  */
 static void
-__fmr_queue_recovery(struct rpcrdma_mw *mw)
+fmr_op_recover_mr(struct rpcrdma_mw *mw)
 {
-       INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
-       queue_work(fmr_recovery_wq, &mw->mw_work);
+       struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+       int rc;
+
+       /* ORDER: invalidate first */
+       rc = __fmr_unmap(mw);
+
+       /* ORDER: then DMA unmap */
+       ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+                       mw->mw_sg, mw->mw_nents, mw->mw_dir);
+       if (rc)
+               goto out_release;
+
+       rpcrdma_put_mw(r_xprt, mw);
+       r_xprt->rx_stats.mrs_recovered++;
+       return;
+
+out_release:
+       pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
+       r_xprt->rx_stats.mrs_orphaned++;
+
+       spin_lock(&r_xprt->rx_buf.rb_mwlock);
+       list_del(&mw->mw_all);
+       spin_unlock(&r_xprt->rx_buf.rb_mwlock);
+
+       fmr_op_release_mr(mw);
 }
 
 static int
@@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
                     RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
 }
 
-static int
-fmr_op_init(struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
-       struct ib_fmr_attr fmr_attr = {
-               .max_pages      = RPCRDMA_MAX_FMR_SGES,
-               .max_maps       = 1,
-               .page_shift     = PAGE_SHIFT
-       };
-       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-       struct rpcrdma_mw *r;
-       int i, rc;
-
-       spin_lock_init(&buf->rb_mwlock);
-       INIT_LIST_HEAD(&buf->rb_mws);
-       INIT_LIST_HEAD(&buf->rb_all);
-
-       i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
-       i += 2;                         /* head + tail */
-       i *= buf->rb_max_requests;      /* one set for each RPC slot */
-       dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
-
-       rc = -ENOMEM;
-       while (i--) {
-               r = kzalloc(sizeof(*r), GFP_KERNEL);
-               if (!r)
-                       goto out;
-
-               r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
-                                          sizeof(u64), GFP_KERNEL);
-               if (!r->fmr.physaddrs)
-                       goto out_free;
-
-               r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
-               if (IS_ERR(r->fmr.fmr))
-                       goto out_fmr_err;
-
-               r->mw_xprt = r_xprt;
-               list_add(&r->mw_list, &buf->rb_mws);
-               list_add(&r->mw_all, &buf->rb_all);
-       }
-       return 0;
-
-out_fmr_err:
-       rc = PTR_ERR(r->fmr.fmr);
-       dprintk("RPC:       %s: ib_alloc_fmr status %i\n", __func__, rc);
-       kfree(r->fmr.physaddrs);
-out_free:
-       kfree(r);
-out:
-       return rc;
-}
-
 /* Use the ib_map_phys_fmr() verb to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
 static int
 fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-          int nsegs, bool writing)
+          int nsegs, bool writing, struct rpcrdma_mw **out)
 {
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct ib_device *device = ia->ri_device;
-       enum dma_data_direction direction = rpcrdma_data_dir(writing);
        struct rpcrdma_mr_seg *seg1 = seg;
        int len, pageoff, i, rc;
        struct rpcrdma_mw *mw;
+       u64 *dma_pages;
 
-       mw = seg1->rl_mw;
-       seg1->rl_mw = NULL;
-       if (!mw) {
-               mw = rpcrdma_get_mw(r_xprt);
-               if (!mw)
-                       return -ENOMEM;
-       } else {
-               /* this is a retransmit; generate a fresh rkey */
-               rc = __fmr_unmap(mw);
-               if (rc)
-                       return rc;
-       }
+       mw = rpcrdma_get_mw(r_xprt);
+       if (!mw)
+               return -ENOBUFS;
 
        pageoff = offset_in_page(seg1->mr_offset);
        seg1->mr_offset -= pageoff;     /* start of page */
@@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        if (nsegs > RPCRDMA_MAX_FMR_SGES)
                nsegs = RPCRDMA_MAX_FMR_SGES;
        for (i = 0; i < nsegs;) {
-               rpcrdma_map_one(device, seg, direction);
-               mw->fmr.physaddrs[i] = seg->mr_dma;
+               if (seg->mr_page)
+                       sg_set_page(&mw->mw_sg[i],
+                                   seg->mr_page,
+                                   seg->mr_len,
+                                   offset_in_page(seg->mr_offset));
+               else
+                       sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
+                                  seg->mr_len);
                len += seg->mr_len;
                ++seg;
                ++i;
@@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
                        break;
        }
-
-       rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs,
-                            i, seg1->mr_dma);
+       mw->mw_nents = i;
+       mw->mw_dir = rpcrdma_data_dir(writing);
+       if (i == 0)
+               goto out_dmamap_err;
+
+       if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
+                          mw->mw_sg, mw->mw_nents, mw->mw_dir))
+               goto out_dmamap_err;
+
+       for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
+               dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
+       rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
+                            dma_pages[0]);
        if (rc)
                goto out_maperr;
 
-       seg1->rl_mw = mw;
-       seg1->mr_rkey = mw->fmr.fmr->rkey;
-       seg1->mr_base = seg1->mr_dma + pageoff;
-       seg1->mr_nsegs = i;
-       seg1->mr_len = len;
-       return i;
+       mw->mw_handle = mw->fmr.fm_mr->rkey;
+       mw->mw_length = len;
+       mw->mw_offset = dma_pages[0] + pageoff;
 
-out_maperr:
-       dprintk("RPC:       %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
-               __func__, len, (unsigned long long)seg1->mr_dma,
-               pageoff, i, rc);
-       while (i--)
-               rpcrdma_unmap_one(device, --seg);
-       return rc;
-}
+       *out = mw;
+       return mw->mw_nents;
 
-static void
-__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
-       struct ib_device *device = r_xprt->rx_ia.ri_device;
-       int nsegs = seg->mr_nsegs;
+out_dmamap_err:
+       pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
+              mw->mw_sg, mw->mw_nents);
+       rpcrdma_defer_mr_recovery(mw);
+       return -EIO;
 
-       while (nsegs--)
-               rpcrdma_unmap_one(device, seg++);
+out_maperr:
+       pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+              len, (unsigned long long)dma_pages[0],
+              pageoff, mw->mw_nents, rc);
+       rpcrdma_defer_mr_recovery(mw);
+       return -EIO;
 }
 
 /* Invalidate all memory regions that were registered for "req".
  *
  * Sleeps until it is safe for the host CPU to access the
  * previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
  */
 static void
 fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
-       struct rpcrdma_mr_seg *seg;
-       unsigned int i, nchunks;
-       struct rpcrdma_mw *mw;
+       struct rpcrdma_mw *mw, *tmp;
        LIST_HEAD(unmap_list);
        int rc;
 
@@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
        /* ORDER: Invalidate all of the req's MRs first
         *
         * ib_unmap_fmr() is slow, so use a single call instead
-        * of one call per mapped MR.
+        * of one call per mapped FMR.
         */
-       for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
-               seg = &req->rl_segments[i];
-               mw = seg->rl_mw;
-
-               list_add(&mw->fmr.fmr->list, &unmap_list);
-
-               i += seg->mr_nsegs;
-       }
+       list_for_each_entry(mw, &req->rl_registered, mw_list)
+               list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
        rc = ib_unmap_fmr(&unmap_list);
        if (rc)
-               pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+               goto out_reset;
 
        /* ORDER: Now DMA unmap all of the req's MRs, and return
         * them to the free MW list.
         */
-       for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
-               seg = &req->rl_segments[i];
+       list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+               list_del_init(&mw->mw_list);
+               list_del_init(&mw->fmr.fm_mr->list);
+               ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+                               mw->mw_sg, mw->mw_nents, mw->mw_dir);
+               rpcrdma_put_mw(r_xprt, mw);
+       }
 
-               __fmr_dma_unmap(r_xprt, seg);
-               rpcrdma_put_mw(r_xprt, seg->rl_mw);
+       return;
 
-               i += seg->mr_nsegs;
-               seg->mr_nsegs = 0;
-               seg->rl_mw = NULL;
-       }
+out_reset:
+       pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
 
-       req->rl_nchunks = 0;
+       list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+               list_del_init(&mw->fmr.fm_mr->list);
+               fmr_op_recover_mr(mw);
+       }
 }
 
 /* Use a slow, safe mechanism to invalidate all memory regions
  * that were registered for "req".
- *
- * In the asynchronous case, DMA unmapping occurs first here
- * because the rpcrdma_mr_seg is released immediately after this
- * call. It's contents won't be available in __fmr_dma_unmap later.
- * FIXME.
  */
 static void
 fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
                  bool sync)
 {
-       struct rpcrdma_mr_seg *seg;
        struct rpcrdma_mw *mw;
-       unsigned int i;
-
-       for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
-               seg = &req->rl_segments[i];
-               mw = seg->rl_mw;
-
-               if (sync) {
-                       /* ORDER */
-                       __fmr_unmap(mw);
-                       __fmr_dma_unmap(r_xprt, seg);
-                       rpcrdma_put_mw(r_xprt, mw);
-               } else {
-                       __fmr_dma_unmap(r_xprt, seg);
-                       __fmr_queue_recovery(mw);
-               }
-
-               i += seg->mr_nsegs;
-               seg->mr_nsegs = 0;
-               seg->rl_mw = NULL;
-       }
-}
-
-static void
-fmr_op_destroy(struct rpcrdma_buffer *buf)
-{
-       struct rpcrdma_mw *r;
-       int rc;
-
-       while (!list_empty(&buf->rb_all)) {
-               r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-               list_del(&r->mw_all);
-               kfree(r->fmr.physaddrs);
 
-               rc = ib_dealloc_fmr(r->fmr.fmr);
-               if (rc)
-                       dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
-                               __func__, rc);
+       while (!list_empty(&req->rl_registered)) {
+               mw = list_first_entry(&req->rl_registered,
+                                     struct rpcrdma_mw, mw_list);
+               list_del_init(&mw->mw_list);
 
-               kfree(r);
+               if (sync)
+                       fmr_op_recover_mr(mw);
+               else
+                       rpcrdma_defer_mr_recovery(mw);
        }
 }
 
@@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
        .ro_map                         = fmr_op_map,
        .ro_unmap_sync                  = fmr_op_unmap_sync,
        .ro_unmap_safe                  = fmr_op_unmap_safe,
+       .ro_recover_mr                  = fmr_op_recover_mr,
        .ro_open                        = fmr_op_open,
        .ro_maxpages                    = fmr_op_maxpages,
-       .ro_init                        = fmr_op_init,
-       .ro_destroy                     = fmr_op_destroy,
+       .ro_init_mr                     = fmr_op_init_mr,
+       .ro_release_mr                  = fmr_op_release_mr,
        .ro_displayname                 = "fmr",
 };
index c0947544babeb976eea94ae4698ce955bd1a3c37..892b5e1d9b099b217fba65a26b47105a0c027c51 100644 (file)
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
-static struct workqueue_struct *frwr_recovery_wq;
-
-#define FRWR_RECOVERY_WQ_FLAGS         (WQ_UNBOUND | WQ_MEM_RECLAIM)
+bool
+frwr_is_supported(struct rpcrdma_ia *ia)
+{
+       struct ib_device_attr *attrs = &ia->ri_device->attrs;
+
+       if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+               goto out_not_supported;
+       if (attrs->max_fast_reg_page_list_len == 0)
+               goto out_not_supported;
+       return true;
+
+out_not_supported:
+       pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
+               ia->ri_device->name);
+       return false;
+}
 
-int
-frwr_alloc_recovery_wq(void)
+static int
+frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
 {
-       frwr_recovery_wq = alloc_workqueue("frwr_recovery",
-                                          FRWR_RECOVERY_WQ_FLAGS, 0);
-       return !frwr_recovery_wq ? -ENOMEM : 0;
+       unsigned int depth = ia->ri_max_frmr_depth;
+       struct rpcrdma_frmr *f = &r->frmr;
+       int rc;
+
+       f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
+       if (IS_ERR(f->fr_mr))
+               goto out_mr_err;
+
+       r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
+       if (!r->mw_sg)
+               goto out_list_err;
+
+       sg_init_table(r->mw_sg, depth);
+       init_completion(&f->fr_linv_done);
+       return 0;
+
+out_mr_err:
+       rc = PTR_ERR(f->fr_mr);
+       dprintk("RPC:       %s: ib_alloc_mr status %i\n",
+               __func__, rc);
+       return rc;
+
+out_list_err:
+       rc = -ENOMEM;
+       dprintk("RPC:       %s: sg allocation failure\n",
+               __func__);
+       ib_dereg_mr(f->fr_mr);
+       return rc;
 }
 
-void
-frwr_destroy_recovery_wq(void)
+static void
+frwr_op_release_mr(struct rpcrdma_mw *r)
 {
-       struct workqueue_struct *wq;
+       int rc;
 
-       if (!frwr_recovery_wq)
-               return;
+       /* Ensure MW is not on any rl_registered list */
+       if (!list_empty(&r->mw_list))
+               list_del(&r->mw_list);
 
-       wq = frwr_recovery_wq;
-       frwr_recovery_wq = NULL;
-       destroy_workqueue(wq);
+       rc = ib_dereg_mr(r->frmr.fr_mr);
+       if (rc)
+               pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
+                      r, rc);
+       kfree(r->mw_sg);
+       kfree(r);
 }
 
 static int
@@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
        return 0;
 }
 
-static void
-__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
-{
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_frmr *f = &mw->frmr;
-       int rc;
-
-       rc = __frwr_reset_mr(ia, mw);
-       ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
-       if (rc)
-               return;
-
-       rpcrdma_put_mw(r_xprt, mw);
-}
-
-/* Deferred reset of a single FRMR. Generate a fresh rkey by
- * replacing the MR.
+/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
  *
  * There's no recovery if this fails. The FRMR is abandoned, but
  * remains in rb_all. It will be cleaned up when the transport is
  * destroyed.
  */
 static void
-__frwr_recovery_worker(struct work_struct *work)
-{
-       struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
-                                           mw_work);
-
-       __frwr_reset_and_unmap(r->mw_xprt, r);
-       return;
-}
-
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
- */
-static void
-__frwr_queue_recovery(struct rpcrdma_mw *r)
-{
-       INIT_WORK(&r->mw_work, __frwr_recovery_worker);
-       queue_work(frwr_recovery_wq, &r->mw_work);
-}
-
-static int
-__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
-           unsigned int depth)
+frwr_op_recover_mr(struct rpcrdma_mw *mw)
 {
-       struct rpcrdma_frmr *f = &r->frmr;
+       struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        int rc;
 
-       f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
-       if (IS_ERR(f->fr_mr))
-               goto out_mr_err;
-
-       f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
-       if (!f->fr_sg)
-               goto out_list_err;
-
-       sg_init_table(f->fr_sg, depth);
-
-       init_completion(&f->fr_linv_done);
-
-       return 0;
+       rc = __frwr_reset_mr(ia, mw);
+       ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
+       if (rc)
+               goto out_release;
 
-out_mr_err:
-       rc = PTR_ERR(f->fr_mr);
-       dprintk("RPC:       %s: ib_alloc_mr status %i\n",
-               __func__, rc);
-       return rc;
+       rpcrdma_put_mw(r_xprt, mw);
+       r_xprt->rx_stats.mrs_recovered++;
+       return;
 
-out_list_err:
-       rc = -ENOMEM;
-       dprintk("RPC:       %s: sg allocation failure\n",
-               __func__);
-       ib_dereg_mr(f->fr_mr);
-       return rc;
-}
+out_release:
+       pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
+       r_xprt->rx_stats.mrs_orphaned++;
 
-static void
-__frwr_release(struct rpcrdma_mw *r)
-{
-       int rc;
+       spin_lock(&r_xprt->rx_buf.rb_mwlock);
+       list_del(&mw->mw_all);
+       spin_unlock(&r_xprt->rx_buf.rb_mwlock);
 
-       rc = ib_dereg_mr(r->frmr.fr_mr);
-       if (rc)
-               dprintk("RPC:       %s: ib_dereg_mr status %i\n",
-                       __func__, rc);
-       kfree(r->frmr.fr_sg);
+       frwr_op_release_mr(mw);
 }
 
 static int
@@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
        complete_all(&frmr->fr_linv_done);
 }
 
-static int
-frwr_op_init(struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct ib_device *device = r_xprt->rx_ia.ri_device;
-       unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
-       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-       int i;
-
-       spin_lock_init(&buf->rb_mwlock);
-       INIT_LIST_HEAD(&buf->rb_mws);
-       INIT_LIST_HEAD(&buf->rb_all);
-
-       i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
-       i += 2;                         /* head + tail */
-       i *= buf->rb_max_requests;      /* one set for each RPC slot */
-       dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
-
-       while (i--) {
-               struct rpcrdma_mw *r;
-               int rc;
-
-               r = kzalloc(sizeof(*r), GFP_KERNEL);
-               if (!r)
-                       return -ENOMEM;
-
-               rc = __frwr_init(r, pd, device, depth);
-               if (rc) {
-                       kfree(r);
-                       return rc;
-               }
-
-               r->mw_xprt = r_xprt;
-               list_add(&r->mw_list, &buf->rb_mws);
-               list_add(&r->mw_all, &buf->rb_all);
-       }
-
-       return 0;
-}
-
-/* Post a FAST_REG Work Request to register a memory region
+/* Post a REG_MR Work Request to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
 static int
 frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-           int nsegs, bool writing)
+           int nsegs, bool writing, struct rpcrdma_mw **out)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct ib_device *device = ia->ri_device;
-       enum dma_data_direction direction = rpcrdma_data_dir(writing);
-       struct rpcrdma_mr_seg *seg1 = seg;
        struct rpcrdma_mw *mw;
        struct rpcrdma_frmr *frmr;
        struct ib_mr *mr;
@@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        int rc, i, n, dma_nents;
        u8 key;
 
-       mw = seg1->rl_mw;
-       seg1->rl_mw = NULL;
+       mw = NULL;
        do {
                if (mw)
-                       __frwr_queue_recovery(mw);
+                       rpcrdma_defer_mr_recovery(mw);
                mw = rpcrdma_get_mw(r_xprt);
                if (!mw)
-                       return -ENOMEM;
+                       return -ENOBUFS;
        } while (mw->frmr.fr_state != FRMR_IS_INVALID);
        frmr = &mw->frmr;
        frmr->fr_state = FRMR_IS_VALID;
@@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 
        if (nsegs > ia->ri_max_frmr_depth)
                nsegs = ia->ri_max_frmr_depth;
-
        for (i = 0; i < nsegs;) {
                if (seg->mr_page)
-                       sg_set_page(&frmr->fr_sg[i],
+                       sg_set_page(&mw->mw_sg[i],
                                    seg->mr_page,
                                    seg->mr_len,
                                    offset_in_page(seg->mr_offset));
                else
-                       sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
+                       sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
                                   seg->mr_len);
 
                ++seg;
@@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
                        break;
        }
-       frmr->fr_nents = i;
-       frmr->fr_dir = direction;
-
-       dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
-       if (!dma_nents) {
-               pr_err("RPC:       %s: failed to dma map sg %p sg_nents %u\n",
-                      __func__, frmr->fr_sg, frmr->fr_nents);
-               return -ENOMEM;
-       }
+       mw->mw_nents = i;
+       mw->mw_dir = rpcrdma_data_dir(writing);
+       if (i == 0)
+               goto out_dmamap_err;
 
-       n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
-       if (unlikely(n != frmr->fr_nents)) {
-               pr_err("RPC:       %s: failed to map mr %p (%u/%u)\n",
-                      __func__, frmr->fr_mr, n, frmr->fr_nents);
-               rc = n < 0 ? n : -EINVAL;
-               goto out_senderr;
-       }
+       dma_nents = ib_dma_map_sg(ia->ri_device,
+                                 mw->mw_sg, mw->mw_nents, mw->mw_dir);
+       if (!dma_nents)
+               goto out_dmamap_err;
+
+       n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE);
+       if (unlikely(n != mw->mw_nents))
+               goto out_mapmr_err;
 
        dprintk("RPC:       %s: Using frmr %p to map %u segments (%u bytes)\n",
-               __func__, mw, frmr->fr_nents, mr->length);
+               __func__, mw, mw->mw_nents, mr->length);
 
        key = (u8)(mr->rkey & 0x000000FF);
        ib_update_fast_reg_key(mr, ++key);
@@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        if (rc)
                goto out_senderr;
 
-       seg1->rl_mw = mw;
-       seg1->mr_rkey = mr->rkey;
-       seg1->mr_base = mr->iova;
-       seg1->mr_nsegs = frmr->fr_nents;
-       seg1->mr_len = mr->length;
+       mw->mw_handle = mr->rkey;
+       mw->mw_length = mr->length;
+       mw->mw_offset = mr->iova;
+
+       *out = mw;
+       return mw->mw_nents;
 
-       return frmr->fr_nents;
+out_dmamap_err:
+       pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
+              mw->mw_sg, mw->mw_nents);
+       rpcrdma_defer_mr_recovery(mw);
+       return -EIO;
+
+out_mapmr_err:
+       pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
+              frmr->fr_mr, n, mw->mw_nents);
+       rpcrdma_defer_mr_recovery(mw);
+       return -EIO;
 
 out_senderr:
-       dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-       __frwr_queue_recovery(mw);
-       return rc;
+       pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
+       rpcrdma_defer_mr_recovery(mw);
+       return -ENOTCONN;
 }
 
 static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
 {
-       struct rpcrdma_mw *mw = seg->rl_mw;
        struct rpcrdma_frmr *f = &mw->frmr;
        struct ib_send_wr *invalidate_wr;
 
@@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
  *
  * Sleeps until it is safe for the host CPU to access the
  * previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
  */
 static void
 frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
        struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct rpcrdma_mr_seg *seg;
-       unsigned int i, nchunks;
+       struct rpcrdma_mw *mw, *tmp;
        struct rpcrdma_frmr *f;
-       struct rpcrdma_mw *mw;
        int rc;
 
        dprintk("RPC:       %s: req %p\n", __func__, req);
@@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * Chain the LOCAL_INV Work Requests and post them with
         * a single ib_post_send() call.
         */
+       f = NULL;
        invalidate_wrs = pos = prev = NULL;
-       seg = NULL;
-       for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
-               seg = &req->rl_segments[i];
-
-               pos = __frwr_prepare_linv_wr(seg);
+       list_for_each_entry(mw, &req->rl_registered, mw_list) {
+               pos = __frwr_prepare_linv_wr(mw);
 
                if (!invalidate_wrs)
                        invalidate_wrs = pos;
                else
                        prev->next = pos;
                prev = pos;
-
-               i += seg->mr_nsegs;
+               f = &mw->frmr;
        }
-       f = &seg->rl_mw->frmr;
 
        /* Strong send queue ordering guarantees that when the
         * last WR in the chain completes, all WRs in the chain
@@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         * them to the free MW list.
         */
 unmap:
-       for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
-               seg = &req->rl_segments[i];
-               mw = seg->rl_mw;
-               seg->rl_mw = NULL;
-
-               ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
-                               f->fr_dir);
+       list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+               list_del_init(&mw->mw_list);
+               ib_dma_unmap_sg(ia->ri_device,
+                               mw->mw_sg, mw->mw_nents, mw->mw_dir);
                rpcrdma_put_mw(r_xprt, mw);
-
-               i += seg->mr_nsegs;
-               seg->mr_nsegs = 0;
        }
-
-       req->rl_nchunks = 0;
        return;
 
 reset_mrs:
-       pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+       pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
+       rdma_disconnect(ia->ri_id);
 
        /* Find and reset the MRs in the LOCAL_INV WRs that did not
         * get posted. This is synchronous, and slow.
         */
-       for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
-               seg = &req->rl_segments[i];
-               mw = seg->rl_mw;
+       list_for_each_entry(mw, &req->rl_registered, mw_list) {
                f = &mw->frmr;
-
                if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
                        __frwr_reset_mr(ia, mw);
                        bad_wr = bad_wr->next;
                }
-
-               i += seg->mr_nsegs;
        }
        goto unmap;
 }
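
As a side note on the invalidation path just above: frwr_op_unmap_sync() collects one LOCAL_INV work request per registered MR and links them so a single post submits the whole chain. A minimal stand-alone sketch of that first/prev chaining pattern (plain C; the wr type and post_chain() are illustrative stand-ins, not the kernel ib_* API):

        #include <stdio.h>
        #include <stddef.h>

        /* Hypothetical stand-in for a work request; only ->next matters here. */
        struct wr {
                int id;
                struct wr *next;
        };

        /* "Post" an entire chain with one call, as frwr_op_unmap_sync chains
         * LOCAL_INV WRs before a single ib_post_send().
         */
        static void post_chain(struct wr *first)
        {
                for (struct wr *w = first; w; w = w->next)
                        printf("posting WR %d\n", w->id);
        }

        int main(void)
        {
                struct wr wrs[3] = { { .id = 1 }, { .id = 2 }, { .id = 3 } };
                struct wr *first = NULL, *prev = NULL;

                for (size_t i = 0; i < 3; i++) {
                        struct wr *pos = &wrs[i];

                        pos->next = NULL;
                        if (!first)             /* remember the head of the chain */
                                first = pos;
                        else                    /* append to the previous WR */
                                prev->next = pos;
                        prev = pos;
                }
                post_chain(first);              /* one submission for the whole chain */
                return 0;
        }
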
@@ -621,38 +552,17 @@ static void
 frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
                   bool sync)
 {
-       struct rpcrdma_mr_seg *seg;
        struct rpcrdma_mw *mw;
-       unsigned int i;
 
-       for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
-               seg = &req->rl_segments[i];
-               mw = seg->rl_mw;
+       while (!list_empty(&req->rl_registered)) {
+               mw = list_first_entry(&req->rl_registered,
+                                     struct rpcrdma_mw, mw_list);
+               list_del_init(&mw->mw_list);
 
                if (sync)
-                       __frwr_reset_and_unmap(r_xprt, mw);
+                       frwr_op_recover_mr(mw);
                else
-                       __frwr_queue_recovery(mw);
-
-               i += seg->mr_nsegs;
-               seg->mr_nsegs = 0;
-               seg->rl_mw = NULL;
-       }
-}
-
-static void
-frwr_op_destroy(struct rpcrdma_buffer *buf)
-{
-       struct rpcrdma_mw *r;
-
-       /* Ensure stale MWs for "buf" are no longer in flight */
-       flush_workqueue(frwr_recovery_wq);
-
-       while (!list_empty(&buf->rb_all)) {
-               r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-               list_del(&r->mw_all);
-               __frwr_release(r);
-               kfree(r);
+                       rpcrdma_defer_mr_recovery(mw);
        }
 }
 
@@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
        .ro_map                         = frwr_op_map,
        .ro_unmap_sync                  = frwr_op_unmap_sync,
        .ro_unmap_safe                  = frwr_op_unmap_safe,
+       .ro_recover_mr                  = frwr_op_recover_mr,
        .ro_open                        = frwr_op_open,
        .ro_maxpages                    = frwr_op_maxpages,
-       .ro_init                        = frwr_op_init,
-       .ro_destroy                     = frwr_op_destroy,
+       .ro_init_mr                     = frwr_op_init_mr,
+       .ro_release_mr                  = frwr_op_release_mr,
        .ro_displayname                 = "frwr",
 };
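
One detail of frwr_op_map() above worth spelling out: before re-posting a FASTREG WR, only the low-order byte of the MR's R_key is bumped, so each reuse of the same MR presents a fresh key while the upper 24 bits stay fixed. A small stand-alone sketch of that arithmetic (update_key() is a hypothetical stand-in for ib_update_fast_reg_key(); the starting value is made up):

        #include <stdio.h>
        #include <stdint.h>

        /* Keep the upper 24 bits, replace the low byte with the new key. */
        static uint32_t update_key(uint32_t rkey, uint8_t newkey)
        {
                return (rkey & 0xFFFFFF00u) | newkey;
        }

        int main(void)
        {
                uint32_t rkey = 0x00ABCDFF;             /* example current R_key */
                uint8_t key = (uint8_t)(rkey & 0x000000FF);

                rkey = update_key(rkey, ++key);         /* low byte wraps FF -> 00 */
                printf("next rkey: 0x%08X\n", (unsigned int)rkey);
                return 0;
        }
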
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
deleted file mode 100644 (file)
index 3750596..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2015 Oracle.  All rights reserved.
- * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
- */
-
-/* No-op chunk preparation. All client memory is pre-registered.
- * Sometimes referred to as ALLPHYSICAL mode.
- *
- * Physical registration is simple because all client memory is
- * pre-registered and never deregistered. This mode is good for
- * adapter bring up, but is considered not safe: the server is
- * trusted not to abuse its access to client memory not involved
- * in RDMA I/O.
- */
-
-#include "xprt_rdma.h"
-
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY       RPCDBG_TRANS
-#endif
-
-static int
-physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
-                struct rpcrdma_create_data_internal *cdata)
-{
-       struct ib_mr *mr;
-
-       /* Obtain an rkey to use for RPC data payloads.
-        */
-       mr = ib_get_dma_mr(ia->ri_pd,
-                          IB_ACCESS_LOCAL_WRITE |
-                          IB_ACCESS_REMOTE_WRITE |
-                          IB_ACCESS_REMOTE_READ);
-       if (IS_ERR(mr)) {
-               pr_err("%s: ib_get_dma_mr for failed with %lX\n",
-                      __func__, PTR_ERR(mr));
-               return -ENOMEM;
-       }
-       ia->ri_dma_mr = mr;
-
-       rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
-                                                     RPCRDMA_MAX_DATA_SEGS,
-                                                     RPCRDMA_MAX_HDR_SEGS));
-       return 0;
-}
-
-/* PHYSICAL memory registration conveys one page per chunk segment.
- */
-static size_t
-physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
-{
-       return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-                    RPCRDMA_MAX_HDR_SEGS);
-}
-
-static int
-physical_op_init(struct rpcrdma_xprt *r_xprt)
-{
-       return 0;
-}
-
-/* The client's physical memory is already exposed for
- * remote access via RDMA READ or RDMA WRITE.
- */
-static int
-physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-               int nsegs, bool writing)
-{
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
-       rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
-       seg->mr_rkey = ia->ri_dma_mr->rkey;
-       seg->mr_base = seg->mr_dma;
-       return 1;
-}
-
-/* DMA unmap all memory regions that were mapped for "req".
- */
-static void
-physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
-{
-       struct ib_device *device = r_xprt->rx_ia.ri_device;
-       unsigned int i;
-
-       for (i = 0; req->rl_nchunks; --req->rl_nchunks)
-               rpcrdma_unmap_one(device, &req->rl_segments[i++]);
-}
-
-/* Use a slow, safe mechanism to invalidate all memory regions
- * that were registered for "req".
- *
- * For physical memory registration, there is no good way to
- * fence a single MR that has been advertised to the server. The
- * client has already handed the server an R_key that cannot be
- * invalidated and is shared by all MRs on this connection.
- * Tearing down the PD might be the only safe choice, but it's
- * not clear that a freshly acquired DMA R_key would be different
- * than the one used by the PD that was just destroyed.
- * FIXME.
- */
-static void
-physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
-                      bool sync)
-{
-       physical_op_unmap_sync(r_xprt, req);
-}
-
-static void
-physical_op_destroy(struct rpcrdma_buffer *buf)
-{
-}
-
-const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
-       .ro_map                         = physical_op_map,
-       .ro_unmap_sync                  = physical_op_unmap_sync,
-       .ro_unmap_safe                  = physical_op_unmap_safe,
-       .ro_open                        = physical_op_open,
-       .ro_maxpages                    = physical_op_maxpages,
-       .ro_init                        = physical_op_init,
-       .ro_destroy                     = physical_op_destroy,
-       .ro_displayname                 = "physical",
-};
index 35a81096e83d50bd501726ed1d9376a5e4bcf54d..a47f170b20ef88d1ebe1f9ca406374dee1b84102 100644 (file)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 35a81096e83d50bd501726ed1d9376a5e4bcf54d..a47f170b20ef88d1ebe1f9ca406374dee1b84102 100644 (file)
@@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf)
  * MR when they can.
  */
 static int
-rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
-                    int n, int nsegs)
+rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
 {
        size_t page_offset;
        u32 remaining;
@@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
        base = vec->iov_base;
        page_offset = offset_in_page(base);
        remaining = vec->iov_len;
-       while (remaining && n < nsegs) {
+       while (remaining && n < RPCRDMA_MAX_SEGS) {
                seg[n].mr_page = NULL;
                seg[n].mr_offset = base;
                seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
@@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
 
 static int
 rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
-       enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+       enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
 {
-       int len, n = 0, p;
-       int page_base;
+       int len, n, p, page_base;
        struct page **ppages;
 
+       n = 0;
        if (pos == 0) {
-               n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs);
-               if (n == nsegs)
-                       return -EIO;
+               n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
+               if (n == RPCRDMA_MAX_SEGS)
+                       goto out_overflow;
        }
 
        len = xdrbuf->page_len;
        ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
        page_base = xdrbuf->page_base & ~PAGE_MASK;
        p = 0;
-       while (len && n < nsegs) {
+       while (len && n < RPCRDMA_MAX_SEGS) {
                if (!ppages[p]) {
                        /* alloc the pagelist for receiving buffer */
                        ppages[p] = alloc_page(GFP_ATOMIC);
                        if (!ppages[p])
-                               return -ENOMEM;
+                               return -EAGAIN;
                }
                seg[n].mr_page = ppages[p];
                seg[n].mr_offset = (void *)(unsigned long) page_base;
                seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
                if (seg[n].mr_len > PAGE_SIZE)
-                       return -EIO;
+                       goto out_overflow;
                len -= seg[n].mr_len;
                ++n;
                ++p;
@@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
        }
 
        /* Message overflows the seg array */
-       if (len && n == nsegs)
-               return -EIO;
+       if (len && n == RPCRDMA_MAX_SEGS)
+               goto out_overflow;
 
        /* When encoding the read list, the tail is always sent inline */
        if (type == rpcrdma_readch)
@@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
                 * xdr pad bytes, saving the server an RDMA operation. */
                if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
                        return n;
-               n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs);
-               if (n == nsegs)
-                       return -EIO;
+               n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
+               if (n == RPCRDMA_MAX_SEGS)
+                       goto out_overflow;
        }
 
        return n;
+
+out_overflow:
+       pr_err("rpcrdma: segment array overflow\n");
+       return -EIO;
 }
 
 static inline __be32 *
-xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
 {
-       *iptr++ = cpu_to_be32(seg->mr_rkey);
-       *iptr++ = cpu_to_be32(seg->mr_len);
-       return xdr_encode_hyper(iptr, seg->mr_base);
+       *iptr++ = cpu_to_be32(mw->mw_handle);
+       *iptr++ = cpu_to_be32(mw->mw_length);
+       return xdr_encode_hyper(iptr, mw->mw_offset);
 }
 
 /* XDR-encode the Read list. Supports encoding a list of read
@@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
                         struct rpcrdma_req *req, struct rpc_rqst *rqst,
                         __be32 *iptr, enum rpcrdma_chunktype rtype)
 {
-       struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_mw *mw;
        unsigned int pos;
        int n, nsegs;
 
@@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
        pos = rqst->rq_snd_buf.head[0].iov_len;
        if (rtype == rpcrdma_areadch)
                pos = 0;
-       nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
-                                    RPCRDMA_MAX_SEGS - req->rl_nchunks);
+       seg = req->rl_segments;
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
        if (nsegs < 0)
                return ERR_PTR(nsegs);
 
        do {
-               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
-               if (n <= 0)
+               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+                                                false, &mw);
+               if (n < 0)
                        return ERR_PTR(n);
+               list_add(&mw->mw_list, &req->rl_registered);
 
                *iptr++ = xdr_one;      /* item present */
 
@@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
                 * have the same "position".
                 */
                *iptr++ = cpu_to_be32(pos);
-               iptr = xdr_encode_rdma_segment(iptr, seg);
+               iptr = xdr_encode_rdma_segment(iptr, mw);
 
-               dprintk("RPC: %5u %s: read segment pos %u "
-                       "%d@0x%016llx:0x%08x (%s)\n",
+               dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
                        rqst->rq_task->tk_pid, __func__, pos,
-                       seg->mr_len, (unsigned long long)seg->mr_base,
-                       seg->mr_rkey, n < nsegs ? "more" : "last");
+                       mw->mw_length, (unsigned long long)mw->mw_offset,
+                       mw->mw_handle, n < nsegs ? "more" : "last");
 
                r_xprt->rx_stats.read_chunk_count++;
-               req->rl_nchunks++;
                seg += n;
                nsegs -= n;
        } while (nsegs);
-       req->rl_nextseg = seg;
 
        /* Finish Read list */
        *iptr++ = xdr_zero;     /* Next item not present */
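
For reference, each RDMA segment the encoder above writes is three XDR fields in big-endian order: a 32-bit handle (R_key), a 32-bit length, and a 64-bit offset, which is exactly what xdr_encode_rdma_segment() now takes from mw_handle, mw_length, and mw_offset. A minimal stand-alone sketch of that wire layout (plain C, not the kernel XDR helpers):

        #include <stdio.h>
        #include <stdint.h>

        /* Append a value in XDR (big-endian) byte order. */
        static uint8_t *put_be32(uint8_t *p, uint32_t v)
        {
                p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
                return p + 4;
        }

        static uint8_t *put_be64(uint8_t *p, uint64_t v)
        {
                return put_be32(put_be32(p, (uint32_t)(v >> 32)), (uint32_t)v);
        }

        /* Encode one RDMA segment: handle, length, offset (16 bytes on the wire). */
        static uint8_t *encode_rdma_segment(uint8_t *p, uint32_t handle,
                                            uint32_t length, uint64_t offset)
        {
                p = put_be32(p, handle);
                p = put_be32(p, length);
                return put_be64(p, offset);
        }

        int main(void)
        {
                uint8_t buf[16];
                uint8_t *end = encode_rdma_segment(buf, 0x12345678, 4096,
                                                   0x0000ffff00001000ULL);

                printf("encoded %zu bytes\n", (size_t)(end - buf));    /* 16 */
                return 0;
        }
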
@@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
                          struct rpc_rqst *rqst, __be32 *iptr,
                          enum rpcrdma_chunktype wtype)
 {
-       struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_mw *mw;
        int n, nsegs, nchunks;
        __be32 *segcount;
 
@@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
                return iptr;
        }
 
+       seg = req->rl_segments;
        nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
                                     rqst->rq_rcv_buf.head[0].iov_len,
-                                    wtype, seg,
-                                    RPCRDMA_MAX_SEGS - req->rl_nchunks);
+                                    wtype, seg);
        if (nsegs < 0)
                return ERR_PTR(nsegs);
 
@@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 
        nchunks = 0;
        do {
-               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
-               if (n <= 0)
+               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+                                                true, &mw);
+               if (n < 0)
                        return ERR_PTR(n);
+               list_add(&mw->mw_list, &req->rl_registered);
 
-               iptr = xdr_encode_rdma_segment(iptr, seg);
+               iptr = xdr_encode_rdma_segment(iptr, mw);
 
-               dprintk("RPC: %5u %s: write segment "
-                       "%d@0x016%llx:0x%08x (%s)\n",
+               dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
                        rqst->rq_task->tk_pid, __func__,
-                       seg->mr_len, (unsigned long long)seg->mr_base,
-                       seg->mr_rkey, n < nsegs ? "more" : "last");
+                       mw->mw_length, (unsigned long long)mw->mw_offset,
+                       mw->mw_handle, n < nsegs ? "more" : "last");
 
                r_xprt->rx_stats.write_chunk_count++;
                r_xprt->rx_stats.total_rdma_request += seg->mr_len;
-               req->rl_nchunks++;
                nchunks++;
                seg   += n;
                nsegs -= n;
        } while (nsegs);
-       req->rl_nextseg = seg;
 
        /* Update count of segments in this Write chunk */
        *segcount = cpu_to_be32(nchunks);
@@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
                           struct rpcrdma_req *req, struct rpc_rqst *rqst,
                           __be32 *iptr, enum rpcrdma_chunktype wtype)
 {
-       struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+       struct rpcrdma_mr_seg *seg;
+       struct rpcrdma_mw *mw;
        int n, nsegs, nchunks;
        __be32 *segcount;
 
@@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
                return iptr;
        }
 
-       nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
-                                    RPCRDMA_MAX_SEGS - req->rl_nchunks);
+       seg = req->rl_segments;
+       nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
        if (nsegs < 0)
                return ERR_PTR(nsegs);
 
@@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
 
        nchunks = 0;
        do {
-               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
-               if (n <= 0)
+               n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+                                                true, &mw);
+               if (n < 0)
                        return ERR_PTR(n);
+               list_add(&mw->mw_list, &req->rl_registered);
 
-               iptr = xdr_encode_rdma_segment(iptr, seg);
+               iptr = xdr_encode_rdma_segment(iptr, mw);
 
-               dprintk("RPC: %5u %s: reply segment "
-                       "%d@0x%016llx:0x%08x (%s)\n",
+               dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
                        rqst->rq_task->tk_pid, __func__,
-                       seg->mr_len, (unsigned long long)seg->mr_base,
-                       seg->mr_rkey, n < nsegs ? "more" : "last");
+                       mw->mw_length, (unsigned long long)mw->mw_offset,
+                       mw->mw_handle, n < nsegs ? "more" : "last");
 
                r_xprt->rx_stats.reply_chunk_count++;
                r_xprt->rx_stats.total_rdma_request += seg->mr_len;
-               req->rl_nchunks++;
                nchunks++;
                seg   += n;
                nsegs -= n;
        } while (nsegs);
-       req->rl_nextseg = seg;
 
        /* Update count of segments in the Reply chunk */
        *segcount = cpu_to_be32(nchunks);
@@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
        enum rpcrdma_chunktype rtype, wtype;
        struct rpcrdma_msg *headerp;
+       bool ddp_allowed;
        ssize_t hdrlen;
        size_t rpclen;
        __be32 *iptr;
@@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
        headerp->rm_type = rdma_msg;
 
+       /* When the ULP employs a GSS flavor that guarantees integrity
+        * or privacy, direct data placement of individual data items
+        * is not allowed.
+        */
+       ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
+                                               RPCAUTH_AUTH_DATATOUCH);
+
        /*
         * Chunks needed for results?
         *
@@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         */
        if (rpcrdma_results_inline(r_xprt, rqst))
                wtype = rpcrdma_noch;
-       else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+       else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
                wtype = rpcrdma_writech;
        else
                wtype = rpcrdma_replych;
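
The decision just above is where the new ddp_allowed flag bites: when the RPCSEC_GSS flavor protects data integrity or privacy, direct data placement is disabled and large replies fall back to a Reply chunk. A simplified stand-alone mirror of that choice (the enum and helper names are illustrative, not kernel symbols):

        #include <stdio.h>
        #include <stdbool.h>

        enum chunktype { NOCH, WRITECH, REPLYCH };

        /* Inline results need no chunk; a Write chunk is used only when direct
         * data placement is permitted and the receive buffer is flagged for it;
         * everything else gets a Reply chunk.
         */
        static enum chunktype choose_wtype(bool fits_inline, bool ddp_allowed,
                                           bool xdrbuf_read_flag)
        {
                if (fits_inline)
                        return NOCH;
                if (ddp_allowed && xdrbuf_read_flag)
                        return WRITECH;
                return REPLYCH;
        }

        int main(void)
        {
                /* With krb5i/krb5p, ddp_allowed is false, so a large reply uses
                 * a Reply chunk even when the buffer is flagged for DDP.
                 */
                printf("%d\n", choose_wtype(false, false, true));      /* REPLYCH */
                printf("%d\n", choose_wtype(false, true, true));       /* WRITECH */
                return 0;
        }
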
@@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
                rtype = rpcrdma_noch;
                rpcrdma_inline_pullup(rqst);
                rpclen = rqst->rq_svec[0].iov_len;
-       } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+       } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
                rtype = rpcrdma_readch;
                rpclen = rqst->rq_svec[0].iov_len;
                rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
@@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         * send a Call message with a Position Zero Read chunk and a
         * regular Read chunk at the same time.
         */
-       req->rl_nchunks = 0;
-       req->rl_nextseg = req->rl_segments;
        iptr = headerp->rm_body.rm_chunks;
        iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
        if (IS_ERR(iptr))
@@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 out_overflow:
        pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
                hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
-       /* Terminate this RPC. Chunks registered above will be
-        * released by xprt_release -> xprt_rmda_free .
-        */
-       return -EIO;
+       iptr = ERR_PTR(-EIO);
 
 out_unmap:
        r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@@ -705,15 +711,13 @@ out_unmap:
  * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
  */
 static int
-rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
 {
        unsigned int i, total_len;
        struct rpcrdma_write_chunk *cur_wchunk;
        char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
 
        i = be32_to_cpu(**iptrp);
-       if (i > max)
-               return -1;
        cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
        total_len = 0;
        while (i--) {
@@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
        return total_len;
 }
 
-/*
- * Scatter inline received data back into provided iov's.
+/**
+ * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
+ * @rqst: controlling RPC request
+ * @srcp: points to RPC message payload in receive buffer
+ * @copy_len: remaining length of receive buffer content
+ * @pad: Write chunk pad bytes needed (zero for pure inline)
+ *
+ * The upper layer has set the maximum number of bytes it can
+ * receive in each component of rq_rcv_buf. These values are set in
+ * the head.iov_len, page_len, tail.iov_len, and buflen fields.
+ *
+ * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
+ * many cases this function simply updates iov_base pointers in
+ * rq_rcv_buf to point directly to the received reply data, to
+ * avoid copying reply data.
+ *
+ * Returns the count of bytes which had to be memcopied.
  */
-static void
+static unsigned long
 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 {
-       int i, npages, curlen, olen;
+       unsigned long fixup_copy_count;
+       int i, npages, curlen;
        char *destp;
        struct page **ppages;
        int page_base;
 
+       /* The head iovec is redirected to the RPC reply message
+        * in the receive buffer, to avoid a memcopy.
+        */
+       rqst->rq_rcv_buf.head[0].iov_base = srcp;
+       rqst->rq_private_buf.head[0].iov_base = srcp;
+
+       /* The contents of the receive buffer that follow
+        * head.iov_len bytes are copied into the page list.
+        */
        curlen = rqst->rq_rcv_buf.head[0].iov_len;
-       if (curlen > copy_len) {        /* write chunk header fixup */
+       if (curlen > copy_len)
                curlen = copy_len;
-               rqst->rq_rcv_buf.head[0].iov_len = curlen;
-       }
-
        dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
                __func__, srcp, copy_len, curlen);
-
-       /* Shift pointer for first receive segment only */
-       rqst->rq_rcv_buf.head[0].iov_base = srcp;
        srcp += curlen;
        copy_len -= curlen;
 
-       olen = copy_len;
-       i = 0;
-       rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
        page_base = rqst->rq_rcv_buf.page_base;
        ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
        page_base &= ~PAGE_MASK;
-
+       fixup_copy_count = 0;
        if (copy_len && rqst->rq_rcv_buf.page_len) {
-               npages = PAGE_ALIGN(page_base +
-                       rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
-               for (; i < npages; i++) {
+               int pagelist_len;
+
+               pagelist_len = rqst->rq_rcv_buf.page_len;
+               if (pagelist_len > copy_len)
+                       pagelist_len = copy_len;
+               npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
+               for (i = 0; i < npages; i++) {
                        curlen = PAGE_SIZE - page_base;
-                       if (curlen > copy_len)
-                               curlen = copy_len;
+                       if (curlen > pagelist_len)
+                               curlen = pagelist_len;
+
                        dprintk("RPC:       %s: page %d"
                                " srcp 0x%p len %d curlen %d\n",
                                __func__, i, srcp, copy_len, curlen);
@@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
                        kunmap_atomic(destp);
                        srcp += curlen;
                        copy_len -= curlen;
-                       if (copy_len == 0)
+                       fixup_copy_count += curlen;
+                       pagelist_len -= curlen;
+                       if (!pagelist_len)
                                break;
                        page_base = 0;
                }
-       }
 
-       if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
-               curlen = copy_len;
-               if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
-                       curlen = rqst->rq_rcv_buf.tail[0].iov_len;
-               if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
-                       memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
-               dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
-                       __func__, srcp, copy_len, curlen);
-               rqst->rq_rcv_buf.tail[0].iov_len = curlen;
-               copy_len -= curlen; ++i;
-       } else
-               rqst->rq_rcv_buf.tail[0].iov_len = 0;
-
-       if (pad) {
-               /* implicit padding on terminal chunk */
-               unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
-               while (pad--)
-                       p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+               /* Implicit padding for the last segment in a Write
+                * chunk is inserted inline at the front of the tail
+                * iovec. The upper layer ignores the content of
+                * the pad. Simply ensure inline content in the tail
+                * that follows the Write chunk is properly aligned.
+                */
+               if (pad)
+                       srcp -= pad;
        }
 
-       if (copy_len)
-               dprintk("RPC:       %s: %d bytes in"
-                       " %d extra segments (%d lost)\n",
-                       __func__, olen, i, copy_len);
+       /* The tail iovec is redirected to the remaining data
+        * in the receive buffer, to avoid a memcopy.
+        */
+       if (copy_len || pad) {
+               rqst->rq_rcv_buf.tail[0].iov_base = srcp;
+               rqst->rq_private_buf.tail[0].iov_base = srcp;
+       }
 
-       /* TBD avoid a warning from call_decode() */
-       rqst->rq_private_buf = rqst->rq_rcv_buf;
+       return fixup_copy_count;
 }
 
 void
@@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
                    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
                     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
                    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
-                    req->rl_nchunks == 0))
+                    list_empty(&req->rl_registered)))
                        goto badheader;
                if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
                        /* count any expected write chunks in read reply */
                        /* start at write chunk array count */
                        iptr = &headerp->rm_body.rm_chunks[2];
-                       rdmalen = rpcrdma_count_chunks(rep,
-                                               req->rl_nchunks, 1, &iptr);
+                       rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
                        /* check for validity, and no reply chunk after */
                        if (rdmalen < 0 || *iptr++ != xdr_zero)
                                goto badheader;
@@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
                        rep->rr_len -= RPCRDMA_HDRLEN_MIN;
                        status = rep->rr_len;
                }
-               /* Fix up the rpc results for upper layer */
-               rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
+
+               r_xprt->rx_stats.fixup_copy_count +=
+                       rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
+                                            rdmalen);
                break;
 
        case rdma_nomsg:
@@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
                if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
                    headerp->rm_body.rm_chunks[1] != xdr_zero ||
                    headerp->rm_body.rm_chunks[2] != xdr_one ||
-                   req->rl_nchunks == 0)
+                   list_empty(&req->rl_registered))
                        goto badheader;
                iptr = (__be32 *)((unsigned char *)headerp +
                                                        RPCRDMA_HDRLEN_MIN);
-               rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+               rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
                if (rdmalen < 0)
                        goto badheader;
                r_xprt->rx_stats.total_rdma_reply += rdmalen;
@@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 
 badheader:
        default:
-               dprintk("%s: invalid rpcrdma reply header (type %d):"
-                               " chunks[012] == %d %d %d"
-                               " expected chunks <= %d\n",
-                               __func__, be32_to_cpu(headerp->rm_type),
-                               headerp->rm_body.rm_chunks[0],
-                               headerp->rm_body.rm_chunks[1],
-                               headerp->rm_body.rm_chunks[2],
-                               req->rl_nchunks);
+               dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
+                       rqst->rq_task->tk_pid, __func__,
+                       be32_to_cpu(headerp->rm_type));
                status = -EIO;
                r_xprt->rx_stats.bad_reply_count++;
                break;
@@ -1035,7 +1049,7 @@ out:
         * control: waking the next RPC waits until this RPC has
         * relinquished all its Send Queue entries.
         */
-       if (req->rl_nchunks)
+       if (!list_empty(&req->rl_registered))
                r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
 
        spin_lock_bh(&xprt->transport_lock);
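
One way to read the reworked rpcrdma_inline_fixup() earlier in this file: the head and tail iovecs are redirected to point directly into the received buffer (no copy), and only the page-list portion in the middle is actually copied, with the copied byte count returned for the fixup_copy_count statistic. A simplified stand-alone model of that flow (the rcvbuf struct is a hypothetical stand-in for rq_rcv_buf, not the kernel xdr_buf):

        #include <stdio.h>
        #include <string.h>

        struct rcvbuf {
                const char *head;  size_t head_len;     /* redirected, no copy */
                char       *pages; size_t page_len;     /* copied */
                const char *tail;                       /* redirected, no copy */
        };

        static size_t inline_fixup(struct rcvbuf *b, const char *src, size_t len)
        {
                size_t copied = 0, n;

                b->head = src;                          /* redirect head iovec */
                n = b->head_len < len ? b->head_len : len;
                src += n; len -= n;

                n = b->page_len < len ? b->page_len : len;
                memcpy(b->pages, src, n);               /* bounded page-list copy */
                copied += n;
                src += n; len -= n;

                b->tail = len ? src : NULL;             /* redirect tail iovec */
                return copied;                          /* fixup_copy_count */
        }

        int main(void)
        {
                char pagebuf[8] = "";
                struct rcvbuf b = { .head_len = 4, .pages = pagebuf, .page_len = 8 };
                const char reply[] = "HEADpagedataTAIL";

                printf("copied %zu bytes\n",
                       inline_fixup(&b, reply, sizeof(reply) - 1));    /* 8 */
                return 0;
        }
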
index 99d2e5b72726abd00f1ac5e5732d5fa02119f55a..81f0e879f019e43d35cc9c85ead0dfd17ebc8d30 100644 (file)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 99d2e5b72726abd00f1ac5e5732d5fa02119f55a..81f0e879f019e43d35cc9c85ead0dfd17ebc8d30 100644 (file)
@@ -558,7 +558,6 @@ out_sendbuf:
 
 out_fail:
        rpcrdma_buffer_put(req);
-       r_xprt->rx_stats.failed_marshal_count++;
        return NULL;
 }
 
@@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer)
        rpcrdma_buffer_put(req);
 }
 
-/*
+/**
+ * xprt_rdma_send_request - marshal and send an RPC request
+ * @task: RPC task with an RPC message in rq_snd_buf
+ *
+ * Return values:
+ *        0:   The request has been sent
+ * ENOTCONN:   Caller needs to invoke connect logic then call again
+ *  ENOBUFS:   Call again later to send the request
+ *      EIO:   A permanent error occurred. The request was not sent,
+ *             and don't try it again
+ *
  * send_request invokes the meat of RPC RDMA. It must do the following:
+ *
  *  1.  Marshal the RPC request into an RPC RDMA request, which means
  *     putting a header in front of data, and creating IOVs for RDMA
  *     from those in the request.
@@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer)
  *     the request (rpcrdma_ep_post).
  *  4.  No partial sends are possible in the RPC-RDMA protocol (as in UDP).
  */
-
 static int
 xprt_rdma_send_request(struct rpc_task *task)
 {
@@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task)
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        int rc = 0;
 
+       /* On retransmit, remove any previously registered chunks */
+       r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+
        rc = rpcrdma_marshal_req(rqst);
        if (rc < 0)
                goto failed_marshal;
@@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task)
        return 0;
 
 failed_marshal:
-       r_xprt->rx_stats.failed_marshal_count++;
        dprintk("RPC:       %s: rpcrdma_marshal_req failed, status %i\n",
                __func__, rc);
        if (rc == -EIO)
-               return -EIO;
+               r_xprt->rx_stats.failed_marshal_count++;
+       if (rc != -ENOTCONN)
+               return rc;
 drop_connection:
        xprt_disconnect_done(xprt);
        return -ENOTCONN;       /* implies disconnect */
@@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
                   xprt->stat.bad_xids,
                   xprt->stat.req_u,
                   xprt->stat.bklog_u);
-       seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+       seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
                   r_xprt->rx_stats.read_chunk_count,
                   r_xprt->rx_stats.write_chunk_count,
                   r_xprt->rx_stats.reply_chunk_count,
@@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
                   r_xprt->rx_stats.failed_marshal_count,
                   r_xprt->rx_stats.bad_reply_count,
                   r_xprt->rx_stats.nomsg_call_count);
+       seq_printf(seq, "%lu %lu %lu\n",
+                  r_xprt->rx_stats.mrs_recovered,
+                  r_xprt->rx_stats.mrs_orphaned,
+                  r_xprt->rx_stats.mrs_allocated);
 }
 
 static int
@@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void)
                        __func__, rc);
 
        rpcrdma_destroy_wq();
-       frwr_destroy_recovery_wq();
 
        rc = xprt_unregister_transport(&xprt_rdma_bc);
        if (rc)
@@ -753,20 +769,13 @@ int xprt_rdma_init(void)
 {
        int rc;
 
-       rc = frwr_alloc_recovery_wq();
-       if (rc)
-               return rc;
-
        rc = rpcrdma_alloc_wq();
-       if (rc) {
-               frwr_destroy_recovery_wq();
+       if (rc)
                return rc;
-       }
 
        rc = xprt_register_transport(&xprt_rdma);
        if (rc) {
                rpcrdma_destroy_wq();
-               frwr_destroy_recovery_wq();
                return rc;
        }
 
@@ -774,7 +783,6 @@ int xprt_rdma_init(void)
        if (rc) {
                xprt_unregister_transport(&xprt_rdma);
                rpcrdma_destroy_wq();
-               frwr_destroy_recovery_wq();
                return rc;
        }
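
The comment block added to xprt_rdma_send_request() above defines a small contract: 0 means the request was sent, -ENOTCONN means reconnect and call again, -ENOBUFS means try again later, and -EIO is a permanent failure. A minimal sketch of how a caller might act on those codes (illustrative only; handle_send_result() is hypothetical, not the RPC client's actual retry logic):

        #include <errno.h>
        #include <stdio.h>

        /* Map the send_request() return codes documented above to an action. */
        static const char *handle_send_result(int rc)
        {
                switch (rc) {
                case 0:
                        return "request sent";
                case -ENOTCONN:
                        return "reconnect, then send again";
                case -ENOBUFS:
                        return "back off and retry later";
                case -EIO:
                        return "permanent error, fail the RPC";
                default:
                        return "unexpected status";
                }
        }

        int main(void)
        {
                int codes[] = { 0, -ENOTCONN, -ENOBUFS, -EIO };

                for (int i = 0; i < 4; i++)
                        printf("%d: %s\n", codes[i], handle_send_result(codes[i]));
                return 0;
        }
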
 
index b044d98a1370207422d129689bb43b35766fc76f..536d0be3f61bdd3995f95a5ae3893e9cf9d84997 100644 (file)
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b044d98a1370207422d129689bb43b35766fc76f..536d0be3f61bdd3995f95a5ae3893e9cf9d84997 100644 (file)
@@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        struct rpcrdma_ia *ia = &xprt->rx_ia;
        int rc;
 
-       ia->ri_dma_mr = NULL;
-
        ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
        if (IS_ERR(ia->ri_id)) {
                rc = PTR_ERR(ia->ri_id);
@@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        ia->ri_pd = ib_alloc_pd(ia->ri_device);
        if (IS_ERR(ia->ri_pd)) {
                rc = PTR_ERR(ia->ri_pd);
-               dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
-                       __func__, rc);
+               pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
                goto out2;
        }
 
-       if (memreg == RPCRDMA_FRMR) {
-               if (!(ia->ri_device->attrs.device_cap_flags &
-                               IB_DEVICE_MEM_MGT_EXTENSIONS) ||
-                   (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
-                       dprintk("RPC:       %s: FRMR registration "
-                               "not supported by HCA\n", __func__);
-                       memreg = RPCRDMA_MTHCAFMR;
-               }
-       }
-       if (memreg == RPCRDMA_MTHCAFMR) {
-               if (!ia->ri_device->alloc_fmr) {
-                       dprintk("RPC:       %s: MTHCAFMR registration "
-                               "not supported by HCA\n", __func__);
-                       rc = -EINVAL;
-                       goto out3;
-               }
-       }
-
        switch (memreg) {
        case RPCRDMA_FRMR:
-               ia->ri_ops = &rpcrdma_frwr_memreg_ops;
-               break;
-       case RPCRDMA_ALLPHYSICAL:
-               ia->ri_ops = &rpcrdma_physical_memreg_ops;
-               break;
+               if (frwr_is_supported(ia)) {
+                       ia->ri_ops = &rpcrdma_frwr_memreg_ops;
+                       break;
+               }
+               /*FALLTHROUGH*/
        case RPCRDMA_MTHCAFMR:
-               ia->ri_ops = &rpcrdma_fmr_memreg_ops;
-               break;
+               if (fmr_is_supported(ia)) {
+                       ia->ri_ops = &rpcrdma_fmr_memreg_ops;
+                       break;
+               }
+               /*FALLTHROUGH*/
        default:
-               printk(KERN_ERR "RPC: Unsupported memory "
-                               "registration mode: %d\n", memreg);
-               rc = -ENOMEM;
+               pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
+                      memreg);
+               rc = -EINVAL;
                goto out3;
        }
-       dprintk("RPC:       %s: memory registration strategy is '%s'\n",
-               __func__, ia->ri_ops->ro_displayname);
 
        return 0;
 
@@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 out2:
        ib_free_cq(sendcq);
 out1:
-       if (ia->ri_dma_mr)
-               ib_dereg_mr(ia->ri_dma_mr);
        return rc;
 }
 
@@ -600,8 +578,6 @@ out1:
 void
 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
-       int rc;
-
        dprintk("RPC:       %s: entering, connected is %d\n",
                __func__, ep->rep_connected);
 
@@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 
        ib_free_cq(ep->rep_attr.recv_cq);
        ib_free_cq(ep->rep_attr.send_cq);
-
-       if (ia->ri_dma_mr) {
-               rc = ib_dereg_mr(ia->ri_dma_mr);
-               dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
-                       __func__, rc);
-       }
 }
 
 /*
@@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
        ib_drain_qp(ia->ri_id->qp);
 }
 
+static void
+rpcrdma_mr_recovery_worker(struct work_struct *work)
+{
+       struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+                                                 rb_recovery_worker.work);
+       struct rpcrdma_mw *mw;
+
+       spin_lock(&buf->rb_recovery_lock);
+       while (!list_empty(&buf->rb_stale_mrs)) {
+               mw = list_first_entry(&buf->rb_stale_mrs,
+                                     struct rpcrdma_mw, mw_list);
+               list_del_init(&mw->mw_list);
+               spin_unlock(&buf->rb_recovery_lock);
+
+               dprintk("RPC:       %s: recovering MR %p\n", __func__, mw);
+               mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);
+
+               spin_lock(&buf->rb_recovery_lock);
+       }
+       spin_unlock(&buf->rb_recovery_lock);
+}
+
+void
+rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
+{
+       struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+
+       spin_lock(&buf->rb_recovery_lock);
+       list_add(&mw->mw_list, &buf->rb_stale_mrs);
+       spin_unlock(&buf->rb_recovery_lock);
+
+       schedule_delayed_work(&buf->rb_recovery_worker, 0);
+}
+
+static void
+rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
+{
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       unsigned int count;
+       LIST_HEAD(free);
+       LIST_HEAD(all);
+
+       for (count = 0; count < 32; count++) {
+               struct rpcrdma_mw *mw;
+               int rc;
+
+               mw = kzalloc(sizeof(*mw), GFP_KERNEL);
+               if (!mw)
+                       break;
+
+               rc = ia->ri_ops->ro_init_mr(ia, mw);
+               if (rc) {
+                       kfree(mw);
+                       break;
+               }
+
+               mw->mw_xprt = r_xprt;
+
+               list_add(&mw->mw_list, &free);
+               list_add(&mw->mw_all, &all);
+       }
+
+       spin_lock(&buf->rb_mwlock);
+       list_splice(&free, &buf->rb_mws);
+       list_splice(&all, &buf->rb_all);
+       r_xprt->rx_stats.mrs_allocated += count;
+       spin_unlock(&buf->rb_mwlock);
+
+       dprintk("RPC:       %s: created %u MRs\n", __func__, count);
+}
+
+static void
+rpcrdma_mr_refresh_worker(struct work_struct *work)
+{
+       struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+                                                 rb_refresh_worker.work);
+       struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+                                                  rx_buf);
+
+       rpcrdma_create_mrs(r_xprt);
+}
+
 struct rpcrdma_req *
 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
 {
@@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
        spin_unlock(&buffer->rb_reqslock);
        req->rl_cqe.done = rpcrdma_wc_send;
        req->rl_buffer = &r_xprt->rx_buf;
+       INIT_LIST_HEAD(&req->rl_registered);
        return req;
 }
 
@@ -832,17 +887,23 @@ int
 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        int i, rc;
 
        buf->rb_max_requests = r_xprt->rx_data.max_requests;
        buf->rb_bc_srv_max_requests = 0;
-       spin_lock_init(&buf->rb_lock);
        atomic_set(&buf->rb_credits, 1);
+       spin_lock_init(&buf->rb_mwlock);
+       spin_lock_init(&buf->rb_lock);
+       spin_lock_init(&buf->rb_recovery_lock);
+       INIT_LIST_HEAD(&buf->rb_mws);
+       INIT_LIST_HEAD(&buf->rb_all);
+       INIT_LIST_HEAD(&buf->rb_stale_mrs);
+       INIT_DELAYED_WORK(&buf->rb_refresh_worker,
+                         rpcrdma_mr_refresh_worker);
+       INIT_DELAYED_WORK(&buf->rb_recovery_worker,
+                         rpcrdma_mr_recovery_worker);
 
-       rc = ia->ri_ops->ro_init(r_xprt);
-       if (rc)
-               goto out;
+       rpcrdma_create_mrs(r_xprt);
 
        INIT_LIST_HEAD(&buf->rb_send_bufs);
        INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
        }
 
        INIT_LIST_HEAD(&buf->rb_recv_bufs);
-       for (i = 0; i < buf->rb_max_requests + 2; i++) {
+       for (i = 0; i < buf->rb_max_requests; i++) {
                struct rpcrdma_rep *rep;
 
                rep = rpcrdma_create_rep(r_xprt);
@@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
        kfree(req);
 }
 
+static void
+rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
+{
+       struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+                                                  rx_buf);
+       struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+       struct rpcrdma_mw *mw;
+       unsigned int count;
+
+       count = 0;
+       spin_lock(&buf->rb_mwlock);
+       while (!list_empty(&buf->rb_all)) {
+               mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+               list_del(&mw->mw_all);
+
+               spin_unlock(&buf->rb_mwlock);
+               ia->ri_ops->ro_release_mr(mw);
+               count++;
+               spin_lock(&buf->rb_mwlock);
+       }
+       spin_unlock(&buf->rb_mwlock);
+       r_xprt->rx_stats.mrs_allocated = 0;
+
+       dprintk("RPC:       %s: released %u MRs\n", __func__, count);
+}
+
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
        struct rpcrdma_ia *ia = rdmab_to_ia(buf);
 
+       cancel_delayed_work_sync(&buf->rb_recovery_worker);
+
        while (!list_empty(&buf->rb_recv_bufs)) {
                struct rpcrdma_rep *rep;
 
@@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
        }
        spin_unlock(&buf->rb_reqslock);
 
-       ia->ri_ops->ro_destroy(buf);
+       rpcrdma_destroy_mrs(buf);
 }
 
 struct rpcrdma_mw *
@@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
        spin_unlock(&buf->rb_mwlock);
 
        if (!mw)
-               pr_err("RPC:       %s: no MWs available\n", __func__);
+               goto out_nomws;
        return mw;
+
+out_nomws:
+       dprintk("RPC:       %s: no MWs available\n", __func__);
+       schedule_delayed_work(&buf->rb_refresh_worker, 0);
+
+       /* Allow the reply handler and refresh worker to run */
+       cond_resched();
+
+       return NULL;
 }
 
 void
@@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
 
 /*
  * Get a set of request/reply buffers.
- *
- * Reply buffer (if available) is attached to send buffer upon return.
  */
 struct rpcrdma_req *
 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 
 out_reqbuf:
        spin_unlock(&buffers->rb_lock);
-       pr_warn("RPC:       %s: out of request buffers\n", __func__);
+       pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
        return NULL;
 out_repbuf:
+       list_add(&req->rl_free, &buffers->rb_send_bufs);
        spin_unlock(&buffers->rb_lock);
-       pr_warn("RPC:       %s: out of reply buffers\n", __func__);
-       req->rl_reply = NULL;
-       return req;
+       pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
+       return NULL;
 }
 
 /*
@@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
  */
 
-void
-rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
-{
-       dprintk("RPC:       map_one: offset %p iova %llx len %zu\n",
-               seg->mr_offset,
-               (unsigned long long)seg->mr_dma, seg->mr_dmalen);
-}
-
 /**
  * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
  * @ia: controlling rpcrdma_ia
@@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
        if (rep) {
                rc = rpcrdma_ep_post_recv(ia, ep, rep);
                if (rc)
-                       goto out;
+                       return rc;
                req->rl_reply = NULL;
        }
 
@@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 
        rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
        if (rc)
-               dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
-                       rc);
-out:
-       return rc;
+               goto out_postsend_err;
+       return 0;
+
+out_postsend_err:
+       pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
+       return -ENOTCONN;
 }
 
 /*
@@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
                                   DMA_BIDIRECTIONAL);
 
        rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
-
        if (rc)
-               dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
-                       rc);
-       return rc;
+               goto out_postrecv;
+       return 0;
+
+out_postrecv:
+       pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
+       return -ENOTCONN;
 }
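
One pattern worth calling out from the verbs.c changes above: rpcrdma_create_mrs() builds a batch of MWs on private local lists with no lock held, then splices them into the buffer's lists under rb_mwlock in a single short critical section. A stand-alone sketch of that shape (plain C with pthreads and a minimal singly-linked list; the types are illustrative, not the kernel list_head API, and the demo leaks its nodes at exit for brevity):

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct mw {
                int id;
                struct mw *next;
        };

        static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        static struct mw *shared_free;          /* analogue of buf->rb_mws */
        static unsigned int allocated;          /* analogue of rx_stats.mrs_allocated */

        /* Allocate a batch off-lock, then attach it with one lock acquisition. */
        static void create_mws(unsigned int batch)
        {
                struct mw *local = NULL, *tail = NULL;
                unsigned int count = 0;

                for (unsigned int i = 0; i < batch; i++) {
                        struct mw *mw = calloc(1, sizeof(*mw));

                        if (!mw)
                                break;
                        mw->id = (int)i;
                        if (!local)
                                local = mw;
                        else
                                tail->next = mw;
                        tail = mw;
                        count++;
                }
                if (!local)
                        return;

                pthread_mutex_lock(&lock);
                tail->next = shared_free;       /* splice the whole local list */
                shared_free = local;
                allocated += count;
                pthread_mutex_unlock(&lock);
        }

        int main(void)
        {
                create_mws(32);                 /* same batch size used above */
                printf("allocated %u MWs\n", allocated);
                return 0;
        }
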
 
 /**
index 95cdc66225ee1f52542119b7a2e152b888cf46b0..670fad57153a109b5f8d14ac9cd3733e874c81d3 100644 (file)
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 95cdc66225ee1f52542119b7a2e152b888cf46b0..670fad57153a109b5f8d14ac9cd3733e874c81d3 100644 (file)
@@ -68,7 +68,6 @@ struct rpcrdma_ia {
        struct ib_device        *ri_device;
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
-       struct ib_mr            *ri_dma_mr;
        struct completion       ri_done;
        int                     ri_async_rc;
        unsigned int            ri_max_frmr_depth;
@@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
  *   o recv buffer (posted to provider)
  *   o ib_sge (also donated to provider)
  *   o status of reply (length, success or not)
- *   o bookkeeping state to get run by tasklet (list, etc)
+ *   o bookkeeping state to get run by reply handler (list, etc)
  *
- * These are allocated during initialization, per-transport instance;
- * however, the tasklet execution list itself is global, as it should
- * always be pretty short.
+ * These are allocated during initialization, per-transport instance.
  *
  * N of these are associated with a transport instance, and stored in
  * struct rpcrdma_buffer. N is the max number of outstanding requests.
  */
 
-#define RPCRDMA_MAX_DATA_SEGS  ((1 * 1024 * 1024) / PAGE_SIZE)
-
-/* data segments + head/tail for Call + head/tail for Reply */
-#define RPCRDMA_MAX_SEGS       (RPCRDMA_MAX_DATA_SEGS + 4)
-
-struct rpcrdma_buffer;
-
 struct rpcrdma_rep {
        struct ib_cqe           rr_cqe;
        unsigned int            rr_len;
@@ -221,9 +211,6 @@ enum rpcrdma_frmr_state {
 };
 
 struct rpcrdma_frmr {
-       struct scatterlist              *fr_sg;
-       int                             fr_nents;
-       enum dma_data_direction         fr_dir;
        struct ib_mr                    *fr_mr;
        struct ib_cqe                   fr_cqe;
        enum rpcrdma_frmr_state         fr_state;
@@ -235,18 +222,23 @@ struct rpcrdma_frmr {
 };
 
 struct rpcrdma_fmr {
-       struct ib_fmr           *fmr;
-       u64                     *physaddrs;
+       struct ib_fmr           *fm_mr;
+       u64                     *fm_physaddrs;
 };
 
 struct rpcrdma_mw {
+       struct list_head        mw_list;
+       struct scatterlist      *mw_sg;
+       int                     mw_nents;
+       enum dma_data_direction mw_dir;
        union {
                struct rpcrdma_fmr      fmr;
                struct rpcrdma_frmr     frmr;
        };
-       struct work_struct      mw_work;
        struct rpcrdma_xprt     *mw_xprt;
-       struct list_head        mw_list;
+       u32                     mw_handle;
+       u32                     mw_length;
+       u64                     mw_offset;
        struct list_head        mw_all;
 };
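
With this hunk the scatterlist, element count and DMA direction move from the FRWR-specific struct rpcrdma_frmr into the common struct rpcrdma_mw, and mw_handle/mw_length/mw_offset take over the "registration result" role of the old mr_rkey/mr_base fields, so the FMR and FRWR paths can share the mapping bookkeeping. A kernel-style sketch (not a buildable unit; example_map_mw() is invented) of how a registration method can now DMA-map the MW generically:

	static bool example_map_mw(struct ib_device *device,
				   struct rpcrdma_mw *mw, bool writing)
	{
		mw->mw_dir = rpcrdma_data_dir(writing);
		mw->mw_nents = ib_dma_map_sg(device, mw->mw_sg, mw->mw_nents,
					     mw->mw_dir);
		return mw->mw_nents != 0;	/* zero mapped entries means failure */
	}

After the provider-specific registration succeeds, the method would presumably fill in mw_handle, mw_length and mw_offset for the chunk encoder.
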
 
@@ -266,33 +258,30 @@ struct rpcrdma_mw {
  * of iovs for send operations. The reason is that the iovs passed to
  * ib_post_{send,recv} must not be modified until the work request
  * completes.
- *
- * NOTES:
- *   o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
- *     marshal. The number needed varies depending on the iov lists that
- *     are passed to us, the memory registration mode we are in, and if
- *     physical addressing is used, the layout.
  */
 
+/* Maximum number of page-sized "segments" per chunk list to be
+ * registered or invalidated. Must handle a Reply chunk:
+ */
+enum {
+       RPCRDMA_MAX_IOV_SEGS    = 3,
+       RPCRDMA_MAX_DATA_SEGS   = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
+       RPCRDMA_MAX_SEGS        = RPCRDMA_MAX_DATA_SEGS +
+                                 RPCRDMA_MAX_IOV_SEGS,
+};
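
The enum just added above replaces the old RPCRDMA_MAX_DATA_SEGS/RPCRDMA_MAX_SEGS macros and adds headroom so a full-sized Reply chunk still fits. Assuming 4 KiB pages (PAGE_SIZE is per-architecture), the arithmetic works out as in this quick check:

	#include <stdio.h>

	int main(void)
	{
		const unsigned int page_size = 4096;	/* assumed PAGE_SIZE */
		const unsigned int max_data_segs =
			(1 * 1024 * 1024) / page_size + 1;
		const unsigned int max_iov_segs = 3;

		printf("RPCRDMA_MAX_DATA_SEGS = %u\n", max_data_segs);	/* 257 */
		printf("RPCRDMA_MAX_SEGS = %u\n",
		       max_data_segs + max_iov_segs);			/* 260 */
		return 0;
	}
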
+
 struct rpcrdma_mr_seg {                /* chunk descriptors */
-       struct rpcrdma_mw *rl_mw;       /* registered MR */
-       u64             mr_base;        /* registration result */
-       u32             mr_rkey;        /* registration result */
        u32             mr_len;         /* length of chunk or segment */
-       int             mr_nsegs;       /* number of segments in chunk or 0 */
-       enum dma_data_direction mr_dir; /* segment mapping direction */
-       dma_addr_t      mr_dma;         /* segment mapping address */
-       size_t          mr_dmalen;      /* segment mapping length */
        struct page     *mr_page;       /* owning page, if any */
        char            *mr_offset;     /* kva if no page, else offset */
 };
 
 #define RPCRDMA_MAX_IOVS       (2)
 
+struct rpcrdma_buffer;
 struct rpcrdma_req {
        struct list_head        rl_free;
        unsigned int            rl_niovs;
-       unsigned int            rl_nchunks;
        unsigned int            rl_connect_cookie;
        struct rpc_task         *rl_task;
        struct rpcrdma_buffer   *rl_buffer;
@@ -300,12 +289,13 @@ struct rpcrdma_req {
        struct ib_sge           rl_send_iov[RPCRDMA_MAX_IOVS];
        struct rpcrdma_regbuf   *rl_rdmabuf;
        struct rpcrdma_regbuf   *rl_sendbuf;
-       struct rpcrdma_mr_seg   rl_segments[RPCRDMA_MAX_SEGS];
-       struct rpcrdma_mr_seg   *rl_nextseg;
 
        struct ib_cqe           rl_cqe;
        struct list_head        rl_all;
        bool                    rl_backchannel;
+
+       struct list_head        rl_registered;  /* registered segments */
+       struct rpcrdma_mr_seg   rl_segments[RPCRDMA_MAX_SEGS];
 };
 
 static inline struct rpcrdma_req *
@@ -341,6 +331,11 @@ struct rpcrdma_buffer {
        struct list_head        rb_allreqs;
 
        u32                     rb_bc_max_requests;
+
+       spinlock_t              rb_recovery_lock; /* protect rb_stale_mrs */
+       struct list_head        rb_stale_mrs;
+       struct delayed_work     rb_recovery_worker;
+       struct delayed_work     rb_refresh_worker;
 };
 #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
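
The new rb_recovery_lock, rb_stale_mrs and rb_recovery_worker fields back the rpcrdma_defer_mr_recovery() helper declared later in this header: an MW whose invalidation could not complete is parked on the stale list, and a delayed work item later hands it to ->ro_recover_mr(). A kernel-style sketch of the deferral side (the exact body in verbs.c may differ):

	void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
	{
		struct rpcrdma_buffer *buf = &mw->mw_xprt->rx_buf;

		spin_lock(&buf->rb_recovery_lock);
		list_add_tail(&mw->mw_list, &buf->rb_stale_mrs);
		spin_unlock(&buf->rb_recovery_lock);

		schedule_delayed_work(&buf->rb_recovery_worker, 0);
	}
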
 
@@ -387,6 +382,9 @@ struct rpcrdma_stats {
        unsigned long           bad_reply_count;
        unsigned long           nomsg_call_count;
        unsigned long           bcall_count;
+       unsigned long           mrs_recovered;
+       unsigned long           mrs_orphaned;
+       unsigned long           mrs_allocated;
 };
 
 /*
@@ -395,23 +393,25 @@ struct rpcrdma_stats {
 struct rpcrdma_xprt;
 struct rpcrdma_memreg_ops {
        int             (*ro_map)(struct rpcrdma_xprt *,
-                                 struct rpcrdma_mr_seg *, int, bool);
+                                 struct rpcrdma_mr_seg *, int, bool,
+                                 struct rpcrdma_mw **);
        void            (*ro_unmap_sync)(struct rpcrdma_xprt *,
                                         struct rpcrdma_req *);
        void            (*ro_unmap_safe)(struct rpcrdma_xprt *,
                                         struct rpcrdma_req *, bool);
+       void            (*ro_recover_mr)(struct rpcrdma_mw *);
        int             (*ro_open)(struct rpcrdma_ia *,
                                   struct rpcrdma_ep *,
                                   struct rpcrdma_create_data_internal *);
        size_t          (*ro_maxpages)(struct rpcrdma_xprt *);
-       int             (*ro_init)(struct rpcrdma_xprt *);
-       void            (*ro_destroy)(struct rpcrdma_buffer *);
+       int             (*ro_init_mr)(struct rpcrdma_ia *,
+                                     struct rpcrdma_mw *);
+       void            (*ro_release_mr)(struct rpcrdma_mw *);
        const char      *ro_displayname;
 };
 
 extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
 extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
-extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
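
ro_map() now hands back the rpcrdma_mw it used through an output parameter instead of recording it in the segment, which pairs with the new rl_registered list in struct rpcrdma_req. A hypothetical caller sketch (example_register_chunk() is invented, and it assumes ro_map() still returns the number of segments it consumed):

	static int example_register_chunk(struct rpcrdma_xprt *r_xprt,
					  struct rpcrdma_req *req,
					  struct rpcrdma_mr_seg *seg,
					  int nsegs, bool writing)
	{
		struct rpcrdma_mw *mw;
		int n;

		while (nsegs > 0) {
			n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
							 writing, &mw);
			if (n < 0)
				return n;
			/* remember the MW so unmap_sync/unmap_safe can find it */
			list_add(&mw->mw_list, &req->rl_registered);
			seg += n;
			nsegs -= n;
		}
		return 0;
	}
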
 
 /*
  * RPCRDMA transport -- encapsulates the structures above for
@@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize;
  */
 int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
 void rpcrdma_ia_close(struct rpcrdma_ia *);
+bool frwr_is_supported(struct rpcrdma_ia *);
+bool fmr_is_supported(struct rpcrdma_ia *);
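
frwr_is_supported() and fmr_is_supported() replace the per-mode ro_init()/ro_destroy() probing, and rpcrdma_physical_memreg_ops is gone. A hypothetical sketch of how rpcrdma_ia_open() might now choose a registration mode (the ri_ops assignment and the error handling are assumptions):

	if (frwr_is_supported(ia)) {
		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
	} else if (fmr_is_supported(ia)) {
		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
	} else {
		pr_err("rpcrdma: no supported memory registration mode\n");
		return -EINVAL;
	}
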
 
 /*
  * Endpoint calls - xprtrdma/verbs.c
@@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 
+void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
+
 struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
                                            size_t, gfp_t);
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,
@@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 
 int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
 
-int frwr_alloc_recovery_wq(void);
-void frwr_destroy_recovery_wq(void);
-
 int rpcrdma_alloc_wq(void);
 void rpcrdma_destroy_wq(void);
 
@@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void);
  * Wrappers for chunk registration, shared by read/write chunk code.
  */
 
-void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
-
 static inline enum dma_data_direction
 rpcrdma_data_dir(bool writing)
 {
        return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 }
 
-static inline void
-rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
-               enum dma_data_direction direction)
-{
-       seg->mr_dir = direction;
-       seg->mr_dmalen = seg->mr_len;
-
-       if (seg->mr_page)
-               seg->mr_dma = ib_dma_map_page(device,
-                               seg->mr_page, offset_in_page(seg->mr_offset),
-                               seg->mr_dmalen, seg->mr_dir);
-       else
-               seg->mr_dma = ib_dma_map_single(device,
-                               seg->mr_offset,
-                               seg->mr_dmalen, seg->mr_dir);
-
-       if (ib_dma_mapping_error(device, seg->mr_dma))
-               rpcrdma_mapping_error(seg);
-}
-
-static inline void
-rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
-{
-       if (seg->mr_page)
-               ib_dma_unmap_page(device,
-                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-       else
-               ib_dma_unmap_single(device,
-                                   seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-}
-
 /*
  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
  */
index 7e2b2fa189c340e7f0968aead6f878879e8a9a72..111767ab124aa4037dfe8c7040866d7196343292 100644 (file)
@@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &xprt_min_resvport_limit,
-               .extra2         = &xprt_max_resvport_limit
+               .extra2         = &xprt_max_resvport
        },
        {
                .procname       = "max_resvport",
@@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = {
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xprt_min_resvport_limit,
+               .extra1         = &xprt_min_resvport,
                .extra2         = &xprt_max_resvport_limit
        },
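
The corrected extra1/extra2 pointers make the two sysctls bound each other: min_resvport can no longer be raised above the current max_resvport, and max_resvport can no longer be dropped below the current min_resvport. A runnable userspace sketch of the proc_dointvec_minmax()-style check (the 665/1023 defaults are assumptions):

	#include <assert.h>

	static int write_minmax(unsigned int val, unsigned int lo,
				unsigned int hi, unsigned int *out)
	{
		if (val < lo || val > hi)
			return -1;	/* proc_dointvec_minmax() rejects the write */
		*out = val;
		return 0;
	}

	int main(void)
	{
		unsigned int min_port = 665, max_port = 1023;	/* assumed defaults */

		/* raising min above the current max is now refused ... */
		assert(write_minmax(1100, 1, max_port, &min_port) < 0);
		/* ... and so is lowering max below the current min */
		assert(write_minmax(600, min_port, 65535, &max_port) < 0);
		return 0;
	}
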
        {
@@ -642,6 +642,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
        struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
        struct xdr_buf *xdr = &req->rq_snd_buf;
        bool zerocopy = true;
+       bool vm_wait = false;
        int status;
        int sent;
 
@@ -677,15 +678,33 @@ static int xs_tcp_send_request(struct rpc_task *task)
                        return 0;
                }
 
+               WARN_ON_ONCE(sent == 0 && status == 0);
+
+               if (status == -EAGAIN ) {
+                       /*
+                        * Return EAGAIN if we're sure we're hitting the
+                        * socket send buffer limits.
+                        */
+                       if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
+                               break;
+                       /*
+                        * Did we hit a memory allocation failure?
+                        */
+                       if (sent == 0) {
+                               status = -ENOBUFS;
+                               if (vm_wait)
+                                       break;
+                               /* Retry, knowing now that we're below the
+                                * socket send buffer limit
+                                */
+                               vm_wait = true;
+                       }
+                       continue;
+               }
                if (status < 0)
                        break;
-               if (sent == 0) {
-                       status = -EAGAIN;
-                       break;
-               }
+               vm_wait = false;
        }
-       if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
-               status = -ENOBUFS;
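
The hunk above stops treating every short send as a reason to return -EAGAIN and instead distinguishes a full socket send buffer (SOCK_NOSPACE set) from an apparent memory-allocation failure, which is retried once via vm_wait before being reported as -ENOBUFS. A standalone restatement of that decision logic (handle_eagain() and the enum are invented names):

	#include <errno.h>
	#include <stdbool.h>

	enum next_step { GIVE_UP, RETRY };

	static enum next_step handle_eagain(int sent, bool sock_nospace,
					    bool *vm_wait, int *status)
	{
		if (sock_nospace)
			return GIVE_UP;		/* genuinely out of send buffer space */
		if (sent == 0) {
			*status = -ENOBUFS;	/* looks like kernel memory pressure */
			if (*vm_wait)
				return GIVE_UP;	/* second failure in a row */
			*vm_wait = true;	/* retry once under memory pressure */
		}
		return RETRY;
	}
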
 
        switch (status) {
        case -ENOTSOCK:
@@ -755,11 +774,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
        sk->sk_error_report = transport->old_error_report;
 }
 
+static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
+{
+       struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+       clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+}
+
 static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
 {
        smp_mb__before_atomic();
        clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
        clear_bit(XPRT_CLOSING, &xprt->state);
+       xs_sock_reset_state_flags(xprt);
        smp_mb__after_atomic();
 }
 
@@ -962,10 +989,13 @@ static void xs_local_data_receive(struct sock_xprt *transport)
                goto out;
        for (;;) {
                skb = skb_recv_datagram(sk, 0, 1, &err);
-               if (skb == NULL)
+               if (skb != NULL) {
+                       xs_local_data_read_skb(&transport->xprt, sk, skb);
+                       skb_free_datagram(sk, skb);
+                       continue;
+               }
+               if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
                        break;
-               xs_local_data_read_skb(&transport->xprt, sk, skb);
-               skb_free_datagram(sk, skb);
        }
 out:
        mutex_unlock(&transport->recv_mutex);
@@ -1043,10 +1073,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
                goto out;
        for (;;) {
                skb = skb_recv_datagram(sk, 0, 1, &err);
-               if (skb == NULL)
+               if (skb != NULL) {
+                       xs_udp_data_read_skb(&transport->xprt, sk, skb);
+                       skb_free_datagram(sk, skb);
+                       continue;
+               }
+               if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
                        break;
-               xs_udp_data_read_skb(&transport->xprt, sk, skb);
-               skb_free_datagram(sk, skb);
        }
 out:
        mutex_unlock(&transport->recv_mutex);
@@ -1074,7 +1107,14 @@ static void xs_data_ready(struct sock *sk)
        if (xprt != NULL) {
                struct sock_xprt *transport = container_of(xprt,
                                struct sock_xprt, xprt);
-               queue_work(rpciod_workqueue, &transport->recv_worker);
+               transport->old_data_ready(sk);
+               /* Any data means we had a useful conversation, so
+                * then we don't need to delay the next reconnect
+                */
+               if (xprt->reestablish_timeout)
+                       xprt->reestablish_timeout = 0;
+               if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+                       queue_work(xprtiod_workqueue, &transport->recv_worker);
        }
        read_unlock_bh(&sk->sk_callback_lock);
 }
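
xs_data_ready() now queues the receive worker only on the 0 -> 1 transition of XPRT_SOCK_DATA_READY, and the receive work functions clear and re-test the bit before giving up, so a wakeup that races with the end of a receive pass is not lost. Kernel-style fragments of that handshake (consume_pending() is an invented stand-in for the skb or tcp_read_sock() processing):

	/* ->sk_data_ready side: queue the work item only on the 0 -> 1 edge */
	if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
		queue_work(xprtiod_workqueue, &transport->recv_worker);

	/* receive work side: keep looping until no new "ready" edge was seen */
	for (;;) {
		if (consume_pending(transport))
			continue;
		if (!test_and_clear_bit(XPRT_SOCK_DATA_READY,
					&transport->sock_state))
			break;
	}
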
@@ -1474,10 +1514,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
        for (;;) {
                lock_sock(sk);
                read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
-               release_sock(sk);
-               if (read <= 0)
-                       break;
-               total += read;
+               if (read <= 0) {
+                       clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+                       release_sock(sk);
+                       if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+                               break;
+               } else {
+                       release_sock(sk);
+                       total += read;
+               }
                rd_desc.count = 65536;
        }
 out:
@@ -1492,34 +1537,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work)
        xs_tcp_data_receive(transport);
 }
 
-/**
- * xs_tcp_data_ready - "data ready" callback for TCP sockets
- * @sk: socket with data to read
- *
- */
-static void xs_tcp_data_ready(struct sock *sk)
-{
-       struct sock_xprt *transport;
-       struct rpc_xprt *xprt;
-
-       dprintk("RPC:       xs_tcp_data_ready...\n");
-
-       read_lock_bh(&sk->sk_callback_lock);
-       if (!(xprt = xprt_from_sock(sk)))
-               goto out;
-       transport = container_of(xprt, struct sock_xprt, xprt);
-
-       /* Any data means we had a useful conversation, so
-        * the we don't need to delay the next reconnect
-        */
-       if (xprt->reestablish_timeout)
-               xprt->reestablish_timeout = 0;
-       queue_work(rpciod_workqueue, &transport->recv_worker);
-
-out:
-       read_unlock_bh(&sk->sk_callback_lock);
-}
-
 /**
  * xs_tcp_state_change - callback to handle TCP socket state changes
  * @sk: socket whose state has changed
@@ -1714,7 +1731,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
 
 static unsigned short xs_get_random_port(void)
 {
-       unsigned short range = xprt_max_resvport - xprt_min_resvport;
+       unsigned short range = xprt_max_resvport - xprt_min_resvport + 1;
        unsigned short rand = (unsigned short) prandom_u32() % range;
        return rand + xprt_min_resvport;
 }
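
Without the "+ 1" the modulo range excluded xprt_max_resvport itself, so the highest reserved port could never be picked. A runnable sketch of the corrected arithmetic (665 and 1023 are the usual defaults and are assumed here; seeding of rand() is omitted):

	#include <stdio.h>
	#include <stdlib.h>

	static unsigned short pick_resvport(unsigned short min, unsigned short max)
	{
		unsigned short range = max - min + 1;	/* 1023 - 665 + 1 = 359 */

		return min + (unsigned short)(rand() % range);	/* 665..1023 inclusive */
	}

	int main(void)
	{
		printf("%u\n", (unsigned int)pick_resvport(665, 1023));
		return 0;
	}
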
@@ -2241,7 +2258,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                xs_save_old_callbacks(transport, sk);
 
                sk->sk_user_data = xprt;
-               sk->sk_data_ready = xs_tcp_data_ready;
+               sk->sk_data_ready = xs_data_ready;
                sk->sk_state_change = xs_tcp_state_change;
                sk->sk_write_space = xs_tcp_write_space;
                sock_set_flag(sk, SOCK_FASYNC);
@@ -2380,7 +2397,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
                /* Start by resetting any existing state */
                xs_reset_transport(transport);
 
-               queue_delayed_work(rpciod_workqueue,
+               queue_delayed_work(xprtiod_workqueue,
                                   &transport->connect_worker,
                                   xprt->reestablish_timeout);
                xprt->reestablish_timeout <<= 1;
@@ -2390,7 +2407,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
                        xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
        } else {
                dprintk("RPC:       xs_connect scheduled xprt %p\n", xprt);
-               queue_delayed_work(rpciod_workqueue,
+               queue_delayed_work(xprtiod_workqueue,
                                   &transport->connect_worker, 0);
        }
 }
@@ -3153,8 +3170,12 @@ static int param_set_uint_minmax(const char *val,
 
 static int param_set_portnr(const char *val, const struct kernel_param *kp)
 {
-       return param_set_uint_minmax(val, kp,
+       if (kp->arg == &xprt_min_resvport)
+               return param_set_uint_minmax(val, kp,
                        RPC_MIN_RESVPORT,
+                       xprt_max_resvport);
+       return param_set_uint_minmax(val, kp,
+                       xprt_min_resvport,
                        RPC_MAX_RESVPORT);
 }