Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 19 Dec 2014 04:10:44 +0000 (20:10 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 19 Dec 2014 04:10:44 +0000 (20:10 -0800)
Pull infiniband updates from Roland Dreier:
 "Main batch of InfiniBand/RDMA changes for 3.19:

   - On-demand paging support in core midlayer and mlx5 driver.  This
     lets userspace create non-pinned memory regions and have the
     adapter HW trigger page faults.
   - iSER and IPoIB updates and fixes.
   - Low-level HW driver updates for cxgb4, mlx4 and ocrdma.
   - Other miscellaneous fixes"
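
To make the ODP model concrete, here is a minimal userspace sketch (not part of this merge) of registering a non-pinned memory region. It assumes a libibverbs build that exposes IBV_ACCESS_ON_DEMAND, the userspace counterpart of the IB_ACCESS_ON_DEMAND flag this series adds to the kernel uapi; error handling is trimmed to the essentials.

#include <stdio.h>
#include <stdlib.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_device **devs = ibv_get_device_list(NULL);
	struct ibv_context *ctx;
	struct ibv_pd *pd;
	struct ibv_mr *mr;
	size_t len = 1UL << 20;
	void *buf = malloc(len);

	if (!devs || !devs[0] || !buf)
		return 1;
	ctx = ibv_open_device(devs[0]);
	if (!ctx)
		return 1;
	pd = ibv_alloc_pd(ctx);
	if (!pd)
		return 1;

	/* With IBV_ACCESS_ON_DEMAND the pages are not pinned up front;
	 * the HCA raises a page fault on first access instead. */
	mr = ibv_reg_mr(pd, buf, len,
			IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
			IBV_ACCESS_ON_DEMAND);
	if (!mr) {
		fprintf(stderr, "ODP registration refused by device/kernel\n");
		return 1;
	}

	ibv_dereg_mr(mr);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(devs);
	free(buf);
	return 0;
}

If the kernel or device lacks ODP support, the uverbs reg_mr path below rejects the flag with -EINVAL, so a caller can fall back to an ordinary pinned registration.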

* tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: (56 commits)
  IB/mlx5: Implement on demand paging by adding support for MMU notifiers
  IB/mlx5: Add support for RDMA read/write responder page faults
  IB/mlx5: Handle page faults
  IB/mlx5: Page faults handling infrastructure
  IB/mlx5: Add mlx5_ib_update_mtt to update page tables after creation
  IB/mlx5: Changes in memory region creation to support on-demand paging
  IB/mlx5: Implement the ODP capability query verb
  mlx5_core: Add support for page faults events and low level handling
  mlx5_core: Re-add MLX5_DEV_CAP_FLAG_ON_DMND_PG flag
  IB/srp: Allow newline separator for connection string
  IB/core: Implement support for MMU notifiers regarding on demand paging regions
  IB/core: Add support for on demand paging regions
  IB/core: Add flags for on demand paging support
  IB/core: Add support for extended query device caps
  IB/mlx5: Add function to read WQE from user-space
  IB/core: Add umem function to read data from user-space
  IB/core: Replace ib_umem's offset field with a full address
  IB/mlx5: Enhance UMR support to allow partial page table update
  IB/mlx5: Remove per-MR pas and dma pointers
  RDMA/ocrdma: Always resolve destination mac from GRH for UD QPs
  ...

53 files changed:
drivers/infiniband/Kconfig
drivers/infiniband/core/Makefile
drivers/infiniband/core/addr.c
drivers/infiniband/core/multicast.c
drivers/infiniband/core/umem.c
drivers/infiniband/core/umem_odp.c [new file with mode: 0644]
drivers/infiniband/core/umem_rbtree.c [new file with mode: 0644]
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/amso1100/c2_provider.c
drivers/infiniband/hw/cxgb4/cm.c
drivers/infiniband/hw/cxgb4/device.c
drivers/infiniband/hw/cxgb4/mem.c
drivers/infiniband/hw/cxgb4/qp.c
drivers/infiniband/hw/ehca/ehca_mrmw.c
drivers/infiniband/hw/ipath/ipath_mr.c
drivers/infiniband/hw/mlx4/mr.c
drivers/infiniband/hw/mlx5/Makefile
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mem.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/odp.c [new file with mode: 0644]
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/nes/nes_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_ah.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/qib/qib_mr.c
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_initiator.c
drivers/infiniband/ulp/iser/iser_memory.c
drivers/infiniband/ulp/iser/iser_verbs.c
drivers/infiniband/ulp/srp/ib_srp.c
drivers/net/ethernet/mellanox/mlx4/main.c
drivers/net/ethernet/mellanox/mlx5/core/eq.c
drivers/net/ethernet/mellanox/mlx5/core/fw.c
drivers/net/ethernet/mellanox/mlx5/core/qp.c
include/linux/mlx5/device.h
include/linux/mlx5/driver.h
include/linux/mlx5/qp.h
include/rdma/ib_umem.h
include/rdma/ib_umem_odp.h [new file with mode: 0644]
include/rdma/ib_verbs.h
include/uapi/rdma/ib_user_verbs.h

index 77089399359b70bbcdccabc264c5e4e8089a1bdb..b899531498eb0dc7924e4587f67dbc3f7c116313 100644 (file)
@@ -38,6 +38,17 @@ config INFINIBAND_USER_MEM
        depends on INFINIBAND_USER_ACCESS != n
        default y
 
+config INFINIBAND_ON_DEMAND_PAGING
+       bool "InfiniBand on-demand paging support"
+       depends on INFINIBAND_USER_MEM
+       select MMU_NOTIFIER
+       default y
+       ---help---
+         On demand paging support for the InfiniBand subsystem.
+         Together with driver support this allows registration of
+         memory regions without pinning their pages, fetching the
+         pages on demand instead.
+
 config INFINIBAND_ADDR_TRANS
        bool
        depends on INFINIBAND
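
For reference, a .config fragment that enables the new option (a sketch; the option defaults to y once INFINIBAND_USER_MEM is available, and it pulls in MMU_NOTIFIER via select):

CONFIG_INFINIBAND=m
CONFIG_INFINIBAND_USER_ACCESS=m
CONFIG_INFINIBAND_USER_MEM=y
CONFIG_INFINIBAND_ON_DEMAND_PAGING=y
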
index ffd0af6734af63120b78aa8402f2f48a1ec6fc20..acf73676444593704267ac9176696f95caa52335 100644 (file)
@@ -11,6 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=       ib_uverbs.o ib_ucm.o \
 ib_core-y :=                   packer.o ud_header.o verbs.o sysfs.o \
                                device.o fmr_pool.o cache.o netlink.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
 ib_mad-y :=                    mad.o smi.o agent.o mad_rmpp.o
 
index 8172d37f9addae8485041ad0cabc1d4261097647..f80da50d84a5b6585a10656b369bcc33aedd0507 100644 (file)
@@ -176,8 +176,8 @@ static void set_timeout(unsigned long time)
        unsigned long delay;
 
        delay = time - jiffies;
-       if ((long)delay <= 0)
-               delay = 1;
+       if ((long)delay < 0)
+               delay = 0;
 
        mod_delayed_work(addr_wq, &work, delay);
 }
index d2360a8ef0b29ea6d6c9a3bebc05cf07b087a02f..fa17b552ff78bc16b1547215682f26649f5af7ac 100644 (file)
@@ -525,17 +525,22 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
        if (status)
                process_join_error(group, status);
        else {
+               int mgids_changed, is_mgid0;
                ib_find_pkey(group->port->dev->device, group->port->port_num,
                             be16_to_cpu(rec->pkey), &pkey_index);
 
                spin_lock_irq(&group->port->lock);
-               group->rec = *rec;
                if (group->state == MCAST_BUSY &&
                    group->pkey_index == MCAST_INVALID_PKEY_INDEX)
                        group->pkey_index = pkey_index;
-               if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
+               mgids_changed = memcmp(&rec->mgid, &group->rec.mgid,
+                                      sizeof(group->rec.mgid));
+               group->rec = *rec;
+               if (mgids_changed) {
                        rb_erase(&group->node, &group->port->table);
-                       mcast_insert(group->port, group, 1);
+                       is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
+                                          sizeof(mgid0));
+                       mcast_insert(group->port, group, is_mgid0);
                }
                spin_unlock_irq(&group->port->lock);
        }
index df0c4f605a219c1f3f5cd5faabeadc705622b65e..aec7a6aa2951db47bc6b5be969a29d1867688b23 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/hugetlb.h>
 #include <linux/dma-attrs.h>
 #include <linux/slab.h>
+#include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
@@ -69,6 +70,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
+ *
+ * If access flags indicate ODP memory, avoid pinning. Instead, store
+ * the mm for future page fault handling in conjunction with MMU notifiers.
+ *
  * @context: userspace context to pin memory for
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
@@ -103,17 +108,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 
        umem->context   = context;
        umem->length    = size;
-       umem->offset    = addr & ~PAGE_MASK;
+       umem->address   = addr;
        umem->page_size = PAGE_SIZE;
        umem->pid       = get_task_pid(current, PIDTYPE_PID);
        /*
-        * We ask for writable memory if any access flags other than
-        * "remote read" are set.  "Local write" and "remote write"
+        * We ask for writable memory if any of the following
+        * access flags are set.  "Local write" and "remote write"
         * obviously require write access.  "Remote atomic" can do
         * things like fetch and add, which will modify memory, and
         * "MW bind" can change permissions by binding a window.
         */
-       umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
+       umem->writable  = !!(access &
+               (IB_ACCESS_LOCAL_WRITE   | IB_ACCESS_REMOTE_WRITE |
+                IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
+
+       if (access & IB_ACCESS_ON_DEMAND) {
+               ret = ib_umem_odp_get(context, umem);
+               if (ret) {
+                       kfree(umem);
+                       return ERR_PTR(ret);
+               }
+               return umem;
+       }
+
+       umem->odp_data = NULL;
 
        /* We assume the memory is from hugetlb until proved otherwise */
        umem->hugetlb   = 1;
@@ -132,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
        if (!vma_list)
                umem->hugetlb = 0;
 
-       npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
+       npages = ib_umem_num_pages(umem);
 
        down_write(&current->mm->mmap_sem);
 
@@ -235,6 +253,11 @@ void ib_umem_release(struct ib_umem *umem)
        struct task_struct *task;
        unsigned long diff;
 
+       if (umem->odp_data) {
+               ib_umem_odp_release(umem);
+               return;
+       }
+
        __ib_umem_release(umem->context->device, umem, 1);
 
        task = get_pid_task(umem->pid, PIDTYPE_PID);
@@ -246,7 +269,7 @@ void ib_umem_release(struct ib_umem *umem)
        if (!mm)
                goto out;
 
-       diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+       diff = ib_umem_num_pages(umem);
 
        /*
         * We may be called with the mm's mmap_sem already held.  This
@@ -283,6 +306,9 @@ int ib_umem_page_count(struct ib_umem *umem)
        int n;
        struct scatterlist *sg;
 
+       if (umem->odp_data)
+               return ib_umem_num_pages(umem);
+
        shift = ilog2(umem->page_size);
 
        n = 0;
@@ -292,3 +318,37 @@ int ib_umem_page_count(struct ib_umem *umem)
        return n;
 }
 EXPORT_SYMBOL(ib_umem_page_count);
+
+/*
+ * Copy from the given ib_umem's pages to the given buffer.
+ *
+ * umem - the umem to copy from
+ * offset - offset to start copying from
+ * dst - destination buffer
+ * length - buffer length
+ *
+ * Returns 0 on success, or an error code.
+ */
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+                     size_t length)
+{
+       size_t end = offset + length;
+       int ret;
+
+       if (offset > umem->length || length > umem->length - offset) {
+               pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
+                      offset, umem->length, end);
+               return -EINVAL;
+       }
+
+       ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
+                                offset + ib_umem_offset(umem));
+
+       if (ret < 0)
+               return ret;
+       else if (ret != length)
+               return -EINVAL;
+       else
+               return 0;
+}
+EXPORT_SYMBOL(ib_umem_copy_from);
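
The ib_umem_copy_from() helper added above exists for drivers that must read work-queue contents out of user memory during fault handling (see "IB/mlx5: Add function to read WQE from user-space" in the commit list). A hedged caller sketch; the function name and parameters here are illustrative, not the mlx5 code:

/* Hypothetical driver helper: pull one WQE out of a user-space queue
 * backed by @umem. ib_umem_copy_from() bounds-checks offset + length
 * against the umem and returns 0 on success or a negative errno. */
static int read_user_wqe(struct ib_umem *umem, size_t wqe_offset,
			 void *buf, size_t wqe_size)
{
	return ib_umem_copy_from(buf, umem, wqe_offset, wqe_size);
}
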
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
new file mode 100644 (file)
index 0000000..6095872
--- /dev/null
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+
+static void ib_umem_notifier_start_account(struct ib_umem *item)
+{
+       mutex_lock(&item->odp_data->umem_mutex);
+
+       /* Only update private counters for this umem if it has them.
+        * Otherwise skip it. All page faults will be delayed for this umem. */
+       if (item->odp_data->mn_counters_active) {
+               int notifiers_count = item->odp_data->notifiers_count++;
+
+               if (notifiers_count == 0)
+                       /* Initialize the completion object for waiting on
+                        * notifiers. Since notifier_count is zero, no one
+                        * should be waiting right now. */
+                       reinit_completion(&item->odp_data->notifier_completion);
+       }
+       mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+static void ib_umem_notifier_end_account(struct ib_umem *item)
+{
+       mutex_lock(&item->odp_data->umem_mutex);
+
+       /* Only update private counters for this umem if it has them.
+        * Otherwise skip it. All page faults will be delayed for this umem. */
+       if (item->odp_data->mn_counters_active) {
+               /*
+                * This sequence increase will notify the QP page fault
+                * handler that the page that is going to be mapped in the
+                * spte could have been freed.
+                */
+               ++item->odp_data->notifiers_seq;
+               if (--item->odp_data->notifiers_count == 0)
+                       complete_all(&item->odp_data->notifier_completion);
+       }
+       mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+/* Account for a new mmu notifier in an ib_ucontext. */
+static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
+{
+       atomic_inc(&context->notifier_count);
+}
+
+/* Account for a terminating mmu notifier in an ib_ucontext.
+ *
+ * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
+ * the function takes the semaphore itself. */
+static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
+{
+       int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
+
+       if (zero_notifiers &&
+           !list_empty(&context->no_private_counters)) {
+               /* No currently running mmu notifiers. Now is the chance to
+                * add private accounting to all previously added umems. */
+               struct ib_umem_odp *odp_data, *next;
+
+               /* Prevent concurrent mmu notifiers from working on the
+                * no_private_counters list. */
+               down_write(&context->umem_rwsem);
+
+               /* Read the notifier_count again, with the umem_rwsem
+                * semaphore taken for write. */
+               if (!atomic_read(&context->notifier_count)) {
+                       list_for_each_entry_safe(odp_data, next,
+                                                &context->no_private_counters,
+                                                no_private_counters) {
+                               mutex_lock(&odp_data->umem_mutex);
+                               odp_data->mn_counters_active = true;
+                               list_del(&odp_data->no_private_counters);
+                               complete_all(&odp_data->notifier_completion);
+                               mutex_unlock(&odp_data->umem_mutex);
+                       }
+               }
+
+               up_write(&context->umem_rwsem);
+       }
+}
+
+static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
+                                              u64 end, void *cookie) {
+       /*
+        * Increase the number of notifiers running, to
+        * prevent any further fault handling on this MR.
+        */
+       ib_umem_notifier_start_account(item);
+       item->odp_data->dying = 1;
+       /* Make sure that the fact that the umem is dying is visible before
+        * we release all pending page faults. */
+       smp_wmb();
+       complete_all(&item->odp_data->notifier_completion);
+       item->context->invalidate_range(item, ib_umem_start(item),
+                                       ib_umem_end(item));
+       return 0;
+}
+
+static void ib_umem_notifier_release(struct mmu_notifier *mn,
+                                    struct mm_struct *mm)
+{
+       struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+       if (!context->invalidate_range)
+               return;
+
+       ib_ucontext_notifier_start_account(context);
+       down_read(&context->umem_rwsem);
+       rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
+                                     ULLONG_MAX,
+                                     ib_umem_notifier_release_trampoline,
+                                     NULL);
+       up_read(&context->umem_rwsem);
+}
+
+static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
+                                     u64 end, void *cookie)
+{
+       ib_umem_notifier_start_account(item);
+       item->context->invalidate_range(item, start, start + PAGE_SIZE);
+       ib_umem_notifier_end_account(item);
+       return 0;
+}
+
+static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
+                                            struct mm_struct *mm,
+                                            unsigned long address)
+{
+       struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+       if (!context->invalidate_range)
+               return;
+
+       ib_ucontext_notifier_start_account(context);
+       down_read(&context->umem_rwsem);
+       rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
+                                     address + PAGE_SIZE,
+                                     invalidate_page_trampoline, NULL);
+       up_read(&context->umem_rwsem);
+       ib_ucontext_notifier_end_account(context);
+}
+
+static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
+                                            u64 end, void *cookie)
+{
+       ib_umem_notifier_start_account(item);
+       item->context->invalidate_range(item, start, end);
+       return 0;
+}
+
+static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+                                                   struct mm_struct *mm,
+                                                   unsigned long start,
+                                                   unsigned long end)
+{
+       struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+       if (!context->invalidate_range)
+               return;
+
+       ib_ucontext_notifier_start_account(context);
+       down_read(&context->umem_rwsem);
+       rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+                                     end,
+                                     invalidate_range_start_trampoline, NULL);
+       up_read(&context->umem_rwsem);
+}
+
+static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
+                                          u64 end, void *cookie)
+{
+       ib_umem_notifier_end_account(item);
+       return 0;
+}
+
+static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
+                                                 struct mm_struct *mm,
+                                                 unsigned long start,
+                                                 unsigned long end)
+{
+       struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+       if (!context->invalidate_range)
+               return;
+
+       down_read(&context->umem_rwsem);
+       rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+                                     end,
+                                     invalidate_range_end_trampoline, NULL);
+       up_read(&context->umem_rwsem);
+       ib_ucontext_notifier_end_account(context);
+}
+
+static struct mmu_notifier_ops ib_umem_notifiers = {
+       .release                    = ib_umem_notifier_release,
+       .invalidate_page            = ib_umem_notifier_invalidate_page,
+       .invalidate_range_start     = ib_umem_notifier_invalidate_range_start,
+       .invalidate_range_end       = ib_umem_notifier_invalidate_range_end,
+};
+
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
+{
+       int ret_val;
+       struct pid *our_pid;
+       struct mm_struct *mm = get_task_mm(current);
+
+       if (!mm)
+               return -EINVAL;
+
+       /* Prevent creating ODP MRs in child processes */
+       rcu_read_lock();
+       our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
+       rcu_read_unlock();
+       put_pid(our_pid);
+       if (context->tgid != our_pid) {
+               ret_val = -EINVAL;
+               goto out_mm;
+       }
+
+       umem->hugetlb = 0;
+       umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
+       if (!umem->odp_data) {
+               ret_val = -ENOMEM;
+               goto out_mm;
+       }
+       umem->odp_data->umem = umem;
+
+       mutex_init(&umem->odp_data->umem_mutex);
+
+       init_completion(&umem->odp_data->notifier_completion);
+
+       umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
+                                           sizeof(*umem->odp_data->page_list));
+       if (!umem->odp_data->page_list) {
+               ret_val = -ENOMEM;
+               goto out_odp_data;
+       }
+
+       umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
+                                         sizeof(*umem->odp_data->dma_list));
+       if (!umem->odp_data->dma_list) {
+               ret_val = -ENOMEM;
+               goto out_page_list;
+       }
+
+       /*
+        * When using MMU notifiers, we will get a
+        * notification before the "current" task (and MM) is
+        * destroyed. We use the umem_rwsem semaphore to synchronize.
+        */
+       down_write(&context->umem_rwsem);
+       context->odp_mrs_count++;
+       if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+               rbt_ib_umem_insert(&umem->odp_data->interval_tree,
+                                  &context->umem_tree);
+       if (likely(!atomic_read(&context->notifier_count)))
+               umem->odp_data->mn_counters_active = true;
+       else
+               list_add(&umem->odp_data->no_private_counters,
+                        &context->no_private_counters);
+       downgrade_write(&context->umem_rwsem);
+
+       if (context->odp_mrs_count == 1) {
+               /*
+                * Note that at this point, no MMU notifier is running
+                * for this context!
+                */
+               atomic_set(&context->notifier_count, 0);
+               INIT_HLIST_NODE(&context->mn.hlist);
+               context->mn.ops = &ib_umem_notifiers;
+               /*
+                * Lock-dep detects a false positive for mmap_sem vs.
+                * umem_rwsem, due to not grasping downgrade_write correctly.
+                */
+               lockdep_off();
+               ret_val = mmu_notifier_register(&context->mn, mm);
+               lockdep_on();
+               if (ret_val) {
+                       pr_err("Failed to register mmu_notifier %d\n", ret_val);
+                       ret_val = -EBUSY;
+                       goto out_mutex;
+               }
+       }
+
+       up_read(&context->umem_rwsem);
+
+       /*
+        * Note that doing an mmput can cause a notifier for the relevant mm.
+        * If the notifier is called while we hold the umem_rwsem, this will
+        * cause a deadlock. Therefore, we release the reference only after we
+        * released the semaphore.
+        */
+       mmput(mm);
+       return 0;
+
+out_mutex:
+       up_read(&context->umem_rwsem);
+       vfree(umem->odp_data->dma_list);
+out_page_list:
+       vfree(umem->odp_data->page_list);
+out_odp_data:
+       kfree(umem->odp_data);
+out_mm:
+       mmput(mm);
+       return ret_val;
+}
+
+void ib_umem_odp_release(struct ib_umem *umem)
+{
+       struct ib_ucontext *context = umem->context;
+
+       /*
+        * Ensure that no more pages are mapped in the umem.
+        *
+        * It is the driver's responsibility to ensure, before calling us,
+        * that the hardware will not attempt to access the MR any more.
+        */
+       ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
+                                   ib_umem_end(umem));
+
+       down_write(&context->umem_rwsem);
+       if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+               rbt_ib_umem_remove(&umem->odp_data->interval_tree,
+                                  &context->umem_tree);
+       context->odp_mrs_count--;
+       if (!umem->odp_data->mn_counters_active) {
+               list_del(&umem->odp_data->no_private_counters);
+               complete_all(&umem->odp_data->notifier_completion);
+       }
+
+       /*
+        * Downgrade the lock to a read lock. This ensures that the notifiers
+        * (which take the semaphore for reading) will be able to finish, and
+        * we will be able to eventually obtain the mmu notifiers SRCU. Note
+        * that since we are doing it atomically, no other user could register
+        * and unregister while we do the check.
+        */
+       downgrade_write(&context->umem_rwsem);
+       if (!context->odp_mrs_count) {
+               struct task_struct *owning_process = NULL;
+               struct mm_struct *owning_mm        = NULL;
+
+               owning_process = get_pid_task(context->tgid,
+                                             PIDTYPE_PID);
+               if (owning_process == NULL)
+                       /*
+                        * The process is already dead; the notifiers were
+                        * already removed.
+                        */
+                       goto out;
+
+               owning_mm = get_task_mm(owning_process);
+               if (owning_mm == NULL)
+                       /*
+                        * The process' mm is already dead; the notifiers
+                        * were already removed.
+                        */
+                       goto out_put_task;
+               mmu_notifier_unregister(&context->mn, owning_mm);
+
+               mmput(owning_mm);
+
+out_put_task:
+               put_task_struct(owning_process);
+       }
+out:
+       up_read(&context->umem_rwsem);
+
+       vfree(umem->odp_data->dma_list);
+       vfree(umem->odp_data->page_list);
+       kfree(umem->odp_data);
+       kfree(umem);
+}
+
+/*
+ * Map for DMA and insert a single page into the on-demand paging page tables.
+ *
+ * @umem: the umem to insert the page to.
+ * @page_index: index in the umem to add the page to.
+ * @page: the page struct to map and add.
+ * @access_mask: access permissions needed for this page.
+ * @current_seq: sequence number for synchronization with invalidations.
+ *               the sequence number is taken from
+ *               umem->odp_data->notifiers_seq.
+ *
+ * The function returns -EFAULT if the DMA mapping operation fails. It returns
+ * -EAGAIN if a concurrent invalidation prevents us from updating the page.
+ *
+ * The page is released via put_page even if the operation failed. For
+ * on-demand pinning, the page is released whenever it isn't stored in the
+ * umem.
+ */
+static int ib_umem_odp_map_dma_single_page(
+               struct ib_umem *umem,
+               int page_index,
+               u64 base_virt_addr,
+               struct page *page,
+               u64 access_mask,
+               unsigned long current_seq)
+{
+       struct ib_device *dev = umem->context->device;
+       dma_addr_t dma_addr;
+       int stored_page = 0;
+       int remove_existing_mapping = 0;
+       int ret = 0;
+
+       mutex_lock(&umem->odp_data->umem_mutex);
+       /*
+        * Note: we avoid writing if seq is different from the initial seq, to
+        * handle the case of a racing notifier. This check also lets us bail
+        * early if we have a notifier running in parallel with us.
+        */
+       if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
+               ret = -EAGAIN;
+               goto out;
+       }
+       if (!(umem->odp_data->dma_list[page_index])) {
+               dma_addr = ib_dma_map_page(dev,
+                                          page,
+                                          0, PAGE_SIZE,
+                                          DMA_BIDIRECTIONAL);
+               if (ib_dma_mapping_error(dev, dma_addr)) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+               umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
+               umem->odp_data->page_list[page_index] = page;
+               stored_page = 1;
+       } else if (umem->odp_data->page_list[page_index] == page) {
+               umem->odp_data->dma_list[page_index] |= access_mask;
+       } else {
+               pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
+                      umem->odp_data->page_list[page_index], page);
+               /* Better remove the mapping now, to prevent any further
+                * damage. */
+               remove_existing_mapping = 1;
+       }
+
+out:
+       mutex_unlock(&umem->odp_data->umem_mutex);
+
+       /* On Demand Paging - avoid pinning the page */
+       if (umem->context->invalidate_range || !stored_page)
+               put_page(page);
+
+       if (remove_existing_mapping && umem->context->invalidate_range) {
+               invalidate_page_trampoline(
+                       umem,
+                       base_virt_addr + (page_index * PAGE_SIZE),
+                       base_virt_addr + ((page_index+1)*PAGE_SIZE),
+                       NULL);
+               ret = -EAGAIN;
+       }
+
+       return ret;
+}
+
+/**
+ * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
+ *
+ * Pins the range of pages passed in the argument, and maps them to
+ * DMA addresses. The DMA addresses of the mapped pages are updated in
+ * umem->odp_data->dma_list.
+ *
+ * Returns the number of pages mapped on success, or a negative error
+ * code on failure.
+ * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
+ * the function from completing its task.
+ *
+ * @umem: the umem to map and pin
+ * @user_virt: the address from which we need to map.
+ * @bcnt: the minimal number of bytes to pin and map. The mapping might be
+ *        bigger due to alignment, and may also be smaller in case of an error
+ *        pinning or mapping a page. The actual number of pages mapped
+ *        is returned in the return value.
+ * @access_mask: bit mask of the requested access permissions for the given
+ *               range.
+ * @current_seq: the MMU notifiers sequence value for synchronization with
+ *               invalidations. The sequence number is read from
+ *               umem->odp_data->notifiers_seq before calling this function.
+ */
+int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
+                             u64 access_mask, unsigned long current_seq)
+{
+       struct task_struct *owning_process  = NULL;
+       struct mm_struct   *owning_mm       = NULL;
+       struct page       **local_page_list = NULL;
+       u64 off;
+       int j, k, ret = 0, start_idx, npages = 0;
+       u64 base_virt_addr;
+
+       if (access_mask == 0)
+               return -EINVAL;
+
+       if (user_virt < ib_umem_start(umem) ||
+           user_virt + bcnt > ib_umem_end(umem))
+               return -EFAULT;
+
+       local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
+       if (!local_page_list)
+               return -ENOMEM;
+
+       off = user_virt & (~PAGE_MASK);
+       user_virt = user_virt & PAGE_MASK;
+       base_virt_addr = user_virt;
+       bcnt += off; /* Charge for the first page offset as well. */
+
+       owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
+       if (owning_process == NULL) {
+               ret = -EINVAL;
+               goto out_no_task;
+       }
+
+       owning_mm = get_task_mm(owning_process);
+       if (owning_mm == NULL) {
+               ret = -EINVAL;
+               goto out_put_task;
+       }
+
+       start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
+       k = start_idx;
+
+       while (bcnt > 0) {
+               const size_t gup_num_pages =
+                       min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
+                             PAGE_SIZE / sizeof(struct page *));
+
+               down_read(&owning_mm->mmap_sem);
+               /*
+                * Note: this might result in redundant page getting. We can
+                * avoid this by checking that dma_list is 0 before calling
+                * get_user_pages. However, this makes the code much more
+                * complex (and doesn't gain us much performance in most use
+                * cases).
+                */
+               npages = get_user_pages(owning_process, owning_mm, user_virt,
+                                       gup_num_pages,
+                                       access_mask & ODP_WRITE_ALLOWED_BIT, 0,
+                                       local_page_list, NULL);
+               up_read(&owning_mm->mmap_sem);
+
+               if (npages < 0)
+                       break;
+
+               bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
+               user_virt += npages << PAGE_SHIFT;
+               for (j = 0; j < npages; ++j) {
+                       ret = ib_umem_odp_map_dma_single_page(
+                               umem, k, base_virt_addr, local_page_list[j],
+                               access_mask, current_seq);
+                       if (ret < 0)
+                               break;
+                       k++;
+               }
+
+               if (ret < 0) {
+                       /* Release left over pages when handling errors. */
+                       for (++j; j < npages; ++j)
+                               put_page(local_page_list[j]);
+                       break;
+               }
+       }
+
+       if (ret >= 0) {
+               if (npages < 0 && k == start_idx)
+                       ret = npages;
+               else
+                       ret = k - start_idx;
+       }
+
+       mmput(owning_mm);
+out_put_task:
+       put_task_struct(owning_process);
+out_no_task:
+       free_page((unsigned long)local_page_list);
+       return ret;
+}
+EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
+
+void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
+                                u64 bound)
+{
+       int idx;
+       u64 addr;
+       struct ib_device *dev = umem->context->device;
+
+       virt  = max_t(u64, virt,  ib_umem_start(umem));
+       bound = min_t(u64, bound, ib_umem_end(umem));
+       /* Note that during the run of this function, the
+        * notifiers_count of the MR is > 0, preventing any racing
+        * faults from completing. We might be racing with other
+        * invalidations, so we must make sure we free each page only
+        * once. */
+       for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
+               idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+               mutex_lock(&umem->odp_data->umem_mutex);
+               if (umem->odp_data->page_list[idx]) {
+                       struct page *page = umem->odp_data->page_list[idx];
+                       struct page *head_page = compound_head(page);
+                       dma_addr_t dma = umem->odp_data->dma_list[idx];
+                       dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
+
+                       WARN_ON(!dma_addr);
+
+                       ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
+                                         DMA_BIDIRECTIONAL);
+                       if (dma & ODP_WRITE_ALLOWED_BIT)
+                               /*
+                                * set_page_dirty prefers being called with
+                                * the page lock. However, MMU notifiers are
+                                * called sometimes with and sometimes without
+                                * the lock. We rely on the umem_mutex instead
+                                * to prevent other mmu notifiers from
+                                * continuing and allowing the page mapping to
+                                * be removed.
+                                */
+                               set_page_dirty(head_page);
+                       /* on demand pinning support */
+                       if (!umem->context->invalidate_range)
+                               put_page(page);
+                       umem->odp_data->page_list[idx] = NULL;
+                       umem->odp_data->dma_list[idx] = 0;
+               }
+               mutex_unlock(&umem->odp_data->umem_mutex);
+       }
+}
+EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
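
To show how the exports above are meant to be driven, here is a sketch of the retry loop the comments describe (hypothetical driver code, not part of this commit): sample notifiers_seq before faulting pages in, and restart when a racing invalidation makes ib_umem_odp_map_dma_pages() return -EAGAIN. The ODP_*_ALLOWED_BIT access-mask bits are assumed from the new ib_umem_odp.h.

static int fault_user_range(struct ib_umem *umem, u64 io_virt, u64 bcnt)
{
	unsigned long current_seq;
	int npages;

retry:
	current_seq = ACCESS_ONCE(umem->odp_data->notifiers_seq);
	/* Pairs with the increment in ib_umem_notifier_end_account():
	 * the sequence must be sampled before the pages are mapped, so
	 * that ib_umem_mmu_notifier_retry() can detect an invalidation
	 * that ran in between. */
	smp_rmb();

	npages = ib_umem_odp_map_dma_pages(umem, io_virt, bcnt,
					   ODP_READ_ALLOWED_BIT |
					   ODP_WRITE_ALLOWED_BIT,
					   current_seq);
	if (npages == -EAGAIN)
		goto retry;	/* a real driver would bound its retries */
	return npages;
}
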
diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
new file mode 100644 (file)
index 0000000..727d788
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interval_tree_generic.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <rdma/ib_umem_odp.h>
+
+/*
+ * The ib_umem list keeps track of memory regions for which the HW
+ * device requested to receive notifications when the related memory
+ * mapping is changed.
+ *
+ * ib_umem_lock protects the list.
+ */
+
+static inline u64 node_start(struct umem_odp_node *n)
+{
+       struct ib_umem_odp *umem_odp =
+                       container_of(n, struct ib_umem_odp, interval_tree);
+
+       return ib_umem_start(umem_odp->umem);
+}
+
+/* Note that the representation of the intervals in the interval tree
+ * considers the ending point as contained in the interval, while the
+ * function ib_umem_end returns the first address which is not contained
+ * in the umem.
+ */
+static inline u64 node_last(struct umem_odp_node *n)
+{
+       struct ib_umem_odp *umem_odp =
+                       container_of(n, struct ib_umem_odp, interval_tree);
+
+       return ib_umem_end(umem_odp->umem) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
+                    node_start, node_last, , rbt_ib_umem)
+
+/* @last is not a part of the interval. See comment for function
+ * node_last.
+ */
+int rbt_ib_umem_for_each_in_range(struct rb_root *root,
+                                 u64 start, u64 last,
+                                 umem_call_back cb,
+                                 void *cookie)
+{
+       int ret_val = 0;
+       struct umem_odp_node *node;
+       struct ib_umem_odp *umem;
+
+       if (unlikely(start == last))
+               return ret_val;
+
+       for (node = rbt_ib_umem_iter_first(root, start, last - 1); node;
+                       node = rbt_ib_umem_iter_next(node, start, last - 1)) {
+               umem = container_of(node, struct ib_umem_odp, interval_tree);
+               ret_val = cb(umem->umem, start, last, cookie) || ret_val;
+       }
+
+       return ret_val;
+}
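
A minimal sketch of a consumer of the new iterator (illustrative only; the real users are the notifier trampolines in umem_odp.c above). The callback matches the umem_call_back signature from ib_umem_odp.h, and the caller holds umem_rwsem for read, as the notifiers do:

static int count_cb(struct ib_umem *item, u64 start, u64 end, void *cookie)
{
	(*(int *)cookie)++;	/* count each overlapping umem */
	return 0;
}

static int count_overlapping(struct ib_ucontext *context, u64 start, u64 end)
{
	int n = 0;

	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end,
				      count_cb, &n);
	up_read(&context->umem_rwsem);
	return n;	/* end is exclusive, per the node_last() comment */
}
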
index 643c08a025a52d015431b8a27be1ddcacbd36845..b716b08156446e186c9ae608f3f4e6343c6f200f 100644 (file)
@@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
 
 IB_UVERBS_DECLARE_EX_CMD(create_flow);
 IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
+IB_UVERBS_DECLARE_EX_CMD(query_device);
 
 #endif /* UVERBS_H */
index 5ba2a86aab6a8ac4a5b6f5693b6627d0832a5f6c..532d8eba8b0203ab65a2a8ed1f096389253b1dca 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 #include <asm/uaccess.h>
 
@@ -288,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
        struct ib_uverbs_get_context_resp resp;
        struct ib_udata                   udata;
        struct ib_device                 *ibdev = file->device->ib_dev;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       struct ib_device_attr             dev_attr;
+#endif
        struct ib_ucontext               *ucontext;
        struct file                      *filp;
        int ret;
@@ -325,8 +329,25 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
        INIT_LIST_HEAD(&ucontext->ah_list);
        INIT_LIST_HEAD(&ucontext->xrcd_list);
        INIT_LIST_HEAD(&ucontext->rule_list);
+       rcu_read_lock();
+       ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+       rcu_read_unlock();
        ucontext->closing = 0;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       ucontext->umem_tree = RB_ROOT;
+       init_rwsem(&ucontext->umem_rwsem);
+       ucontext->odp_mrs_count = 0;
+       INIT_LIST_HEAD(&ucontext->no_private_counters);
+
+       ret = ib_query_device(ibdev, &dev_attr);
+       if (ret)
+               goto err_free;
+       if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+               ucontext->invalidate_range = NULL;
+
+#endif
+
        resp.num_comp_vectors = file->device->num_comp_vectors;
 
        ret = get_unused_fd_flags(O_CLOEXEC);
@@ -371,6 +392,7 @@ err_fd:
        put_unused_fd(resp.async_fd);
 
 err_free:
+       put_pid(ucontext->tgid);
        ibdev->dealloc_ucontext(ucontext);
 
 err:
@@ -378,6 +400,52 @@ err:
        return ret;
 }
 
+static void copy_query_dev_fields(struct ib_uverbs_file *file,
+                                 struct ib_uverbs_query_device_resp *resp,
+                                 struct ib_device_attr *attr)
+{
+       resp->fw_ver            = attr->fw_ver;
+       resp->node_guid         = file->device->ib_dev->node_guid;
+       resp->sys_image_guid    = attr->sys_image_guid;
+       resp->max_mr_size       = attr->max_mr_size;
+       resp->page_size_cap     = attr->page_size_cap;
+       resp->vendor_id         = attr->vendor_id;
+       resp->vendor_part_id    = attr->vendor_part_id;
+       resp->hw_ver            = attr->hw_ver;
+       resp->max_qp            = attr->max_qp;
+       resp->max_qp_wr         = attr->max_qp_wr;
+       resp->device_cap_flags  = attr->device_cap_flags;
+       resp->max_sge           = attr->max_sge;
+       resp->max_sge_rd        = attr->max_sge_rd;
+       resp->max_cq            = attr->max_cq;
+       resp->max_cqe           = attr->max_cqe;
+       resp->max_mr            = attr->max_mr;
+       resp->max_pd            = attr->max_pd;
+       resp->max_qp_rd_atom    = attr->max_qp_rd_atom;
+       resp->max_ee_rd_atom    = attr->max_ee_rd_atom;
+       resp->max_res_rd_atom   = attr->max_res_rd_atom;
+       resp->max_qp_init_rd_atom       = attr->max_qp_init_rd_atom;
+       resp->max_ee_init_rd_atom       = attr->max_ee_init_rd_atom;
+       resp->atomic_cap                = attr->atomic_cap;
+       resp->max_ee                    = attr->max_ee;
+       resp->max_rdd                   = attr->max_rdd;
+       resp->max_mw                    = attr->max_mw;
+       resp->max_raw_ipv6_qp           = attr->max_raw_ipv6_qp;
+       resp->max_raw_ethy_qp           = attr->max_raw_ethy_qp;
+       resp->max_mcast_grp             = attr->max_mcast_grp;
+       resp->max_mcast_qp_attach       = attr->max_mcast_qp_attach;
+       resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
+       resp->max_ah                    = attr->max_ah;
+       resp->max_fmr                   = attr->max_fmr;
+       resp->max_map_per_fmr           = attr->max_map_per_fmr;
+       resp->max_srq                   = attr->max_srq;
+       resp->max_srq_wr                = attr->max_srq_wr;
+       resp->max_srq_sge               = attr->max_srq_sge;
+       resp->max_pkeys                 = attr->max_pkeys;
+       resp->local_ca_ack_delay        = attr->local_ca_ack_delay;
+       resp->phys_port_cnt             = file->device->ib_dev->phys_port_cnt;
+}
+
 ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
                               const char __user *buf,
                               int in_len, int out_len)
@@ -398,47 +466,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
                return ret;
 
        memset(&resp, 0, sizeof resp);
-
-       resp.fw_ver                    = attr.fw_ver;
-       resp.node_guid                 = file->device->ib_dev->node_guid;
-       resp.sys_image_guid            = attr.sys_image_guid;
-       resp.max_mr_size               = attr.max_mr_size;
-       resp.page_size_cap             = attr.page_size_cap;
-       resp.vendor_id                 = attr.vendor_id;
-       resp.vendor_part_id            = attr.vendor_part_id;
-       resp.hw_ver                    = attr.hw_ver;
-       resp.max_qp                    = attr.max_qp;
-       resp.max_qp_wr                 = attr.max_qp_wr;
-       resp.device_cap_flags          = attr.device_cap_flags;
-       resp.max_sge                   = attr.max_sge;
-       resp.max_sge_rd                = attr.max_sge_rd;
-       resp.max_cq                    = attr.max_cq;
-       resp.max_cqe                   = attr.max_cqe;
-       resp.max_mr                    = attr.max_mr;
-       resp.max_pd                    = attr.max_pd;
-       resp.max_qp_rd_atom            = attr.max_qp_rd_atom;
-       resp.max_ee_rd_atom            = attr.max_ee_rd_atom;
-       resp.max_res_rd_atom           = attr.max_res_rd_atom;
-       resp.max_qp_init_rd_atom       = attr.max_qp_init_rd_atom;
-       resp.max_ee_init_rd_atom       = attr.max_ee_init_rd_atom;
-       resp.atomic_cap                = attr.atomic_cap;
-       resp.max_ee                    = attr.max_ee;
-       resp.max_rdd                   = attr.max_rdd;
-       resp.max_mw                    = attr.max_mw;
-       resp.max_raw_ipv6_qp           = attr.max_raw_ipv6_qp;
-       resp.max_raw_ethy_qp           = attr.max_raw_ethy_qp;
-       resp.max_mcast_grp             = attr.max_mcast_grp;
-       resp.max_mcast_qp_attach       = attr.max_mcast_qp_attach;
-       resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
-       resp.max_ah                    = attr.max_ah;
-       resp.max_fmr                   = attr.max_fmr;
-       resp.max_map_per_fmr           = attr.max_map_per_fmr;
-       resp.max_srq                   = attr.max_srq;
-       resp.max_srq_wr                = attr.max_srq_wr;
-       resp.max_srq_sge               = attr.max_srq_sge;
-       resp.max_pkeys                 = attr.max_pkeys;
-       resp.local_ca_ack_delay        = attr.local_ca_ack_delay;
-       resp.phys_port_cnt             = file->device->ib_dev->phys_port_cnt;
+       copy_query_dev_fields(file, &resp, &attr);
 
        if (copy_to_user((void __user *) (unsigned long) cmd.response,
                         &resp, sizeof resp))
@@ -947,6 +975,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
                goto err_free;
        }
 
+       if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
+               struct ib_device_attr attr;
+
+               ret = ib_query_device(pd->device, &attr);
+               if (ret || !(attr.device_cap_flags &
+                               IB_DEVICE_ON_DEMAND_PAGING)) {
+                       pr_debug("ODP support not available\n");
+                       ret = -EINVAL;
+                       goto err_put;
+               }
+       }
+
        mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
                                     cmd.access_flags, &udata);
        if (IS_ERR(mr)) {
@@ -3253,3 +3293,52 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 
        return ret ? ret : in_len;
 }
+
+int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+                             struct ib_udata *ucore,
+                             struct ib_udata *uhw)
+{
+       struct ib_uverbs_ex_query_device_resp resp;
+       struct ib_uverbs_ex_query_device  cmd;
+       struct ib_device_attr attr;
+       struct ib_device *device;
+       int err;
+
+       device = file->device->ib_dev;
+       if (ucore->inlen < sizeof(cmd))
+               return -EINVAL;
+
+       err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+       if (err)
+               return err;
+
+       if (cmd.reserved)
+               return -EINVAL;
+
+       err = device->query_device(device, &attr);
+       if (err)
+               return err;
+
+       memset(&resp, 0, sizeof(resp));
+       copy_query_dev_fields(file, &resp.base, &attr);
+       resp.comp_mask = 0;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) {
+               resp.odp_caps.general_caps = attr.odp_caps.general_caps;
+               resp.odp_caps.per_transport_caps.rc_odp_caps =
+                       attr.odp_caps.per_transport_caps.rc_odp_caps;
+               resp.odp_caps.per_transport_caps.uc_odp_caps =
+                       attr.odp_caps.per_transport_caps.uc_odp_caps;
+               resp.odp_caps.per_transport_caps.ud_odp_caps =
+                       attr.odp_caps.per_transport_caps.ud_odp_caps;
+               resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP;
+       }
+#endif
+
+       err = ib_copy_to_udata(ucore, &resp, sizeof(resp));
+       if (err)
+               return err;
+
+       return 0;
+}
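
From userspace, this extended command is what later libibverbs releases wrap as ibv_query_device_ex(); a hedged sketch of probing the returned ODP caps (the names below follow libibverbs and may postdate this merge):

#include <infiniband/verbs.h>

/* Returns nonzero if the device reports general ODP support. */
static int odp_supported(struct ibv_context *ctx)
{
	struct ibv_device_attr_ex attr;

	if (ibv_query_device_ex(ctx, NULL, &attr))
		return 0;
	return !!(attr.odp_caps.general_caps & IBV_ODP_SUPPORT);
}
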
index 71ab83fde47292e95315c2170b00fec6277b7861..e6c23b9eab336818fa785bae49f5c78c47221fbb 100644 (file)
@@ -122,7 +122,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
                                    struct ib_udata *ucore,
                                    struct ib_udata *uhw) = {
        [IB_USER_VERBS_EX_CMD_CREATE_FLOW]      = ib_uverbs_ex_create_flow,
-       [IB_USER_VERBS_EX_CMD_DESTROY_FLOW]     = ib_uverbs_ex_destroy_flow
+       [IB_USER_VERBS_EX_CMD_DESTROY_FLOW]     = ib_uverbs_ex_destroy_flow,
+       [IB_USER_VERBS_EX_CMD_QUERY_DEVICE]     = ib_uverbs_ex_query_device
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
@@ -296,6 +297,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
                kfree(uobj);
        }
 
+       put_pid(context->tgid);
+
        return context->device->dealloc_ucontext(context);
 }
 
index c2b89cc5dbcad5153ef3e3d072ea0c5e3d384caa..f93eb8da7b5ad443900c3b8b423da505d0531a95 100644 (file)
@@ -879,7 +879,8 @@ int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
                if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
                        rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac);
                        rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac);
-                       qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
+                       if (!(*qp_attr_mask & IB_QP_VID))
+                               qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
                } else {
                        ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid,
                                        qp_attr->ah_attr.dmac, &qp_attr->vlan_id);
index 2d5cbf4363e4de0758a4d7260341a136b3ba2544..bdf3507810cb767767cd8496ea510f9a14497f7c 100644 (file)
@@ -476,7 +476,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                         c2mr->umem->page_size,
                                         i,
                                         length,
-                                        c2mr->umem->offset,
+                                        ib_umem_offset(c2mr->umem),
                                         &kva,
                                         c2_convert_access(acc),
                                         c2mr);
index 4b8c6116c058247f4980fa6e06ec986700ef0922..9edc200b311d861a9faa3eb0c051c5c9c7fbf3cc 100644 (file)
@@ -1640,7 +1640,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
                __state_set(&ep->com, MPA_REQ_RCVD);
 
                /* drive upcall */
-               mutex_lock(&ep->parent_ep->com.mutex);
+               mutex_lock_nested(&ep->parent_ep->com.mutex,
+                                 SINGLE_DEPTH_NESTING);
                if (ep->parent_ep->com.state != DEAD) {
                        if (connect_request_upcall(ep))
                                abort_connection(ep, skb, GFP_KERNEL);
@@ -3126,6 +3127,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
                err = c4iw_wait_for_reply(&ep->com.dev->rdev,
                                          &ep->com.wr_wait,
                                          0, 0, __func__);
+       else if (err > 0)
+               err = net_xmit_errno(err);
        if (err)
                pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
                       err, ep->stid,
@@ -3159,6 +3162,8 @@ static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
                        err = c4iw_wait_for_reply(&ep->com.dev->rdev,
                                                  &ep->com.wr_wait,
                                                  0, 0, __func__);
+               else if (err > 0)
+                       err = net_xmit_errno(err);
        }
        if (err)
                pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n"
index 72f1f052e88c3886596004b16243c196a789c172..eb5df4e62703d8adb254a9a4f6163a90d40ebd18 100644 (file)
@@ -670,7 +670,7 @@ static int ep_open(struct inode *inode, struct file *file)
        idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
        spin_unlock_irq(&epd->devp->lock);
 
-       epd->bufsize = count * 160;
+       epd->bufsize = count * 240;
        epd->buf = vmalloc(epd->bufsize);
        if (!epd->buf) {
                ret = -ENOMEM;
index 0744455cd88b75f67a948be3f787f915ff8d7795..cb43c2299ac00b94ec4252a6074013ecf2730955 100644 (file)
@@ -50,6 +50,13 @@ static int inline_threshold = C4IW_INLINE_THRESHOLD;
 module_param(inline_threshold, int, 0644);
 MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)");
 
+static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length)
+{
+       return (is_t4(dev->rdev.lldi.adapter_type) ||
+               is_t5(dev->rdev.lldi.adapter_type)) &&
+               length >= 8*1024*1024*1024ULL;
+}
+
 static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
                                       u32 len, dma_addr_t data, int wait)
 {
@@ -369,9 +376,11 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
        int ret;
 
        ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
-                             FW_RI_STAG_NSMR, mhp->attr.perms,
+                             FW_RI_STAG_NSMR, mhp->attr.len ?
+                             mhp->attr.perms : 0,
                              mhp->attr.mw_bind_enable, mhp->attr.zbva,
-                             mhp->attr.va_fbo, mhp->attr.len, shift - 12,
+                             mhp->attr.va_fbo, mhp->attr.len ?
+                             mhp->attr.len : -1, shift - 12,
                              mhp->attr.pbl_size, mhp->attr.pbl_addr);
        if (ret)
                return ret;
@@ -536,6 +545,11 @@ int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask,
                        return ret;
        }
 
+       if (mr_exceeds_hw_limits(rhp, total_size)) {
+               kfree(page_list);
+               return -EINVAL;
+       }
+
        ret = reregister_mem(rhp, php, &mh, shift, npages);
        kfree(page_list);
        if (ret)
@@ -596,6 +610,12 @@ struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
        if (ret)
                goto err;
 
+       if (mr_exceeds_hw_limits(rhp, total_size)) {
+               kfree(page_list);
+               ret = -EINVAL;
+               goto err;
+       }
+
        ret = alloc_pbl(mhp, npages);
        if (ret) {
                kfree(page_list);
@@ -699,6 +719,10 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
        php = to_c4iw_pd(pd);
        rhp = php->rhp;
+
+       if (mr_exceeds_hw_limits(rhp, length))
+               return ERR_PTR(-EINVAL);
+
        mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
        if (!mhp)
                return ERR_PTR(-ENOMEM);
index 2ed3ece2b2ee38de0bf38525914ffb85cb7ed80d..bb85d479e66eec254fa5c389e9dc4cc335248bb4 100644 (file)
@@ -1538,9 +1538,9 @@ err:
        set_state(qhp, C4IW_QP_STATE_ERROR);
        free = 1;
        abort = 1;
-       wake_up(&qhp->wait);
        BUG_ON(!ep);
        flush_qp(qhp);
+       wake_up(&qhp->wait);
 out:
        mutex_unlock(&qhp->mutex);
 
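The qp.c hunk moves wake_up() below flush_qp() so a thread sleeping on qhp->wait observes a fully flushed QP when it resumes. A single-threaded toy of why the order matters (names invented):

    #include <stdio.h>

    static int flushed;

    static void flush_qp(void)    { flushed = 1; }
    static void wake_waiter(void) { printf("waiter sees flushed=%d\n", flushed); }

    int main(void)
    {
        /* old order: wake_waiter(); flush_qp();  -> waiter saw 0 */
        flush_qp();    /* new order: complete the flush first */
        wake_waiter(); /* -> waiter sees 1 */
        return 0;
    }
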
index 3488e8c9fcb44b89211e0650c47a87a2fd469e65..f914b30999f8d1519ee28edeb134762cf4796b52 100644 (file)
@@ -399,7 +399,7 @@ reg_user_mr_fallback:
        pginfo.num_kpages = num_kpages;
        pginfo.num_hwpages = num_hwpages;
        pginfo.u.usr.region = e_mr->umem;
-       pginfo.next_hwpage = e_mr->umem->offset / hwpage_size;
+       pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
        pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
        ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
                          e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
index 5e61e9bff697b6c3d837e4281ff5f4231306076b..c7278f6a8217798c3572366828491aa1c57d5b49 100644 (file)
@@ -214,7 +214,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        mr->mr.user_base = start;
        mr->mr.iova = virt_addr;
        mr->mr.length = length;
-       mr->mr.offset = umem->offset;
+       mr->mr.offset = ib_umem_offset(umem);
        mr->mr.access_flags = mr_access_flags;
        mr->mr.max_segs = n;
        mr->umem = umem;
index 8f9325cfc85d3638dbb407573e30d3388cf3e934..c36ccbd9a644f16e85cdcb4a323cbf1803683292 100644 (file)
@@ -223,7 +223,6 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 
        if (flags & IB_MR_REREG_TRANS) {
                int shift;
-               int err;
                int n;
 
                mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
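
The mlx4 hunk removes an inner "int err" that shadowed the function-scope variable, so a failure detected inside the IB_MR_REREG_TRANS block now reaches the caller instead of being silently dropped. A toy of the bug class:

    #include <stdio.h>

    static int outer(void)
    {
        int err = 0;

        {
            /* int err;       <- the deleted shadow declaration */
            err = -22;        /* now visible to the caller */
        }
        return err;
    }

    int main(void)
    {
        printf("err = %d\n", outer()); /* -22; with the shadow: 0 */
        return 0;
    }
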
index 4ea0135af48461878b5bc7d3da5cbd2065f1a4e6..27a70159e2ea8813ad7cbffc4913b3561897af34 100644 (file)
@@ -1,3 +1,4 @@
 obj-$(CONFIG_MLX5_INFINIBAND)  += mlx5_ib.o
 
 mlx5_ib-y :=   main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o
+mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
index 1ba6c42e4df80cef8bddce8dfd0bc9525e84123a..8a87404e9c76e763709e478b17b00485f38e678f 100644 (file)
@@ -244,6 +244,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
                                           props->max_mcast_grp;
        props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
+               props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
+       props->odp_caps = dev->odp_caps;
+#endif
+
 out:
        kfree(in_mad);
        kfree(out_mad);
@@ -568,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
                        goto out_count;
        }
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
+#endif
+
        INIT_LIST_HEAD(&context->db_page_list);
        mutex_init(&context->db_page_mutex);
 
@@ -858,7 +868,7 @@ static ssize_t show_reg_pages(struct device *device,
        struct mlx5_ib_dev *dev =
                container_of(device, struct mlx5_ib_dev, ib_dev.dev);
 
-       return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages);
+       return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
 }
 
 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
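
show_reg_pages() switches to atomic_read() because reg_pages becomes an atomic_t in this series, replacing the mr_lock spinlock that later hunks delete. A kernel-style sketch of the resulting pattern (not standalone):

    #include <linux/atomic.h>

    static atomic_t reg_pages = ATOMIC_INIT(0);

    /* Registration paths update the counter without any extra lock. */
    static void mr_registered(int npages)   { atomic_add(npages, &reg_pages); }
    static void mr_deregistered(int npages) { atomic_sub(npages, &reg_pages); }
    static int  reg_pages_now(void)         { return atomic_read(&reg_pages); }
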
@@ -1321,6 +1331,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
                (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
                (1ull << IB_USER_VERBS_CMD_OPEN_QP);
+       dev->ib_dev.uverbs_ex_cmd_mask =
+               (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
 
        dev->ib_dev.query_device        = mlx5_ib_query_device;
        dev->ib_dev.query_port          = mlx5_ib_query_port;
@@ -1366,6 +1378,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
        dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
        dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
 
+       mlx5_ib_internal_query_odp_caps(dev);
+
        if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) {
                dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
                dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
@@ -1379,16 +1393,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                goto err_eqs;
 
        mutex_init(&dev->cap_mask_mutex);
-       spin_lock_init(&dev->mr_lock);
 
        err = create_dev_resources(&dev->devr);
        if (err)
                goto err_eqs;
 
-       err = ib_register_device(&dev->ib_dev, NULL);
+       err = mlx5_ib_odp_init_one(dev);
        if (err)
                goto err_rsrc;
 
+       err = ib_register_device(&dev->ib_dev, NULL);
+       if (err)
+               goto err_odp;
+
        err = create_umr_res(dev);
        if (err)
                goto err_dev;
@@ -1410,6 +1427,9 @@ err_umrc:
 err_dev:
        ib_unregister_device(&dev->ib_dev);
 
+err_odp:
+       mlx5_ib_odp_remove_one(dev);
+
 err_rsrc:
        destroy_dev_resources(&dev->devr);
 
@@ -1425,8 +1445,10 @@ err_dealloc:
 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 {
        struct mlx5_ib_dev *dev = context;
+
        ib_unregister_device(&dev->ib_dev);
        destroy_umrc_res(dev);
+       mlx5_ib_odp_remove_one(dev);
        destroy_dev_resources(&dev->devr);
        free_comp_eqs(dev);
        ib_dealloc_device(&dev->ib_dev);
@@ -1440,15 +1462,30 @@ static struct mlx5_interface mlx5_ib_interface = {
 
 static int __init mlx5_ib_init(void)
 {
+       int err;
+
        if (deprecated_prof_sel != 2)
                pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
 
-       return mlx5_register_interface(&mlx5_ib_interface);
+       err = mlx5_ib_odp_init();
+       if (err)
+               return err;
+
+       err = mlx5_register_interface(&mlx5_ib_interface);
+       if (err)
+               goto clean_odp;
+
+       return err;
+
+clean_odp:
+       mlx5_ib_odp_cleanup();
+       return err;
 }
 
 static void __exit mlx5_ib_cleanup(void)
 {
        mlx5_unregister_interface(&mlx5_ib_interface);
+       mlx5_ib_odp_cleanup();
 }
 
 module_init(mlx5_ib_init);
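
Taken together, the main.c hunks enforce an ordering: ODP state is set up before ib_register_device() makes the device visible, and is torn down in reverse on both the error path and in mlx5_ib_remove(). A runnable toy with the same goto-unwind shape (stubs invented; register_dev() fails on purpose to exercise the unwind):

    #include <stdio.h>

    static int odp_init(void)     { puts("odp init");    return 0; }
    static void odp_cleanup(void) { puts("odp cleanup"); }
    static int register_dev(void) { puts("register");    return -1; }

    static int dev_init(void)
    {
        int err = odp_init();       /* before the device is visible */

        if (err)
            return err;

        err = register_dev();
        if (err)
            goto err_odp;

        return 0;

    err_odp:
        odp_cleanup();              /* unwind in reverse order */
        return err;
    }

    int main(void)
    {
        printf("dev_init -> %d\n", dev_init());
        return 0;
    }
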
index dae07eae95073b8d08b42de972e62600560d17e3..b56e4c5593ee92f3fe69d542c77919ee93e3cfc7 100644 (file)
@@ -32,6 +32,7 @@
 
 #include <linux/module.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
 #include "mlx5_ib.h"
 
 /* @umem: umem object to scan
@@ -57,6 +58,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
        int entry;
        unsigned long page_shift = ilog2(umem->page_size);
 
+       /* With ODP we must always match OS page size. */
+       if (umem->odp_data) {
+               *count = ib_umem_page_count(umem);
+               *shift = PAGE_SHIFT;
+               *ncont = *count;
+               if (order)
+                       *order = ilog2(roundup_pow_of_two(*count));
+
+               return;
+       }
+
        addr = addr >> page_shift;
        tmp = (unsigned long)addr;
        m = find_first_bit(&tmp, sizeof(tmp));
@@ -108,8 +120,36 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
        *count = i;
 }
 
-void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
-                         int page_shift, __be64 *pas, int umr)
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
+{
+       u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
+
+       if (umem_dma & ODP_READ_ALLOWED_BIT)
+               mtt_entry |= MLX5_IB_MTT_READ;
+       if (umem_dma & ODP_WRITE_ALLOWED_BIT)
+               mtt_entry |= MLX5_IB_MTT_WRITE;
+
+       return mtt_entry;
+}
+#endif
+
+/*
+ * Populate the given array with bus addresses from the umem.
+ *
+ * dev - mlx5_ib device
+ * umem - umem to use to fill the pages
+ * page_shift - determines the page size used in the resulting array
+ * offset - offset into the umem to start from,
+ *          only implemented for ODP umems
+ * num_pages - total number of pages to fill
+ * pas - bus addresses array to fill
+ * access_flags - access flags to set on all present pages.
+ *                use enum mlx5_ib_mtt_access_flags for this.
+ */
+void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+                           int page_shift, size_t offset, size_t num_pages,
+                           __be64 *pas, int access_flags)
 {
        unsigned long umem_page_shift = ilog2(umem->page_size);
        int shift = page_shift - umem_page_shift;
@@ -120,6 +160,21 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
        int len;
        struct scatterlist *sg;
        int entry;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       const bool odp = umem->odp_data != NULL;
+
+       if (odp) {
+               WARN_ON(shift != 0);
+               WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
+
+               for (i = 0; i < num_pages; ++i) {
+                       dma_addr_t pa = umem->odp_data->dma_list[offset + i];
+
+                       pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+               }
+               return;
+       }
+#endif
 
        i = 0;
        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
@@ -128,8 +183,7 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
                for (k = 0; k < len; k++) {
                        if (!(i & mask)) {
                                cur = base + (k << umem_page_shift);
-                               if (umr)
-                                       cur |= 3;
+                               cur |= access_flags;
 
                                pas[i >> shift] = cpu_to_be64(cur);
                                mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
@@ -142,6 +196,13 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
        }
 }
 
+void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+                         int page_shift, __be64 *pas, int access_flags)
+{
+       return __mlx5_ib_populate_pas(dev, umem, page_shift, 0,
+                                     ib_umem_num_pages(umem), pas,
+                                     access_flags);
+}
 int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
 {
        u64 page_size;
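
__mlx5_ib_populate_pas() relies on umem_dma_to_mtt(): ODP stores per-page permission bits in the low bits of each dma_list entry, and they are re-encoded as MTT access flags for the HCA. A runnable sketch; the bit positions are assumptions here, the real values live in rdma/ib_umem_odp.h and the mlx5 headers:

    #include <stdio.h>
    #include <stdint.h>

    #define ODP_READ_ALLOWED_BIT  (1ULL << 0)   /* assumed bit positions */
    #define ODP_WRITE_ALLOWED_BIT (1ULL << 1)
    #define ODP_DMA_ADDR_MASK     (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
    #define MTT_READ  (1ULL << 0)               /* MLX5_IB_MTT_READ stand-in */
    #define MTT_WRITE (1ULL << 1)

    static uint64_t dma_to_mtt(uint64_t dma)
    {
        uint64_t mtt = dma & ODP_DMA_ADDR_MASK; /* strip permission bits */

        if (dma & ODP_READ_ALLOWED_BIT)
            mtt |= MTT_READ;
        if (dma & ODP_WRITE_ALLOWED_BIT)
            mtt |= MTT_WRITE;
        return mtt;
    }

    int main(void)
    {
        uint64_t dma = 0x1000 | ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT;

        printf("mtt = 0x%llx\n", (unsigned long long)dma_to_mtt(dma)); /* 0x1003 */
        return 0;
    }
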
index 386780f0d1e18304927d8a3c42705c4be916b731..83f22fe297c8ac522efa80001819990cdc31508b 100644 (file)
@@ -111,6 +111,8 @@ struct mlx5_ib_pd {
  */
 
 #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START
+#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1)
+#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2)
 #define MLX5_IB_QPT_REG_UMR    IB_QPT_RESERVED1
 #define MLX5_IB_WR_UMR         IB_WR_RESERVED1
 
@@ -147,6 +149,29 @@ enum {
        MLX5_QP_EMPTY
 };
 
+/*
+ * Connect-IB can trigger up to four concurrent pagefaults
+ * per-QP.
+ */
+enum mlx5_ib_pagefault_context {
+       MLX5_IB_PAGEFAULT_RESPONDER_READ,
+       MLX5_IB_PAGEFAULT_REQUESTOR_READ,
+       MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
+       MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
+       MLX5_IB_PAGEFAULT_CONTEXTS
+};
+
+static inline enum mlx5_ib_pagefault_context
+       mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
+{
+       return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
+}
+
+struct mlx5_ib_pfault {
+       struct work_struct      work;
+       struct mlx5_pagefault   mpfault;
+};
+
 struct mlx5_ib_qp {
        struct ib_qp            ibqp;
        struct mlx5_core_qp     mqp;
@@ -192,6 +217,21 @@ struct mlx5_ib_qp {
 
        /* Store signature errors */
        bool                    signature_en;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       /*
+        * A flag that is true for QP's that are in a state that doesn't
+        * allow page faults, and shouldn't schedule any more faults.
+        */
+       int                     disable_page_faults;
+       /*
+        * The disable_page_faults_lock protects a QP's disable_page_faults
+        * field, allowing for a thread to atomically check whether the QP
+        * allows page faults, and if so schedule a page fault.
+        */
+       spinlock_t              disable_page_faults_lock;
+       struct mlx5_ib_pfault   pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
+#endif
 };
 
 struct mlx5_ib_cq_buf {
@@ -206,6 +246,19 @@ enum mlx5_ib_qp_flags {
        MLX5_IB_QP_SIGNATURE_HANDLING           = 1 << 1,
 };
 
+struct mlx5_umr_wr {
+       union {
+               u64                     virt_addr;
+               u64                     offset;
+       } target;
+       struct ib_pd                   *pd;
+       unsigned int                    page_shift;
+       unsigned int                    npages;
+       u32                             length;
+       int                             access_flags;
+       u32                             mkey;
+};
+
 struct mlx5_shared_mr_info {
        int mr_id;
        struct ib_umem          *umem;
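
struct mlx5_umr_wr is never passed by its own type: mr.c below writes it through a cast of &wr->wr.fast_reg, overlaying it on ib_send_wr's union space. That is sound only while the private struct fits inside the union (and because the kernel builds with -fno-strict-aliasing). A toy with invented types:

    #include <stdio.h>

    struct generic_wr {                     /* stand-in for ib_send_wr */
        int opcode;
        union {
            struct { unsigned long a, b, c, d, e, f; } fast_reg;
        } wr;
    };

    struct private_wr {                     /* stand-in for mlx5_umr_wr */
        unsigned long target;
        unsigned long mkey;
    };

    int main(void)
    {
        struct generic_wr wr = { 0 };
        struct private_wr *p = (struct private_wr *)&wr.wr.fast_reg;

        p->mkey = 0x1234;                   /* written through the overlay */
        printf("fits: %d, mkey: 0x%lx\n",
               (int)(sizeof(*p) <= sizeof(wr.wr)), p->mkey);
        return 0;
    }
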
@@ -253,6 +306,13 @@ struct mlx5_ib_xrcd {
        u32                     xrcdn;
 };
 
+enum mlx5_ib_mtt_access_flags {
+       MLX5_IB_MTT_READ  = (1 << 0),
+       MLX5_IB_MTT_WRITE = (1 << 1),
+};
+
+#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
+
 struct mlx5_ib_mr {
        struct ib_mr            ibmr;
        struct mlx5_core_mr     mmr;
@@ -261,12 +321,11 @@ struct mlx5_ib_mr {
        struct list_head        list;
        int                     order;
        int                     umred;
-       __be64                  *pas;
-       dma_addr_t              dma;
        int                     npages;
        struct mlx5_ib_dev     *dev;
        struct mlx5_create_mkey_mbox_out out;
        struct mlx5_core_sig_ctx    *sig;
+       int                     live;
 };
 
 struct mlx5_ib_fast_reg_page_list {
@@ -372,11 +431,18 @@ struct mlx5_ib_dev {
        struct umr_common               umrc;
        /* sync used page count stats
         */
-       spinlock_t                      mr_lock;
        struct mlx5_ib_resources        devr;
        struct mlx5_mr_cache            cache;
        struct timer_list               delay_timer;
        int                             fill_delay;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       struct ib_odp_caps      odp_caps;
+       /*
+        * Sleepable RCU that prevents destruction of MRs while they are still
+        * being used by a page fault handler.
+        */
+       struct srcu_struct      mr_srcu;
+#endif
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -490,6 +556,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
                      struct ib_recv_wr **bad_wr);
 void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
+int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
+                         void *buffer, u32 length);
 struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
                                int vector, struct ib_ucontext *context,
                                struct ib_udata *udata);
@@ -502,6 +570,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                  u64 virt_addr, int access_flags,
                                  struct ib_udata *udata);
+int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
+                      int npages, int zap);
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
 int mlx5_ib_destroy_mr(struct ib_mr *ibmr);
 struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
@@ -533,8 +603,11 @@ int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev);
 void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev);
 void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
                        int *ncont, int *order);
+void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+                           int page_shift, size_t offset, size_t num_pages,
+                           __be64 *pas, int access_flags);
 void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
-                         int page_shift, __be64 *pas, int umr);
+                         int page_shift, __be64 *pas, int access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
@@ -544,6 +617,38 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context);
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
                            struct ib_mr_status *mr_status);
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+extern struct workqueue_struct *mlx5_ib_page_fault_wq;
+
+int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+                              struct mlx5_ib_pfault *pfault);
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
+int __init mlx5_ib_odp_init(void);
+void mlx5_ib_odp_cleanup(void);
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+                             unsigned long end);
+
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
+{
+       return 0;
+}
+
+static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)                {}
+static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
+static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)   {}
+static inline int mlx5_ib_odp_init(void) { return 0; }
+static inline void mlx5_ib_odp_cleanup(void)                           {}
+static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
+static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)  {}
+
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
 static inline void init_query_mad(struct ib_smp *mad)
 {
        mad->base_version  = 1;
@@ -561,4 +666,7 @@ static inline u8 convert_access(int acc)
               MLX5_PERM_LOCAL_READ;
 }
 
+#define MLX5_MAX_UMR_SHIFT 16
+#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
+
 #endif /* MLX5_IB_H */
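
mlx5_ib_get_pagefault_context() works only because the enum above is ordered so that the two relevant bits of pfault->flags form the array index directly; this presumes MLX5_PFAULT_REQUESTOR and MLX5_PFAULT_WRITE are bits 0 and 1, which is an assumption in the sketch below:

    #include <stdio.h>

    #define PFAULT_REQUESTOR 1   /* assumed: bit 0 */
    #define PFAULT_WRITE     2   /* assumed: bit 1 */

    enum pagefault_context {
        RESPONDER_READ,          /* 0: !requestor, !write */
        REQUESTOR_READ,          /* 1:  requestor, !write */
        RESPONDER_WRITE,         /* 2: !requestor,  write */
        REQUESTOR_WRITE,         /* 3:  requestor,  write */
        PAGEFAULT_CONTEXTS
    };

    static enum pagefault_context get_context(int flags)
    {
        /* the two flag bits are the array index, as in
         * mlx5_ib_get_pagefault_context() */
        return flags & (PFAULT_REQUESTOR | PFAULT_WRITE);
    }

    int main(void)
    {
        printf("%d\n", get_context(PFAULT_REQUESTOR | PFAULT_WRITE)); /* 3 */
        return 0;
    }
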
index 5a80dd9937612f5804d64135838ff389570e6b6e..32a28bd50b20ae08c41a9a7086b72045f7934c3d 100644 (file)
 #include <linux/export.h>
 #include <linux/delay.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+#include <rdma/ib_verbs.h>
 #include "mlx5_ib.h"
 
 enum {
        MAX_PENDING_REG_MR = 8,
 };
 
-enum {
-       MLX5_UMR_ALIGN  = 2048
-};
+#define MLX5_UMR_ALIGN 2048
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+static __be64 mlx5_ib_update_mtt_emergency_buffer[
+               MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
+       __aligned(MLX5_UMR_ALIGN);
+static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
+#endif
+
+static int clean_mr(struct mlx5_ib_mr *mr);
 
-static __be64 *mr_align(__be64 *ptr, int align)
+static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-       unsigned long mask = align - 1;
+       int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
 
-       return (__be64 *)(((unsigned long)ptr + mask) & ~mask);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       /* Wait until all page fault handlers using the mr complete. */
+       synchronize_srcu(&dev->mr_srcu);
+#endif
+
+       return err;
 }
 
 static int order2idx(struct mlx5_ib_dev *dev, int order)
@@ -146,7 +159,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
                mr->order = ent->order;
                mr->umred = 1;
                mr->dev = dev;
-               in->seg.status = 1 << 6;
+               in->seg.status = MLX5_MKEY_STATUS_FREE;
                in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
                in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
                in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
@@ -191,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
                ent->cur--;
                ent->size--;
                spin_unlock_irq(&ent->lock);
-               err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+               err = destroy_mkey(dev, mr);
                if (err)
                        mlx5_ib_warn(dev, "failed destroy mkey\n");
                else
@@ -482,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
                ent->cur--;
                ent->size--;
                spin_unlock_irq(&ent->lock);
-               err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+               err = destroy_mkey(dev, mr);
                if (err)
                        mlx5_ib_warn(dev, "failed destroy mkey\n");
                else
@@ -668,7 +681,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size)
 
 static int use_umr(int order)
 {
-       return order <= 17;
+       return order <= MLX5_MAX_UMR_SHIFT;
 }
 
 static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
@@ -678,6 +691,7 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct ib_mr *mr = dev->umrc.mr;
+       struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
 
        sg->addr = dma;
        sg->length = ALIGN(sizeof(u64) * n, 64);
@@ -692,21 +706,24 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
                wr->num_sge = 0;
 
        wr->opcode = MLX5_IB_WR_UMR;
-       wr->wr.fast_reg.page_list_len = n;
-       wr->wr.fast_reg.page_shift = page_shift;
-       wr->wr.fast_reg.rkey = key;
-       wr->wr.fast_reg.iova_start = virt_addr;
-       wr->wr.fast_reg.length = len;
-       wr->wr.fast_reg.access_flags = access_flags;
-       wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd;
+
+       umrwr->npages = n;
+       umrwr->page_shift = page_shift;
+       umrwr->mkey = key;
+       umrwr->target.virt_addr = virt_addr;
+       umrwr->length = len;
+       umrwr->access_flags = access_flags;
+       umrwr->pd = pd;
 }
 
 static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
                               struct ib_send_wr *wr, u32 key)
 {
-       wr->send_flags = MLX5_IB_SEND_UMR_UNREG;
+       struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
+
+       wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
        wr->opcode = MLX5_IB_WR_UMR;
-       wr->wr.fast_reg.rkey = key;
+       umrwr->mkey = key;
 }
 
 void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)
@@ -742,7 +759,10 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
        struct ib_send_wr wr, *bad;
        struct mlx5_ib_mr *mr;
        struct ib_sge sg;
-       int size = sizeof(u64) * npages;
+       int size;
+       __be64 *mr_pas;
+       __be64 *pas;
+       dma_addr_t dma;
        int err = 0;
        int i;
 
@@ -761,25 +781,31 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
        if (!mr)
                return ERR_PTR(-EAGAIN);
 
-       mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
-       if (!mr->pas) {
+       /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
+        * To avoid copying garbage after the pas array, we allocate
+        * a little more. */
+       size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
+       mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
+       if (!mr_pas) {
                err = -ENOMEM;
                goto free_mr;
        }
 
-       mlx5_ib_populate_pas(dev, umem, page_shift,
-                            mr_align(mr->pas, MLX5_UMR_ALIGN), 1);
+       pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN);
+       mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
+       /* Clear padding after the actual pages. */
+       memset(pas + npages, 0, size - npages * sizeof(u64));
 
-       mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size,
-                                DMA_TO_DEVICE);
-       if (dma_mapping_error(ddev, mr->dma)) {
+       dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
+       if (dma_mapping_error(ddev, dma)) {
                err = -ENOMEM;
                goto free_pas;
        }
 
        memset(&wr, 0, sizeof(wr));
        wr.wr_id = (u64)(unsigned long)&umr_context;
-       prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags);
+       prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift,
+                        virt_addr, len, access_flags);
 
        mlx5_ib_init_umr_context(&umr_context);
        down(&umrc->sem);
@@ -799,12 +825,14 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
        mr->mmr.size = len;
        mr->mmr.pd = to_mpd(pd)->pdn;
 
+       mr->live = 1;
+
 unmap_dma:
        up(&umrc->sem);
-       dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
+       dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
 
 free_pas:
-       kfree(mr->pas);
+       kfree(mr_pas);
 
 free_mr:
        if (err) {
@@ -815,6 +843,128 @@ free_mr:
        return mr;
 }
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
+                      int zap)
+{
+       struct mlx5_ib_dev *dev = mr->dev;
+       struct device *ddev = dev->ib_dev.dma_device;
+       struct umr_common *umrc = &dev->umrc;
+       struct mlx5_ib_umr_context umr_context;
+       struct ib_umem *umem = mr->umem;
+       int size;
+       __be64 *pas;
+       dma_addr_t dma;
+       struct ib_send_wr wr, *bad;
+       struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg;
+       struct ib_sge sg;
+       int err = 0;
+       const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
+       const int page_index_mask = page_index_alignment - 1;
+       size_t pages_mapped = 0;
+       size_t pages_to_map = 0;
+       size_t pages_iter = 0;
+       int use_emergency_buf = 0;
+
+       /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
+        * so we need to align the offset and length accordingly */
+       if (start_page_index & page_index_mask) {
+               npages += start_page_index & page_index_mask;
+               start_page_index &= ~page_index_mask;
+       }
+
+       pages_to_map = ALIGN(npages, page_index_alignment);
+
+       if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
+               return -EINVAL;
+
+       size = sizeof(u64) * pages_to_map;
+       size = min_t(int, PAGE_SIZE, size);
+       /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
+        * code, when we are called from an invalidation. The pas buffer must
+        * be 2k-aligned for Connect-IB. */
+       pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
+       if (!pas) {
+               mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
+               pas = mlx5_ib_update_mtt_emergency_buffer;
+               size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
+               use_emergency_buf = 1;
+               mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
+               memset(pas, 0, size);
+       }
+       pages_iter = size / sizeof(u64);
+       dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
+       if (dma_mapping_error(ddev, dma)) {
+               mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
+               err = -ENOMEM;
+               goto free_pas;
+       }
+
+       for (pages_mapped = 0;
+            pages_mapped < pages_to_map && !err;
+            pages_mapped += pages_iter, start_page_index += pages_iter) {
+               dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
+
+               npages = min_t(size_t,
+                              pages_iter,
+                              ib_umem_num_pages(umem) - start_page_index);
+
+               if (!zap) {
+                       __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
+                                              start_page_index, npages, pas,
+                                              MLX5_IB_MTT_PRESENT);
+                       /* Clear padding after the pages brought from the
+                        * umem. */
+                       memset(pas + npages, 0, size - npages * sizeof(u64));
+               }
+
+               dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
+
+               memset(&wr, 0, sizeof(wr));
+               wr.wr_id = (u64)(unsigned long)&umr_context;
+
+               sg.addr = dma;
+               sg.length = ALIGN(npages * sizeof(u64),
+                               MLX5_UMR_MTT_ALIGNMENT);
+               sg.lkey = dev->umrc.mr->lkey;
+
+               wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
+                               MLX5_IB_SEND_UMR_UPDATE_MTT;
+               wr.sg_list = &sg;
+               wr.num_sge = 1;
+               wr.opcode = MLX5_IB_WR_UMR;
+               umrwr->npages = sg.length / sizeof(u64);
+               umrwr->page_shift = PAGE_SHIFT;
+               umrwr->mkey = mr->mmr.key;
+               umrwr->target.offset = start_page_index;
+
+               mlx5_ib_init_umr_context(&umr_context);
+               down(&umrc->sem);
+               err = ib_post_send(umrc->qp, &wr, &bad);
+               if (err) {
+                       mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
+               } else {
+                       wait_for_completion(&umr_context.done);
+                       if (umr_context.status != IB_WC_SUCCESS) {
+                               mlx5_ib_err(dev, "UMR completion failed, code %d\n",
+                                           umr_context.status);
+                               err = -EFAULT;
+                       }
+               }
+               up(&umrc->sem);
+       }
+       dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
+
+free_pas:
+       if (!use_emergency_buf)
+               free_page((unsigned long)pas);
+       else
+               mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
+
+       return err;
+}
+#endif
+
 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
                                     u64 length, struct ib_umem *umem,
                                     int npages, int page_shift,
@@ -825,6 +975,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
        struct mlx5_ib_mr *mr;
        int inlen;
        int err;
+       bool pg_cap = !!(dev->mdev->caps.gen.flags &
+                        MLX5_DEV_CAP_FLAG_ON_DMND_PG);
 
        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
        if (!mr)
@@ -836,8 +988,12 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
                err = -ENOMEM;
                goto err_1;
        }
-       mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0);
+       mlx5_ib_populate_pas(dev, umem, page_shift, in->pas,
+                            pg_cap ? MLX5_IB_MTT_PRESENT : 0);
 
+       /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags
+        * in the page list submitted with the command. */
+       in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0;
        in->seg.flags = convert_access(access_flags) |
                MLX5_ACCESS_MODE_MTT;
        in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
@@ -856,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
                goto err_2;
        }
        mr->umem = umem;
+       mr->live = 1;
        kvfree(in);
 
        mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
@@ -910,6 +1067,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                        mlx5_ib_dbg(dev, "cache empty for order %d", order);
                        mr = NULL;
                }
+       } else if (access_flags & IB_ACCESS_ON_DEMAND) {
+               err = -EINVAL;
+               pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
+               goto error;
        }
 
        if (!mr)
@@ -925,16 +1086,51 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
        mr->umem = umem;
        mr->npages = npages;
-       spin_lock(&dev->mr_lock);
-       dev->mdev->priv.reg_pages += npages;
-       spin_unlock(&dev->mr_lock);
+       atomic_add(npages, &dev->mdev->priv.reg_pages);
        mr->ibmr.lkey = mr->mmr.key;
        mr->ibmr.rkey = mr->mmr.key;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       if (umem->odp_data) {
+               /*
+                * This barrier prevents the compiler from moving the
+                * setting of umem->odp_data->private to point to our
+                * MR, before reg_umr finished, to ensure that the MR
+                * initialization have finished before starting to
+                * handle invalidations.
+                */
+               smp_wmb();
+               mr->umem->odp_data->private = mr;
+               /*
+                * Make sure we will see the new
+                * umem->odp_data->private value in the invalidation
+                * routines, before we can get page faults on the
+                * MR. Page faults can happen once we put the MR in
+                * the tree, below this line. Without the barrier,
+                * there can be a fault handling and an invalidation
+                * before umem->odp_data->private == mr is visible to
+                * the invalidation handler.
+                */
+               smp_wmb();
+       }
+#endif
+
        return &mr->ibmr;
 
 error:
+       /*
+        * Destroy the umem *before* destroying the MR, to ensure we
+        * will not have any in-flight notifiers when destroying the
+        * MR.
+        *
+        * As the MR is completely invalid to begin with, and this
+        * error path is only taken if we can't push the mr entry into
+        * the pagefault tree, this is safe.
+        */
+
        ib_umem_release(umem);
+       /* Kill the MR, and return an error code. */
+       clean_mr(mr);
        return ERR_PTR(err);
 }
 
@@ -971,17 +1167,14 @@ error:
        return err;
 }
 
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+static int clean_mr(struct mlx5_ib_mr *mr)
 {
-       struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
-       struct mlx5_ib_mr *mr = to_mmr(ibmr);
-       struct ib_umem *umem = mr->umem;
-       int npages = mr->npages;
+       struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
        int umred = mr->umred;
        int err;
 
        if (!umred) {
-               err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+               err = destroy_mkey(dev, mr);
                if (err) {
                        mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
                                     mr->mmr.key, err);
@@ -996,15 +1189,47 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
                free_cached_mr(dev, mr);
        }
 
-       if (umem) {
+       if (!umred)
+               kfree(mr);
+
+       return 0;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+{
+       struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+       struct mlx5_ib_mr *mr = to_mmr(ibmr);
+       int npages = mr->npages;
+       struct ib_umem *umem = mr->umem;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       if (umem && umem->odp_data) {
+               /* Prevent new page faults from succeeding */
+               mr->live = 0;
+               /* Wait for all running page-fault handlers to finish. */
+               synchronize_srcu(&dev->mr_srcu);
+               /* Destroy all page mappings */
+               mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+                                        ib_umem_end(umem));
+               /*
+                * We kill the umem before the MR for ODP,
+                * so that there will not be any invalidations in
+                * flight, looking at the *mr struct.
+                */
                ib_umem_release(umem);
-               spin_lock(&dev->mr_lock);
-               dev->mdev->priv.reg_pages -= npages;
-               spin_unlock(&dev->mr_lock);
+               atomic_sub(npages, &dev->mdev->priv.reg_pages);
+
+               /* Avoid double-freeing the umem. */
+               umem = NULL;
        }
+#endif
 
-       if (!umred)
-               kfree(mr);
+       clean_mr(mr);
+
+       if (umem) {
+               ib_umem_release(umem);
+               atomic_sub(npages, &dev->mdev->priv.reg_pages);
+       }
 
        return 0;
 }
@@ -1028,7 +1253,7 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
                goto err_free;
        }
 
-       in->seg.status = 1 << 6; /* free */
+       in->seg.status = MLX5_MKEY_STATUS_FREE;
        in->seg.xlt_oct_size = cpu_to_be32(ndescs);
        in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
        in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
@@ -1113,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
                kfree(mr->sig);
        }
 
-       err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+       err = destroy_mkey(dev, mr);
        if (err) {
                mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
                             mr->mmr.key, err);
@@ -1143,7 +1368,7 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
                goto err_free;
        }
 
-       in->seg.status = 1 << 6; /* free */
+       in->seg.status = MLX5_MKEY_STATUS_FREE;
        in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
        in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
        in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
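
The recurring destroy_mkey()/mr->live changes in mr.c implement a sleepable-RCU handshake with the page-fault path in odp.c below: fault handlers run under srcu_read_lock(&dev->mr_srcu) and check mr->live, while teardown clears live and then synchronize_srcu() waits out any handler already inside. A kernel-style sketch (not standalone):

    #include <linux/srcu.h>
    #include <linux/errno.h>

    struct mr_example { int live; };

    DEFINE_SRCU(mr_srcu);

    static int fault_handler(struct mr_example *mr)
    {
        int idx = srcu_read_lock(&mr_srcu);
        int ret = mr->live ? 0 : -EFAULT;  /* dead MRs fail the fault */

        srcu_read_unlock(&mr_srcu, idx);
        return ret;
    }

    static void destroy_mr(struct mr_example *mr)
    {
        mr->live = 0;               /* stop new faults from succeeding */
        synchronize_srcu(&mr_srcu); /* wait out handlers already inside */
        /* now safe to tear down HW and SW state */
    }
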
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
new file mode 100644 (file)
index 0000000..a2c541c
--- /dev/null
@@ -0,0 +1,798 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+
+#include "mlx5_ib.h"
+
+#define MAX_PREFETCH_LEN (4*1024*1024U)
+
+/* Timeout in ms to wait for an active mmu notifier to complete when handling
+ * a pagefault. */
+#define MMU_NOTIFIER_TIMEOUT 1000
+
+struct workqueue_struct *mlx5_ib_page_fault_wq;
+
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+                             unsigned long end)
+{
+       struct mlx5_ib_mr *mr;
+       const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+       u64 idx = 0, blk_start_idx = 0;
+       int in_block = 0;
+       u64 addr;
+
+       if (!umem || !umem->odp_data) {
+               pr_err("invalidation called on NULL umem or non-ODP umem\n");
+               return;
+       }
+
+       mr = umem->odp_data->private;
+
+       if (!mr || !mr->ibmr.pd)
+               return;
+
+       start = max_t(u64, ib_umem_start(umem), start);
+       end = min_t(u64, ib_umem_end(umem), end);
+
+       /*
+        * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
+        * while we are doing the invalidation, no page fault will attempt to
+        * overwrite the same MTTs.  Concurrent invalidations might race us,
+        * but they will write 0s as well, so no difference in the end result.
+        */
+
+       for (addr = start; addr < end; addr += (u64)umem->page_size) {
+               idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+               /*
+                * Strive to write the MTTs in chunks, but avoid overwriting
+                * non-existing MTTs. The heuristic here can be improved to
+                * estimate the cost of another UMR vs. the cost of bigger
+                * UMR.
+                */
+               if (umem->odp_data->dma_list[idx] &
+                   (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+                       if (!in_block) {
+                               blk_start_idx = idx;
+                               in_block = 1;
+                       }
+               } else {
+                       u64 umr_offset = idx & umr_block_mask;
+
+                       if (in_block && umr_offset == 0) {
+                               mlx5_ib_update_mtt(mr, blk_start_idx,
+                                                  idx - blk_start_idx, 1);
+                               in_block = 0;
+                       }
+               }
+       }
+       if (in_block)
+               mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
+                                  1);
+
+       /*
+        * We are now sure that the device will not access the
+        * memory. We can safely unmap it, and mark it as dirty if
+        * needed.
+        */
+
+       ib_umem_odp_unmap_dma_pages(umem, start, end);
+}
+
+#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {        \
+       if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)  \
+               ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;       \
+} while (0)
+
+int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
+{
+       int err;
+       struct mlx5_odp_caps hw_caps;
+       struct ib_odp_caps *caps = &dev->odp_caps;
+
+       memset(caps, 0, sizeof(*caps));
+
+       if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
+               return 0;
+
+       err = mlx5_query_odp_caps(dev->mdev, &hw_caps);
+       if (err)
+               goto out;
+
+       caps->general_caps = IB_ODP_SUPPORT;
+       COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
+                              SEND);
+       COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                              SEND);
+       COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                              RECV);
+       COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                              WRITE);
+       COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                              READ);
+
+out:
+       return err;
+}
+
+static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
+                                                  u32 key)
+{
+       u32 base_key = mlx5_base_mkey(key);
+       struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+       struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);
+
+       if (!mmr || mmr->key != key || !mr->live)
+               return NULL;
+
+       return container_of(mmr, struct mlx5_ib_mr, mmr);
+}
+
+static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
+                                     struct mlx5_ib_pfault *pfault,
+                                     int error) {
+       struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+       int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
+                                             pfault->mpfault.flags,
+                                             error);
+       if (ret)
+               pr_err("Failed to resolve the page fault on QP 0x%x\n",
+                      qp->mqp.qpn);
+}
+
+/*
+ * Handle a single data segment in a page-fault WQE.
+ *
+ * Returns number of pages retrieved on success. The caller will continue to
+ * the next data segment.
+ * Can return the following error codes:
+ * -EAGAIN to designate a temporary error. The caller will abort handling the
+ *  page fault and resolve it.
+ * -EFAULT when there's an error mapping the requested pages. The caller will
+ *  abort the page fault handling and possibly move the QP to an error state.
+ * On other errors the QP should also be closed with an error.
+ */
+static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
+                                        struct mlx5_ib_pfault *pfault,
+                                        u32 key, u64 io_virt, size_t bcnt,
+                                        u32 *bytes_mapped)
+{
+       struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
+       int srcu_key;
+       unsigned int current_seq;
+       u64 start_idx;
+       int npages = 0, ret = 0;
+       struct mlx5_ib_mr *mr;
+       u64 access_mask = ODP_READ_ALLOWED_BIT;
+
+       srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
+       mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
+       /*
+        * If we didn't find the MR, it means the MR was closed while we were
+        * handling the ODP event. In this case we return -EFAULT so that the
+        * QP will be closed.
+        */
+       if (!mr || !mr->ibmr.pd) {
+               pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
+                      key);
+               ret = -EFAULT;
+               goto srcu_unlock;
+       }
+       if (!mr->umem->odp_data) {
+               pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+                        key);
+               if (bytes_mapped)
+                       *bytes_mapped +=
+                               (bcnt - pfault->mpfault.bytes_committed);
+               goto srcu_unlock;
+       }
+       if (mr->ibmr.pd != qp->ibqp.pd) {
+               pr_err("Page-fault with different PDs for QP and MR.\n");
+               ret = -EFAULT;
+               goto srcu_unlock;
+       }
+
+       current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
+       /*
+        * Ensure the sequence number is valid for some time before we call
+        * gup.
+        */
+       smp_rmb();
+
+       /*
+        * Avoid branches - this code will perform correctly
+        * in all iterations (in iteration 2 and above,
+        * bytes_committed == 0).
+        */
+       io_virt += pfault->mpfault.bytes_committed;
+       bcnt -= pfault->mpfault.bytes_committed;
+
+       start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT;
+
+       if (mr->umem->writable)
+               access_mask |= ODP_WRITE_ALLOWED_BIT;
+       npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
+                                          access_mask, current_seq);
+       if (npages < 0) {
+               ret = npages;
+               goto srcu_unlock;
+       }
+
+       if (npages > 0) {
+               mutex_lock(&mr->umem->odp_data->umem_mutex);
+               if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
+                       /*
+                        * No need to check whether the MTTs really belong to
+                        * this MR, since ib_umem_odp_map_dma_pages already
+                        * checks this.
+                        */
+                       ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+               } else {
+                       ret = -EAGAIN;
+               }
+               mutex_unlock(&mr->umem->odp_data->umem_mutex);
+               if (ret < 0) {
+                       if (ret != -EAGAIN)
+                               pr_err("Failed to update mkey page tables\n");
+                       goto srcu_unlock;
+               }
+
+               if (bytes_mapped) {
+                       u32 new_mappings = npages * PAGE_SIZE -
+                               (io_virt - round_down(io_virt, PAGE_SIZE));
+                       *bytes_mapped += min_t(u32, new_mappings, bcnt);
+               }
+       }
+
+srcu_unlock:
+       if (ret == -EAGAIN) {
+               if (!mr->umem->odp_data->dying) {
+                       struct ib_umem_odp *odp_data = mr->umem->odp_data;
+                       unsigned long timeout =
+                               msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+
+                       if (!wait_for_completion_timeout(
+                                       &odp_data->notifier_completion,
+                                       timeout)) {
+                               pr_warn("timeout waiting for mmu notifier completion\n");
+                       }
+               } else {
+                       /* The MR is being killed, kill the QP as well. */
+                       ret = -EFAULT;
+               }
+       }
+       srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
+       pfault->mpfault.bytes_committed = 0;
+       return ret ? ret : npages;
+}
+
+/**
+ * Parse a series of data segments for page fault handling.
+ *
+ * @qp the QP on which the fault occurred.
+ * @pfault contains page fault information.
+ * @wqe points at the first data segment in the WQE.
+ * @wqe_end points after the end of the WQE.
+ * @bytes_mapped receives the number of bytes that the function was able to
+ *               map. This allows the caller to decide intelligently whether
+ *               enough memory was mapped to resolve the page fault
+ *               successfully (e.g. enough for the next MTU, or the entire
+ *               WQE).
+ * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
+ *                  the committed bytes).
+ *
+ * Returns the number of pages loaded if positive, zero for an empty WQE, or a
+ * negative error code.
+ */
+static int pagefault_data_segments(struct mlx5_ib_qp *qp,
+                                  struct mlx5_ib_pfault *pfault, void *wqe,
+                                  void *wqe_end, u32 *bytes_mapped,
+                                  u32 *total_wqe_bytes, int receive_queue)
+{
+       int ret = 0, npages = 0;
+       u64 io_virt;
+       u32 key;
+       u32 byte_count;
+       size_t bcnt;
+       int inline_segment;
+
+       /* Skip SRQ next-WQE segment. */
+       if (receive_queue && qp->ibqp.srq)
+               wqe += sizeof(struct mlx5_wqe_srq_next_seg);
+
+       if (bytes_mapped)
+               *bytes_mapped = 0;
+       if (total_wqe_bytes)
+               *total_wqe_bytes = 0;
+
+       while (wqe < wqe_end) {
+               struct mlx5_wqe_data_seg *dseg = wqe;
+
+               io_virt = be64_to_cpu(dseg->addr);
+               key = be32_to_cpu(dseg->lkey);
+               byte_count = be32_to_cpu(dseg->byte_count);
+               inline_segment = !!(byte_count &  MLX5_INLINE_SEG);
+               bcnt           = byte_count & ~MLX5_INLINE_SEG;
+
+               if (inline_segment) {
+                       bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
+                       wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
+                                    16);
+               } else {
+                       wqe += sizeof(*dseg);
+               }
+
+               /* receive WQE end of sg list. */
+               if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
+                   io_virt == 0)
+                       break;
+
+               if (!inline_segment && total_wqe_bytes) {
+                       *total_wqe_bytes += bcnt - min_t(size_t, bcnt,
+                                       pfault->mpfault.bytes_committed);
+               }
+
+               /* A zero length data segment designates a length of 2GB. */
+               if (bcnt == 0)
+                       bcnt = 1U << 31;
+
+               if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
+                       pfault->mpfault.bytes_committed -=
+                               min_t(size_t, bcnt,
+                                     pfault->mpfault.bytes_committed);
+                       continue;
+               }
+
+               ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
+                                                   bcnt, bytes_mapped);
+               if (ret < 0)
+                       break;
+               npages += ret;
+       }
+
+       return ret < 0 ? ret : npages;
+}
+
+/*
+ * Parse initiator WQE. Advances the wqe pointer to point at the
+ * scatter-gather list, and set wqe_end to the end of the WQE.
+ */
+static int mlx5_ib_mr_initiator_pfault_handler(
+       struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
+       void **wqe, void **wqe_end, int wqe_length)
+{
+       struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+       struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
+       u16 wqe_index = pfault->mpfault.wqe.wqe_index;
+       unsigned ds, opcode;
+#if defined(DEBUG)
+       u32 ctrl_wqe_index, ctrl_qpn;
+#endif
+
+       ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+       if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
+               mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
+                           ds, wqe_length);
+               return -EFAULT;
+       }
+
+       if (ds == 0) {
+               mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
+                           wqe_index, qp->mqp.qpn);
+               return -EFAULT;
+       }
+
+#if defined(DEBUG)
+       ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
+                       MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
+                       MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
+       if (wqe_index != ctrl_wqe_index) {
+               mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
+                           wqe_index, qp->mqp.qpn,
+                           ctrl_wqe_index);
+               return -EFAULT;
+       }
+
+       ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
+               MLX5_WQE_CTRL_QPN_SHIFT;
+       if (qp->mqp.qpn != ctrl_qpn) {
+               mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
+                           wqe_index, qp->mqp.qpn,
+                           ctrl_qpn);
+               return -EFAULT;
+       }
+#endif /* DEBUG */
+
+       *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
+       *wqe += sizeof(*ctrl);
+
+       opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
+                MLX5_WQE_CTRL_OPCODE_MASK;
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+               switch (opcode) {
+               case MLX5_OPCODE_SEND:
+               case MLX5_OPCODE_SEND_IMM:
+               case MLX5_OPCODE_SEND_INVAL:
+                       if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
+                             IB_ODP_SUPPORT_SEND))
+                               goto invalid_transport_or_opcode;
+                       break;
+               case MLX5_OPCODE_RDMA_WRITE:
+               case MLX5_OPCODE_RDMA_WRITE_IMM:
+                       if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
+                             IB_ODP_SUPPORT_WRITE))
+                               goto invalid_transport_or_opcode;
+                       *wqe += sizeof(struct mlx5_wqe_raddr_seg);
+                       break;
+               case MLX5_OPCODE_RDMA_READ:
+                       if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
+                             IB_ODP_SUPPORT_READ))
+                               goto invalid_transport_or_opcode;
+                       *wqe += sizeof(struct mlx5_wqe_raddr_seg);
+                       break;
+               default:
+                       goto invalid_transport_or_opcode;
+               }
+               break;
+       case IB_QPT_UD:
+               switch (opcode) {
+               case MLX5_OPCODE_SEND:
+               case MLX5_OPCODE_SEND_IMM:
+                       if (!(dev->odp_caps.per_transport_caps.ud_odp_caps &
+                             IB_ODP_SUPPORT_SEND))
+                               goto invalid_transport_or_opcode;
+                       *wqe += sizeof(struct mlx5_wqe_datagram_seg);
+                       break;
+               default:
+                       goto invalid_transport_or_opcode;
+               }
+               break;
+       default:
+invalid_transport_or_opcode:
+               mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n",
+                           qp->ibqp.qp_type, opcode);
+               return -EFAULT;
+       }
+
+       return 0;
+}
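
The handler above sizes and classifies the WQE entirely from the control segment: the low six bits of qpn_ds give the WQE length in 16-byte DS units, and the low byte of opmod_idx_opcode gives the opcode. A self-contained sketch of that extraction; the mask and unit values are assumptions mirroring MLX5_WQE_CTRL_DS_MASK, MLX5_WQE_CTRL_OPCODE_MASK and MLX5_WQE_DS_UNITS, and ntohl() stands in for be32_to_cpu():

#include <arpa/inet.h> /* ntohl() as a stand-in for be32_to_cpu() */
#include <stdint.h>
#include <stdio.h>

#define CTRL_DS_MASK     0x3fu /* assumed to mirror MLX5_WQE_CTRL_DS_MASK */
#define CTRL_OPCODE_MASK 0xffu /* assumed to mirror MLX5_WQE_CTRL_OPCODE_MASK */
#define WQE_DS_UNITS     16    /* assumed: one DS unit is 16 bytes */

int main(void)
{
        /* example on-the-wire (big-endian) control segment words */
        uint32_t qpn_ds = htonl(0x00012304);           /* ds = 4 */
        uint32_t opmod_idx_opcode = htonl(0x0000050a); /* opcode = 0x0a */

        unsigned int ds = ntohl(qpn_ds) & CTRL_DS_MASK;
        unsigned int opcode = ntohl(opmod_idx_opcode) & CTRL_OPCODE_MASK;

        /* prints: WQE spans 64 bytes, opcode 0xa */
        printf("WQE spans %u bytes, opcode 0x%x\n", ds * WQE_DS_UNITS, opcode);
        return 0;
}
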
+
+/*
+ * Parse a responder WQE. Advances the wqe pointer to point at the
+ * scatter-gather list, and sets wqe_end to the end of the WQE.
+ */
+static int mlx5_ib_mr_responder_pfault_handler(
+       struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
+       void **wqe, void **wqe_end, int wqe_length)
+{
+       struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+       struct mlx5_ib_wq *wq = &qp->rq;
+       int wqe_size = 1 << wq->wqe_shift;
+
+       if (qp->ibqp.srq) {
+               mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
+               return -EFAULT;
+       }
+
+       if (qp->wq_sig) {
+               mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
+               return -EFAULT;
+       }
+
+       if (wqe_size > wqe_length) {
+               mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
+               return -EFAULT;
+       }
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_RC:
+               if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
+                     IB_ODP_SUPPORT_RECV))
+                       goto invalid_transport_or_opcode;
+               break;
+       default:
+invalid_transport_or_opcode:
+               mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
+                           qp->ibqp.qp_type);
+               return -EFAULT;
+       }
+
+       *wqe_end = *wqe + wqe_size;
+
+       return 0;
+}
+
+static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
+                                         struct mlx5_ib_pfault *pfault)
+{
+       struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+       int ret;
+       void *wqe, *wqe_end;
+       u32 bytes_mapped, total_wqe_bytes;
+       char *buffer = NULL;
+       int resume_with_error = 0;
+       u16 wqe_index = pfault->mpfault.wqe.wqe_index;
+       int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
+
+       buffer = (char *)__get_free_page(GFP_KERNEL);
+       if (!buffer) {
+               mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
+               resume_with_error = 1;
+               goto resolve_page_fault;
+       }
+
+       ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
+                                   PAGE_SIZE);
+       if (ret < 0) {
+               mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
+                           -ret, wqe_index, qp->mqp.qpn);
+               resume_with_error = 1;
+               goto resolve_page_fault;
+       }
+
+       wqe = buffer;
+       if (requestor)
+               ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
+                                                         &wqe_end, ret);
+       else
+               ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
+                                                         &wqe_end, ret);
+       if (ret < 0) {
+               resume_with_error = 1;
+               goto resolve_page_fault;
+       }
+
+       if (wqe >= wqe_end) {
+               mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
+               resume_with_error = 1;
+               goto resolve_page_fault;
+       }
+
+       ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
+                                     &total_wqe_bytes, !requestor);
+       if (ret == -EAGAIN) {
+               goto resolve_page_fault;
+       } else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
+               mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
+                           -ret);
+               resume_with_error = 1;
+               goto resolve_page_fault;
+       }
+
+resolve_page_fault:
+       mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
+       mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
+                   qp->mqp.qpn, resume_with_error, pfault->mpfault.flags);
+
+       free_page((unsigned long)buffer);
+}
+
+static int pages_in_range(u64 address, u32 length)
+{
+       return (ALIGN(address + length, PAGE_SIZE) -
+               (address & PAGE_MASK)) >> PAGE_SHIFT;
+}
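
pages_in_range() counts how many pages a byte range touches by rounding the range end up and the range start down to page boundaries. Restated as a standalone program with a worked example (a 4KB PAGE_SHIFT of 12 is assumed):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

static int pages_in_range(uint64_t address, uint32_t length)
{
        return (ALIGN(address + length, PAGE_SIZE) -
                (address & PAGE_MASK)) >> PAGE_SHIFT;
}

int main(void)
{
        /* 100 bytes starting 16 bytes before a page boundary touch 2 pages */
        printf("%d\n", pages_in_range(0xfff0, 100));  /* prints 2 */
        /* a full page, page-aligned, touches exactly 1 page */
        printf("%d\n", pages_in_range(0x1000, 4096)); /* prints 1 */
        return 0;
}
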
+
+static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
+                                          struct mlx5_ib_pfault *pfault)
+{
+       struct mlx5_pagefault *mpfault = &pfault->mpfault;
+       u64 address;
+       u32 length;
+       u32 prefetch_len = mpfault->bytes_committed;
+       int prefetch_activated = 0;
+       u32 rkey = mpfault->rdma.r_key;
+       int ret;
+
+       /* The RDMA responder handler resolves the page fault in two parts.
+        * First it brings in the pages needed for the current packet
+        * (using the pfault context), and then, after resuming the QP,
+        * prefetches more pages. The second operation cannot use the pfault
+        * context and therefore uses the dummy_pfault context allocated on
+        * the stack. */
+       struct mlx5_ib_pfault dummy_pfault = {};
+
+       dummy_pfault.mpfault.bytes_committed = 0;
+
+       mpfault->rdma.rdma_va += mpfault->bytes_committed;
+       mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
+                                        mpfault->rdma.rdma_op_len);
+       mpfault->bytes_committed = 0;
+
+       address = mpfault->rdma.rdma_va;
+       length  = mpfault->rdma.rdma_op_len;
+
+       /* For some operations, the hardware cannot tell the exact message
+        * length, and in those cases it reports zero. Use prefetch
+        * logic. */
+       if (length == 0) {
+               prefetch_activated = 1;
+               length = mpfault->rdma.packet_size;
+               prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
+       }
+
+       ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
+                                           NULL);
+       if (ret == -EAGAIN) {
+               /* We're racing with an invalidation, don't prefetch */
+               prefetch_activated = 0;
+       } else if (ret < 0 || pages_in_range(address, length) > ret) {
+               mlx5_ib_page_fault_resume(qp, pfault, 1);
+               return;
+       }
+
+       mlx5_ib_page_fault_resume(qp, pfault, 0);
+
+       /* At this point, there might be a new pagefault already arriving in
+        * the EQ, so switch to the dummy pagefault for the rest of the
+        * processing. It is still safe to access the objects because the
+        * work queue is being fenced. */
+
+       if (prefetch_activated) {
+               ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
+                                                   address,
+                                                   prefetch_len,
+                                                   NULL);
+               if (ret < 0) {
+                       pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
+                               ret, prefetch_activated,
+                               qp->ibqp.qp_num, address, prefetch_len);
+               }
+       }
+}
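
Before faulting anything in, the handler above skips the bytes the hardware already committed: it advances the faulting virtual address and shrinks the remaining length, clamping it at zero. The same arithmetic as a standalone sketch with example values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t rdma_va = 0x1000;        /* faulting virtual address */
        uint32_t rdma_op_len = 0x300;     /* remaining operation length */
        uint32_t bytes_committed = 0x100; /* already handled by the HW */

        rdma_va += bytes_committed;
        rdma_op_len -= bytes_committed < rdma_op_len ?
                       bytes_committed : rdma_op_len;
        bytes_committed = 0;

        /* prints: fault range now va=0x1100 len=0x200 */
        printf("fault range now va=0x%llx len=0x%x\n",
               (unsigned long long)rdma_va, rdma_op_len);
        return 0;
}
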
+
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+                              struct mlx5_ib_pfault *pfault)
+{
+       u8 event_subtype = pfault->mpfault.event_subtype;
+
+       switch (event_subtype) {
+       case MLX5_PFAULT_SUBTYPE_WQE:
+               mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
+               break;
+       case MLX5_PFAULT_SUBTYPE_RDMA:
+               mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
+               break;
+       default:
+               pr_warn("Invalid page fault event subtype: 0x%x\n",
+                       event_subtype);
+               mlx5_ib_page_fault_resume(qp, pfault, 1);
+               break;
+       }
+}
+
+static void mlx5_ib_qp_pfault_action(struct work_struct *work)
+{
+       struct mlx5_ib_pfault *pfault = container_of(work,
+                                                    struct mlx5_ib_pfault,
+                                                    work);
+       enum mlx5_ib_pagefault_context context =
+               mlx5_ib_get_pagefault_context(&pfault->mpfault);
+       struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
+                                            pagefaults[context]);
+       mlx5_ib_mr_pfault_handler(qp, pfault);
+}
+
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+       qp->disable_page_faults = 1;
+       spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+
+       /*
+        * Note that at this point, we are guaranteed that no more
+        * work queue elements will be posted to the work queue with
+        * the QP we are closing.
+        */
+       flush_workqueue(mlx5_ib_page_fault_wq);
+}
+
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+       qp->disable_page_faults = 0;
+       spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+}
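
The pair of functions above implements a disable-then-flush protocol: the flag is flipped under the lock so no new work can be queued, and only then is the workqueue flushed, guaranteeing no handler runs against a QP being torn down. A userspace analogue of the same ordering, sketched with pthreads and hypothetical names (this only illustrates the pattern, not the driver code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int disabled;
static int pending;

static void fault_event(void)  /* the "interrupt" side */
{
        pthread_mutex_lock(&lock);
        if (!disabled)
                pending++;     /* stands in for queue_work() */
        pthread_mutex_unlock(&lock);
}

static void work_done(void)    /* called when a queued handler finishes */
{
        pthread_mutex_lock(&lock);
        if (--pending == 0)
                pthread_cond_signal(&drained);
        pthread_mutex_unlock(&lock);
}

static void disable_pagefaults(void)
{
        pthread_mutex_lock(&lock);
        disabled = 1;          /* no new work can be queued after this */
        while (pending > 0)    /* stands in for flush_workqueue() */
                pthread_cond_wait(&drained, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        fault_event();         /* a fault arrives while faults are enabled */
        work_done();           /* its handler runs to completion */
        disable_pagefaults();  /* now guaranteed: nothing queued or running */
        printf("drained\n");
        return 0;
}
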
+
+static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
+                                  struct mlx5_pagefault *pfault)
+{
+       /*
+        * Note that we will only get one fault event per QP per context
+        * (responder/initiator, read/write), until we resolve the page fault
+        * with the mlx5_ib_page_fault_resume command. Since this function is
+        * called from within the work element, there is no risk of missing
+        * events.
+        */
+       struct mlx5_ib_qp *mibqp = to_mibqp(qp);
+       enum mlx5_ib_pagefault_context context =
+               mlx5_ib_get_pagefault_context(pfault);
+       struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
+
+       qp_pfault->mpfault = *pfault;
+
+       /* No need to disable interrupts here since we are already in an interrupt */
+       spin_lock(&mibqp->disable_page_faults_lock);
+       if (!mibqp->disable_page_faults)
+               queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
+       spin_unlock(&mibqp->disable_page_faults_lock);
+}
+
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
+{
+       int i;
+
+       qp->disable_page_faults = 1;
+       spin_lock_init(&qp->disable_page_faults_lock);
+
+       qp->mqp.pfault_handler  = mlx5_ib_pfault_handler;
+
+       for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
+               INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
+}
+
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
+{
+       int ret;
+
+       ret = init_srcu_struct(&ibdev->mr_srcu);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
+{
+       cleanup_srcu_struct(&ibdev->mr_srcu);
+}
+
+int __init mlx5_ib_odp_init(void)
+{
+       mlx5_ib_page_fault_wq =
+               create_singlethread_workqueue("mlx5_ib_page_faults");
+       if (!mlx5_ib_page_fault_wq)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void mlx5_ib_odp_cleanup(void)
+{
+       destroy_workqueue(mlx5_ib_page_fault_wq);
+}
index 1cae1c7132b4b6f84ed5055c1c53e426edf8a354..be0cd358b080977ed50fdc2dedcd2e0257a19a56 100644 (file)
@@ -70,15 +70,6 @@ static const u32 mlx5_ib_opcode[] = {
        [MLX5_IB_WR_UMR]                        = MLX5_OPCODE_UMR,
 };
 
-struct umr_wr {
-       u64                             virt_addr;
-       struct ib_pd                   *pd;
-       unsigned int                    page_shift;
-       unsigned int                    npages;
-       u32                             length;
-       int                             access_flags;
-       u32                             mkey;
-};
 
 static int is_qp0(enum ib_qp_type qp_type)
 {
@@ -110,6 +101,77 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
        return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
 }
 
+/**
+ * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space.
+ *
+ * @qp: QP to copy from.
+ * @send: copy from the send queue when non-zero, use the receive queue
+ *       otherwise.
+ * @wqe_index:  index to start copying from. For send work queues, the
+ *             wqe_index is in units of MLX5_SEND_WQE_BB.
+ *             For receive work queues, it is the index of the work
+ *             queue element in the queue.
+ * @buffer: destination buffer.
+ * @length: maximum number of bytes to copy.
+ *
+ * Copies at least a single WQE, but may copy more data.
+ *
+ * Return: the number of bytes copied, or an error code.
+ */
+int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
+                         void *buffer, u32 length)
+{
+       struct ib_device *ibdev = qp->ibqp.device;
+       struct mlx5_ib_dev *dev = to_mdev(ibdev);
+       struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
+       size_t offset;
+       size_t wq_end;
+       struct ib_umem *umem = qp->umem;
+       u32 first_copy_length;
+       int wqe_length;
+       int ret;
+
+       if (wq->wqe_cnt == 0) {
+               mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n",
+                           qp->ibqp.qp_type);
+               return -EINVAL;
+       }
+
+       offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift);
+       wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift);
+
+       if (send && length < sizeof(struct mlx5_wqe_ctrl_seg))
+               return -EINVAL;
+
+       if (offset > umem->length ||
+           (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length))
+               return -EINVAL;
+
+       first_copy_length = min_t(u32, offset + length, wq_end) - offset;
+       ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length);
+       if (ret)
+               return ret;
+
+       if (send) {
+               struct mlx5_wqe_ctrl_seg *ctrl = buffer;
+               int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+
+               wqe_length = ds * MLX5_WQE_DS_UNITS;
+       } else {
+               wqe_length = 1 << wq->wqe_shift;
+       }
+
+       if (wqe_length <= first_copy_length)
+               return first_copy_length;
+
+       ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset,
+                               wqe_length - first_copy_length);
+       if (ret)
+               return ret;
+
+       return wqe_length;
+}
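
Because wqe_index wraps modulo wq->wqe_cnt, a WQE can straddle the end of the queue buffer, which is why the function copies in up to two chunks: first up to the end of the queue, then from its start. The same wraparound copy in miniature, with memcpy() standing in for ib_umem_copy_from():

#include <stdio.h>
#include <string.h>

static void ring_copy(char *dst, const char *ring, size_t size,
                      size_t offset, size_t len)
{
        size_t first = size - offset < len ? size - offset : len;

        memcpy(dst, ring + offset, first);      /* up to the end of the ring */
        memcpy(dst + first, ring, len - first); /* then wrap to the start */
}

int main(void)
{
        char ring[8] = "ABCDEFGH", out[8] = "";

        ring_copy(out, ring, sizeof(ring), 6, 4); /* copies "GHAB" */
        printf("%.4s\n", out);
        return 0;
}
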
+
 static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
 {
        struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
@@ -814,6 +876,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
        int inlen = sizeof(*in);
        int err;
 
+       mlx5_ib_odp_create_qp(qp);
+
        gen = &dev->mdev->caps.gen;
        mutex_init(&qp->mutex);
        spin_lock_init(&qp->sq.lock);
@@ -1098,11 +1162,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
        in = kzalloc(sizeof(*in), GFP_KERNEL);
        if (!in)
                return;
-       if (qp->state != IB_QPS_RESET)
+       if (qp->state != IB_QPS_RESET) {
+               mlx5_ib_qp_disable_pagefaults(qp);
                if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
                                        MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp))
                        mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
                                     qp->mqp.qpn);
+       }
 
        get_cqs(qp, &send_cq, &recv_cq);
 
@@ -1650,6 +1716,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
        if (mlx5_st < 0)
                goto out;
 
+       /* If moving to a reset or error state, we must disable page faults on
+        * this QP and flush all current page faults. Otherwise a stale page
+        * fault may attempt to work on this QP after it is reset and moved
+        * again to RTS, and may cause the driver and the device to get out of
+        * sync. */
+       if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+           (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+               mlx5_ib_qp_disable_pagefaults(qp);
+
        optpar = ib_mask_to_mlx5_opt(attr_mask);
        optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
        in->optparam = cpu_to_be32(optpar);
@@ -1659,6 +1734,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
        if (err)
                goto out;
 
+       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+               mlx5_ib_qp_enable_pagefaults(qp);
+
        qp->state = new_state;
 
        if (attr_mask & IB_QP_ACCESS_FLAGS)
@@ -1848,37 +1926,70 @@ static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
        umr->mkey_mask = frwr_mkey_mask();
 }
 
+static __be64 get_umr_reg_mr_mask(void)
+{
+       u64 result;
+
+       result = MLX5_MKEY_MASK_LEN             |
+                MLX5_MKEY_MASK_PAGE_SIZE       |
+                MLX5_MKEY_MASK_START_ADDR      |
+                MLX5_MKEY_MASK_PD              |
+                MLX5_MKEY_MASK_LR              |
+                MLX5_MKEY_MASK_LW              |
+                MLX5_MKEY_MASK_KEY             |
+                MLX5_MKEY_MASK_RR              |
+                MLX5_MKEY_MASK_RW              |
+                MLX5_MKEY_MASK_A               |
+                MLX5_MKEY_MASK_FREE;
+
+       return cpu_to_be64(result);
+}
+
+static __be64 get_umr_unreg_mr_mask(void)
+{
+       u64 result;
+
+       result = MLX5_MKEY_MASK_FREE;
+
+       return cpu_to_be64(result);
+}
+
+static __be64 get_umr_update_mtt_mask(void)
+{
+       u64 result;
+
+       result = MLX5_MKEY_MASK_FREE;
+
+       return cpu_to_be64(result);
+}
+
 static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
                                struct ib_send_wr *wr)
 {
-       struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg;
-       u64 mask;
+       struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
 
        memset(umr, 0, sizeof(*umr));
 
+       if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE)
+               umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */
+       else
+               umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */
+
        if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) {
-               umr->flags = 1 << 5; /* fail if not free */
                umr->klm_octowords = get_klm_octo(umrwr->npages);
-               mask =  MLX5_MKEY_MASK_LEN              |
-                       MLX5_MKEY_MASK_PAGE_SIZE        |
-                       MLX5_MKEY_MASK_START_ADDR       |
-                       MLX5_MKEY_MASK_PD               |
-                       MLX5_MKEY_MASK_LR               |
-                       MLX5_MKEY_MASK_LW               |
-                       MLX5_MKEY_MASK_KEY              |
-                       MLX5_MKEY_MASK_RR               |
-                       MLX5_MKEY_MASK_RW               |
-                       MLX5_MKEY_MASK_A                |
-                       MLX5_MKEY_MASK_FREE;
-               umr->mkey_mask = cpu_to_be64(mask);
+               if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) {
+                       umr->mkey_mask = get_umr_update_mtt_mask();
+                       umr->bsf_octowords = get_klm_octo(umrwr->target.offset);
+                       umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
+               } else {
+                       umr->mkey_mask = get_umr_reg_mr_mask();
+               }
        } else {
-               umr->flags = 2 << 5; /* fail if free */
-               mask = MLX5_MKEY_MASK_FREE;
-               umr->mkey_mask = cpu_to_be64(mask);
+               umr->mkey_mask = get_umr_unreg_mr_mask();
        }
 
        if (!wr->num_sge)
-               umr->flags |= (1 << 7); /* inline */
+               umr->flags |= MLX5_UMR_INLINE;
 }
 
 static u8 get_umr_flags(int acc)
@@ -1895,7 +2006,7 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
 {
        memset(seg, 0, sizeof(*seg));
        if (li) {
-               seg->status = 1 << 6;
+               seg->status = MLX5_MKEY_STATUS_FREE;
                return;
        }
 
@@ -1912,19 +2023,23 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
 
 static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr)
 {
+       struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
+
        memset(seg, 0, sizeof(*seg));
        if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) {
-               seg->status = 1 << 6;
+               seg->status = MLX5_MKEY_STATUS_FREE;
                return;
        }
 
-       seg->flags = convert_access(wr->wr.fast_reg.access_flags);
-       seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn);
-       seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
-       seg->len = cpu_to_be64(wr->wr.fast_reg.length);
-       seg->log2_page_size = wr->wr.fast_reg.page_shift;
+       seg->flags = convert_access(umrwr->access_flags);
+       if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) {
+               seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
+               seg->start_addr = cpu_to_be64(umrwr->target.virt_addr);
+       }
+       seg->len = cpu_to_be64(umrwr->length);
+       seg->log2_page_size = umrwr->page_shift;
        seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
-                                      mlx5_mkey_variant(wr->wr.fast_reg.rkey));
+                                      mlx5_mkey_variant(umrwr->mkey));
 }
 
 static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
@@ -2927,6 +3042,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
        int mlx5_state;
        int err = 0;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       /*
+        * Wait for any outstanding page faults, in case the user frees memory
+        * based upon this query's result.
+        */
+       flush_workqueue(mlx5_ib_page_fault_wq);
+#endif
+
        mutex_lock(&qp->mutex);
        outb = kzalloc(sizeof(*outb), GFP_KERNEL);
        if (!outb) {
index fef067c959fcf23c9455f96c17e8d734d6a894f6..c0d0296e7a003089dec6d8d180de2bb6dd05964d 100644 (file)
@@ -2341,9 +2341,9 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u,"
                        " offset = %u, page size = %u.\n",
                        (unsigned long int)start, (unsigned long int)virt, (u32)length,
-                       region->offset, region->page_size);
+                       ib_umem_offset(region), region->page_size);
 
-       skip_pages = ((u32)region->offset) >> 12;
+       skip_pages = ((u32)ib_umem_offset(region)) >> 12;
 
        if (ib_copy_from_udata(&req, udata, sizeof(req))) {
                ib_umem_release(region);
@@ -2408,7 +2408,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                region_length -= skip_pages << 12;
                                for (page_index = skip_pages; page_index < chunk_pages; page_index++) {
                                        skip_pages = 0;
-                                       if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length)
+                                       if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
                                                goto enough_pages;
                                        if ((page_count&0x01FF) == 0) {
                                                if (page_count >= 1024 * 512) {
index ac02ce4e804045afa74df9cb34b1f50d1b146361..f3cc8c9e65ae70f9e0b157632189e324970cbea5 100644 (file)
@@ -96,7 +96,6 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
        struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
        struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
        union ib_gid sgid;
-       u8 zmac[ETH_ALEN];
 
        if (!(attr->ah_flags & IB_AH_GRH))
                return ERR_PTR(-EINVAL);
@@ -118,9 +117,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
                goto av_conf_err;
        }
 
-       memset(&zmac, 0, ETH_ALEN);
-       if (pd->uctx &&
-           memcmp(attr->dmac, &zmac, ETH_ALEN)) {
+       if (pd->uctx) {
                status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
                                         attr->dmac, &attr->vlan_id);
                if (status) {
index 4c68305ee7814685f60d51627b0d69d28907c0f3..fb8d8c4dfbb97d2b36abdf69888793741aba182a 100644 (file)
@@ -805,7 +805,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
                goto umem_err;
 
        mr->hwmr.pbe_size = mr->umem->page_size;
-       mr->hwmr.fbo = mr->umem->offset;
+       mr->hwmr.fbo = ib_umem_offset(mr->umem);
        mr->hwmr.va = usr_addr;
        mr->hwmr.len = len;
        mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
@@ -1410,6 +1410,8 @@ int ocrdma_query_qp(struct ib_qp *ibqp,
        mutex_unlock(&dev->dev_lock);
        if (status)
                goto mbx_err;
+       if (qp->qp_type == IB_QPT_UD)
+               qp_attr->qkey = params.qkey;
        qp_attr->qp_state = get_ibqp_state(IB_QPS_INIT);
        qp_attr->cur_qp_state = get_ibqp_state(IB_QPS_INIT);
        qp_attr->path_mtu =
index 9bbb55347cc1934bf9335da1b656a246d1c5cbe2..a77fb4fb14e43c255e23a41b7c86877835a78ea3 100644 (file)
@@ -258,7 +258,7 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        mr->mr.user_base = start;
        mr->mr.iova = virt_addr;
        mr->mr.length = length;
-       mr->mr.offset = umem->offset;
+       mr->mr.offset = ib_umem_offset(umem);
        mr->mr.access_flags = mr_access_flags;
        mr->umem = umem;
 
index d7562beb542367faf1b93d7ba66e8ef879c73bf4..8ba80a6d3a46d17daf233945e76d8760c107a752 100644 (file)
@@ -98,9 +98,15 @@ enum {
 
        IPOIB_MCAST_FLAG_FOUND    = 0,  /* used in set_multicast_list */
        IPOIB_MCAST_FLAG_SENDONLY = 1,
-       IPOIB_MCAST_FLAG_BUSY     = 2,  /* joining or already joined */
+       /*
+        * For IPOIB_MCAST_FLAG_BUSY
+        * When set, a join is in flight and mcast->mc is unreliable
+        * When clear and mcast->mc IS_ERR_OR_NULL, the join needs to be
+        *   restarted or hasn't started yet
+        * When clear and mcast->mc is a valid pointer, the join was successful
+        */
+       IPOIB_MCAST_FLAG_BUSY     = 2,
        IPOIB_MCAST_FLAG_ATTACHED = 3,
-       IPOIB_MCAST_JOIN_STARTED  = 4,
 
        MAX_SEND_CQE              = 16,
        IPOIB_CM_COPYBREAK        = 256,
@@ -317,6 +323,7 @@ struct ipoib_dev_priv {
        struct list_head multicast_list;
        struct rb_root multicast_tree;
 
+       struct workqueue_struct *wq;
        struct delayed_work mcast_task;
        struct work_struct carrier_on_task;
        struct work_struct flush_light;
@@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
-int ipoib_ib_dev_open(struct net_device *dev, int flush);
+int ipoib_ib_dev_open(struct net_device *dev);
 int ipoib_ib_dev_up(struct net_device *dev);
-int ipoib_ib_dev_down(struct net_device *dev, int flush);
-int ipoib_ib_dev_stop(struct net_device *dev, int flush);
+int ipoib_ib_dev_down(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev);
 void ipoib_pkey_dev_check_presence(struct net_device *dev);
 
 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
@@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
 
 void ipoib_mcast_restart_task(struct work_struct *work);
 int ipoib_mcast_start_thread(struct net_device *dev);
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
+int ipoib_mcast_stop_thread(struct net_device *dev);
 
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);
index 933efcea0d03f11b4da3967b8eedc137da21e08a..56959adb6c7da51ccbb6d20307247b7cb69ad55a 100644 (file)
@@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
        }
 
        spin_lock_irq(&priv->lock);
-       queue_delayed_work(ipoib_workqueue,
+       queue_delayed_work(priv->wq,
                           &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
        /* Add this entry to passive ids list head, but do not re-add it
         * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
@@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
                        spin_lock_irqsave(&priv->lock, flags);
                        list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
                        ipoib_cm_start_rx_drain(priv);
-                       queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+                       queue_work(priv->wq, &priv->cm.rx_reap_task);
                        spin_unlock_irqrestore(&priv->lock, flags);
                } else
                        ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
@@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
                                spin_lock_irqsave(&priv->lock, flags);
                                list_move(&p->list, &priv->cm.rx_reap_list);
                                spin_unlock_irqrestore(&priv->lock, flags);
-                               queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+                               queue_work(priv->wq, &priv->cm.rx_reap_task);
                        }
                        return;
                }
@@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
                if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
                        list_move(&tx->list, &priv->cm.reap_list);
-                       queue_work(ipoib_workqueue, &priv->cm.reap_task);
+                       queue_work(priv->wq, &priv->cm.reap_task);
                }
 
                clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
@@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 
                if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
                        list_move(&tx->list, &priv->cm.reap_list);
-                       queue_work(ipoib_workqueue, &priv->cm.reap_task);
+                       queue_work(priv->wq, &priv->cm.reap_task);
                }
 
                spin_unlock_irqrestore(&priv->lock, flags);
@@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path
        tx->dev = dev;
        list_add(&tx->list, &priv->cm.start_list);
        set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
-       queue_work(ipoib_workqueue, &priv->cm.start_task);
+       queue_work(priv->wq, &priv->cm.start_task);
        return tx;
 }
 
@@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
        if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
                spin_lock_irqsave(&priv->lock, flags);
                list_move(&tx->list, &priv->cm.reap_list);
-               queue_work(ipoib_workqueue, &priv->cm.reap_task);
+               queue_work(priv->wq, &priv->cm.reap_task);
                ipoib_dbg(priv, "Reap connection for gid %pI6\n",
                          tx->neigh->daddr + 4);
                tx->neigh = NULL;
@@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
 
        skb_queue_tail(&priv->cm.skb_queue, skb);
        if (e)
-               queue_work(ipoib_workqueue, &priv->cm.skb_task);
+               queue_work(priv->wq, &priv->cm.skb_task);
 }
 
 static void ipoib_cm_rx_reap(struct work_struct *work)
@@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work)
        }
 
        if (!list_empty(&priv->cm.passive_ids))
-               queue_delayed_work(ipoib_workqueue,
+               queue_delayed_work(priv->wq,
                                   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
        spin_unlock_irq(&priv->lock);
 }
index 72626c3481749b962fe96b79722d7c8e9c99c585..fe65abb5150c76b2eb941b3b2331930bc5b2b81e 100644 (file)
@@ -655,7 +655,7 @@ void ipoib_reap_ah(struct work_struct *work)
        __ipoib_reap_ah(dev);
 
        if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
-               queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+               queue_delayed_work(priv->wq, &priv->ah_reap_task,
                                   round_jiffies_relative(HZ));
 }
 
@@ -664,7 +664,7 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx)
        drain_tx_cq((struct net_device *)ctx);
 }
 
-int ipoib_ib_dev_open(struct net_device *dev, int flush)
+int ipoib_ib_dev_open(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int ret;
@@ -696,7 +696,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush)
        }
 
        clear_bit(IPOIB_STOP_REAPER, &priv->flags);
-       queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+       queue_delayed_work(priv->wq, &priv->ah_reap_task,
                           round_jiffies_relative(HZ));
 
        if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
@@ -706,7 +706,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush)
 dev_stop:
        if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
                napi_enable(&priv->napi);
-       ipoib_ib_dev_stop(dev, flush);
+       ipoib_ib_dev_stop(dev);
        return -1;
 }
 
@@ -738,7 +738,7 @@ int ipoib_ib_dev_up(struct net_device *dev)
        return ipoib_mcast_start_thread(dev);
 }
 
-int ipoib_ib_dev_down(struct net_device *dev, int flush)
+int ipoib_ib_dev_down(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
 
@@ -747,7 +747,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)
        clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
        netif_carrier_off(dev);
 
-       ipoib_mcast_stop_thread(dev, flush);
+       ipoib_mcast_stop_thread(dev);
        ipoib_mcast_dev_flush(dev);
 
        ipoib_flush_paths(dev);
@@ -807,7 +807,7 @@ void ipoib_drain_cq(struct net_device *dev)
        local_bh_enable();
 }
 
-int ipoib_ib_dev_stop(struct net_device *dev, int flush)
+int ipoib_ib_dev_stop(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_qp_attr qp_attr;
@@ -880,8 +880,7 @@ timeout:
        /* Wait for all AHs to be reaped */
        set_bit(IPOIB_STOP_REAPER, &priv->flags);
        cancel_delayed_work(&priv->ah_reap_task);
-       if (flush)
-               flush_workqueue(ipoib_workqueue);
+       flush_workqueue(priv->wq);
 
        begin = jiffies;
 
@@ -918,7 +917,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
                    (unsigned long) dev);
 
        if (dev->flags & IFF_UP) {
-               if (ipoib_ib_dev_open(dev, 1)) {
+               if (ipoib_ib_dev_open(dev)) {
                        ipoib_transport_dev_cleanup(dev);
                        return -ENODEV;
                }
@@ -1040,12 +1039,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
        }
 
        if (level >= IPOIB_FLUSH_NORMAL)
-               ipoib_ib_dev_down(dev, 0);
+               ipoib_ib_dev_down(dev);
 
        if (level == IPOIB_FLUSH_HEAVY) {
                if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
-                       ipoib_ib_dev_stop(dev, 0);
-               if (ipoib_ib_dev_open(dev, 0) != 0)
+                       ipoib_ib_dev_stop(dev);
+               if (ipoib_ib_dev_open(dev) != 0)
                        return;
                if (netif_queue_stopped(dev))
                        netif_start_queue(dev);
@@ -1097,7 +1096,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
         */
        ipoib_flush_paths(dev);
 
-       ipoib_mcast_stop_thread(dev, 1);
+       ipoib_mcast_stop_thread(dev);
        ipoib_mcast_dev_flush(dev);
 
        ipoib_transport_dev_cleanup(dev);
index 58b5aa3b6f2dded5d2e6d15aff080551aa9eddd9..6bad17d4d5880886f88ef48d8424abe4347cdc50 100644 (file)
@@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev)
 
        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
 
-       if (ipoib_ib_dev_open(dev, 1)) {
+       if (ipoib_ib_dev_open(dev)) {
                if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
                        return 0;
                goto err_disable;
@@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev)
        return 0;
 
 err_stop:
-       ipoib_ib_dev_stop(dev, 1);
+       ipoib_ib_dev_stop(dev);
 
 err_disable:
        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
@@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev)
 
        netif_stop_queue(dev);
 
-       ipoib_ib_dev_down(dev, 1);
-       ipoib_ib_dev_stop(dev, 0);
+       ipoib_ib_dev_down(dev);
+       ipoib_ib_dev_stop(dev);
 
        if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
                struct ipoib_dev_priv *cpriv;
@@ -839,7 +839,7 @@ static void ipoib_set_mcast_list(struct net_device *dev)
                return;
        }
 
-       queue_work(ipoib_workqueue, &priv->restart_task);
+       queue_work(priv->wq, &priv->restart_task);
 }
 
 static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
@@ -954,7 +954,7 @@ static void ipoib_reap_neigh(struct work_struct *work)
        __ipoib_reap_neigh(priv);
 
        if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
-               queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
+               queue_delayed_work(priv->wq, &priv->neigh_reap_task,
                                   arp_tbl.gc_interval);
 }
 
@@ -1133,7 +1133,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
 
        /* start garbage collection */
        clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
-       queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
+       queue_delayed_work(priv->wq, &priv->neigh_reap_task,
                           arp_tbl.gc_interval);
 
        return 0;
@@ -1262,15 +1262,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-       if (ipoib_neigh_hash_init(priv) < 0)
-               goto out;
        /* Allocate RX/TX "rings" to hold queued skbs */
        priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
                                GFP_KERNEL);
        if (!priv->rx_ring) {
                printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
                       ca->name, ipoib_recvq_size);
-               goto out_neigh_hash_cleanup;
+               goto out;
        }
 
        priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
@@ -1285,16 +1283,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
        if (ipoib_ib_dev_init(dev, ca, port))
                goto out_tx_ring_cleanup;
 
+       /*
+        * Must be after ipoib_ib_dev_init so we can allocate a per-device
+        * wq there and use it here
+        */
+       if (ipoib_neigh_hash_init(priv) < 0)
+               goto out_dev_uninit;
+
        return 0;
 
+out_dev_uninit:
+       ipoib_ib_dev_cleanup(dev);
+
 out_tx_ring_cleanup:
        vfree(priv->tx_ring);
 
 out_rx_ring_cleanup:
        kfree(priv->rx_ring);
 
-out_neigh_hash_cleanup:
-       ipoib_neigh_hash_uninit(dev);
 out:
        return -ENOMEM;
 }
@@ -1317,6 +1323,12 @@ void ipoib_dev_cleanup(struct net_device *dev)
        }
        unregister_netdevice_many(&head);
 
+       /*
+        * Must be before ipoib_ib_dev_cleanup or we delete an in-use
+        * work queue
+        */
+       ipoib_neigh_hash_uninit(dev);
+
        ipoib_ib_dev_cleanup(dev);
 
        kfree(priv->rx_ring);
@@ -1324,8 +1336,6 @@ void ipoib_dev_cleanup(struct net_device *dev)
 
        priv->rx_ring = NULL;
        priv->tx_ring = NULL;
-
-       ipoib_neigh_hash_uninit(dev);
 }
 
 static const struct header_ops ipoib_header_ops = {
@@ -1636,7 +1646,7 @@ register_failed:
        /* Stop GC if started before flush */
        set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
        cancel_delayed_work(&priv->neigh_reap_task);
-       flush_workqueue(ipoib_workqueue);
+       flush_workqueue(priv->wq);
 
 event_failed:
        ipoib_dev_cleanup(priv->dev);
@@ -1707,7 +1717,7 @@ static void ipoib_remove_one(struct ib_device *device)
                /* Stop GC */
                set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
                cancel_delayed_work(&priv->neigh_reap_task);
-               flush_workqueue(ipoib_workqueue);
+               flush_workqueue(priv->wq);
 
                unregister_netdev(priv->dev);
                free_netdev(priv->dev);
@@ -1748,8 +1758,13 @@ static int __init ipoib_init_module(void)
         * unregister_netdev() and linkwatch_event take the rtnl lock,
         * so flush_scheduled_work() can deadlock during device
         * removal.
+        *
+        * In addition, bringing one device up and another down at the
+        * same time can deadlock a single workqueue, so we have this
+        * global fallback workqueue, but we also attempt to open a
+        * per-device workqueue each time we bring an interface up
         */
-       ipoib_workqueue = create_singlethread_workqueue("ipoib");
+       ipoib_workqueue = create_singlethread_workqueue("ipoib_flush");
        if (!ipoib_workqueue) {
                ret = -ENOMEM;
                goto err_fs;
index ffb83b5f7e805e411f1506d66a53f8465b90c439..bc50dd0d0e4dad7790725b0414d807d42fe82493 100644 (file)
@@ -190,12 +190,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
                spin_unlock_irq(&priv->lock);
                priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
                set_qkey = 1;
-
-               if (!ipoib_cm_admin_enabled(dev)) {
-                       rtnl_lock();
-                       dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu));
-                       rtnl_unlock();
-               }
        }
 
        if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
@@ -277,16 +271,27 @@ ipoib_mcast_sendonly_join_complete(int status,
        struct ipoib_mcast *mcast = multicast->context;
        struct net_device *dev = mcast->dev;
 
+       /*
+        * We have to take the mutex to force mcast_sendonly_join to
+        * return from ib_sa_multicast_join and set mcast->mc to a
+        * valid value.  Otherwise we would be racing with ourselves:
+        * we might fail here but get a valid return from
+        * ib_sa_multicast_join after we had cleared mcast->mc here,
+        * resulting in mismatched joins and leaves and a deadlock
+        */
+       mutex_lock(&mcast_mutex);
+
        /* We trap for port events ourselves. */
        if (status == -ENETRESET)
-               return 0;
+               goto out;
 
        if (!status)
                status = ipoib_mcast_join_finish(mcast, &multicast->rec);
 
        if (status) {
                if (mcast->logcount++ < 20)
-                       ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n",
+                       ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast "
+                                       "join failed for %pI6, status %d\n",
                                        mcast->mcmember.mgid.raw, status);
 
                /* Flush out any queued packets */
@@ -296,11 +301,15 @@ ipoib_mcast_sendonly_join_complete(int status,
                        dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
                }
                netif_tx_unlock_bh(dev);
-
-               /* Clear the busy flag so we try again */
-               status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
-                                           &mcast->flags);
        }
+out:
+       clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+       if (status)
+               mcast->mc = NULL;
+       complete(&mcast->done);
+       if (status == -ENETRESET)
+               status = 0;
+       mutex_unlock(&mcast_mutex);
        return status;
 }
 
@@ -318,12 +327,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
        int ret = 0;
 
        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
-               ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
+               ipoib_dbg_mcast(priv, "device shutting down, no sendonly "
+                               "multicast joins\n");
                return -ENODEV;
        }
 
-       if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
-               ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
+       if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+               ipoib_dbg_mcast(priv, "multicast entry busy, skipping "
+                               "sendonly join\n");
                return -EBUSY;
        }
 
@@ -331,6 +342,9 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
        rec.port_gid = priv->local_gid;
        rec.pkey     = cpu_to_be16(priv->pkey);
 
+       mutex_lock(&mcast_mutex);
+       init_completion(&mcast->done);
+       set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
        mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
                                         priv->port, &rec,
                                         IB_SA_MCMEMBER_REC_MGID        |
@@ -343,12 +357,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
        if (IS_ERR(mcast->mc)) {
                ret = PTR_ERR(mcast->mc);
                clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-               ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
-                          ret);
+               complete(&mcast->done);
+               ipoib_warn(priv, "ib_sa_join_multicast for sendonly join "
+                          "failed (ret = %d)\n", ret);
        } else {
-               ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n",
-                               mcast->mcmember.mgid.raw);
+               ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting "
+                               "sendonly join\n", mcast->mcmember.mgid.raw);
        }
+       mutex_unlock(&mcast_mutex);
 
        return ret;
 }
@@ -359,18 +375,29 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work)
                                                   carrier_on_task);
        struct ib_port_attr attr;
 
-       /*
-        * Take rtnl_lock to avoid racing with ipoib_stop() and
-        * turning the carrier back on while a device is being
-        * removed.
-        */
        if (ib_query_port(priv->ca, priv->port, &attr) ||
            attr.state != IB_PORT_ACTIVE) {
                ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
                return;
        }
 
-       rtnl_lock();
+       /*
+        * Take rtnl_lock to avoid racing with ipoib_stop() and
+        * turning the carrier back on while a device is being
+        * removed.  However, ipoib_stop() will attempt to flush
+        * the workqueue while holding the rtnl lock, so loop
+        * on trylock until either we get the lock or we see
+        * FLAG_ADMIN_UP go away, which signals that we are bailing
+        * out and can safely ignore the carrier-on work.
+        */
+       while (!rtnl_trylock()) {
+               if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+                       return;
+               else
+                       msleep(20);
+       }
+       if (!ipoib_cm_admin_enabled(priv->dev))
+               dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu));
        netif_carrier_on(priv->dev);
        rtnl_unlock();
 }
@@ -385,60 +412,63 @@ static int ipoib_mcast_join_complete(int status,
        ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n",
                        mcast->mcmember.mgid.raw, status);
 
+       /*
+        * We have to take the mutex to force mcast_join to
+        * return from ib_sa_multicast_join and set mcast->mc to a
+        * valid value.  Otherwise we would be racing with ourselves:
+        * we might fail here but get a valid return from
+        * ib_sa_multicast_join after we had cleared mcast->mc here,
+        * resulting in mismatched joins and leaves and a deadlock
+        */
+       mutex_lock(&mcast_mutex);
+
        /* We trap for port events ourselves. */
-       if (status == -ENETRESET) {
-               status = 0;
+       if (status == -ENETRESET)
                goto out;
-       }
 
        if (!status)
                status = ipoib_mcast_join_finish(mcast, &multicast->rec);
 
        if (!status) {
                mcast->backoff = 1;
-               mutex_lock(&mcast_mutex);
                if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-                       queue_delayed_work(ipoib_workqueue,
-                                          &priv->mcast_task, 0);
-               mutex_unlock(&mcast_mutex);
+                       queue_delayed_work(priv->wq, &priv->mcast_task, 0);
 
                /*
-                * Defer carrier on work to ipoib_workqueue to avoid a
+                * Defer carrier on work to priv->wq to avoid a
                 * deadlock on rtnl_lock here.
                 */
                if (mcast == priv->broadcast)
-                       queue_work(ipoib_workqueue, &priv->carrier_on_task);
-
-               status = 0;
-               goto out;
-       }
-
-       if (mcast->logcount++ < 20) {
-               if (status == -ETIMEDOUT || status == -EAGAIN) {
-                       ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
-                                       mcast->mcmember.mgid.raw, status);
-               } else {
-                       ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
-                                  mcast->mcmember.mgid.raw, status);
+                       queue_work(priv->wq, &priv->carrier_on_task);
+       } else {
+               if (mcast->logcount++ < 20) {
+                       if (status == -ETIMEDOUT || status == -EAGAIN) {
+                               ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
+                                               mcast->mcmember.mgid.raw, status);
+                       } else {
+                               ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
+                                          mcast->mcmember.mgid.raw, status);
+                       }
                }
-       }
-
-       mcast->backoff *= 2;
-       if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
-               mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
 
-       /* Clear the busy flag so we try again */
-       status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-
-       mutex_lock(&mcast_mutex);
+               mcast->backoff *= 2;
+               if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+                       mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+       }
+out:
        spin_lock_irq(&priv->lock);
-       if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-               queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
+       clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+       if (status)
+               mcast->mc = NULL;
+       complete(&mcast->done);
+       if (status == -ENETRESET)
+               status = 0;
+       if (status && test_bit(IPOIB_MCAST_RUN, &priv->flags))
+               queue_delayed_work(priv->wq, &priv->mcast_task,
                                   mcast->backoff * HZ);
        spin_unlock_irq(&priv->lock);
        mutex_unlock(&mcast_mutex);
-out:
-       complete(&mcast->done);
+
        return status;
 }
 
@@ -487,10 +517,9 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
                rec.hop_limit     = priv->broadcast->mcmember.hop_limit;
        }
 
-       set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+       mutex_lock(&mcast_mutex);
        init_completion(&mcast->done);
-       set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags);
-
+       set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
        mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
                                         &rec, comp_mask, GFP_KERNEL,
                                         ipoib_mcast_join_complete, mcast);
@@ -504,13 +533,11 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
                if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
                        mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
 
-               mutex_lock(&mcast_mutex);
                if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-                       queue_delayed_work(ipoib_workqueue,
-                                          &priv->mcast_task,
+                       queue_delayed_work(priv->wq, &priv->mcast_task,
                                           mcast->backoff * HZ);
-               mutex_unlock(&mcast_mutex);
        }
+       mutex_unlock(&mcast_mutex);
 }
 
 void ipoib_mcast_join_task(struct work_struct *work)
@@ -547,8 +574,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
                        ipoib_warn(priv, "failed to allocate broadcast group\n");
                        mutex_lock(&mcast_mutex);
                        if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-                               queue_delayed_work(ipoib_workqueue,
-                                                  &priv->mcast_task, HZ);
+                               queue_delayed_work(priv->wq, &priv->mcast_task,
+                                                  HZ);
                        mutex_unlock(&mcast_mutex);
                        return;
                }
@@ -563,7 +590,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
        }
 
        if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
-               if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
+               if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
+                   !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
                        ipoib_mcast_join(dev, priv->broadcast, 0);
                return;
        }
@@ -571,23 +599,33 @@ void ipoib_mcast_join_task(struct work_struct *work)
        while (1) {
                struct ipoib_mcast *mcast = NULL;
 
+               /*
+                * Need the mutex so our flags are consistent, need the
+                * priv->lock so we don't race with list removals in either
+                * mcast_dev_flush or mcast_restart_task
+                */
+               mutex_lock(&mcast_mutex);
                spin_lock_irq(&priv->lock);
                list_for_each_entry(mcast, &priv->multicast_list, list) {
-                       if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
-                           && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
-                           && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+                       if (IS_ERR_OR_NULL(mcast->mc) &&
+                           !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
+                           !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
                                /* Found the next unjoined group */
                                break;
                        }
                }
                spin_unlock_irq(&priv->lock);
+               mutex_unlock(&mcast_mutex);
 
                if (&mcast->list == &priv->multicast_list) {
                        /* All done */
                        break;
                }
 
-               ipoib_mcast_join(dev, mcast, 1);
+               if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+                       ipoib_mcast_sendonly_join(mcast);
+               else
+                       ipoib_mcast_join(dev, mcast, 1);
                return;
        }
 
@@ -604,13 +642,13 @@ int ipoib_mcast_start_thread(struct net_device *dev)
 
        mutex_lock(&mcast_mutex);
        if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
-               queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
+               queue_delayed_work(priv->wq, &priv->mcast_task, 0);
        mutex_unlock(&mcast_mutex);
 
        return 0;
 }
 
-int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
+int ipoib_mcast_stop_thread(struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
 
@@ -621,8 +659,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
        cancel_delayed_work(&priv->mcast_task);
        mutex_unlock(&mcast_mutex);
 
-       if (flush)
-               flush_workqueue(ipoib_workqueue);
+       flush_workqueue(priv->wq);
 
        return 0;
 }
@@ -633,6 +670,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
        int ret = 0;
 
        if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+               ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n");
+
+       if (!IS_ERR_OR_NULL(mcast->mc))
                ib_sa_free_multicast(mcast->mc);
 
        if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
@@ -685,6 +725,8 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
                memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
                __ipoib_mcast_add(dev, mcast);
                list_add_tail(&mcast->list, &priv->multicast_list);
+               if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+                       queue_delayed_work(priv->wq, &priv->mcast_task, 0);
        }
 
        if (!mcast->ah) {
@@ -698,8 +740,6 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
                if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
                        ipoib_dbg_mcast(priv, "no address vector, "
                                        "but multicast join already started\n");
-               else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
-                       ipoib_mcast_sendonly_join(mcast);
 
                /*
                 * If lookup completes between here and out:, don't
@@ -759,9 +799,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev)
 
        spin_unlock_irqrestore(&priv->lock, flags);
 
-       /* seperate between the wait to the leave*/
+       /*
+        * make sure the in-flight joins have finished before we attempt
+        * to leave
+        */
        list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
-               if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags))
+               if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
                        wait_for_completion(&mcast->done);
 
        list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
@@ -794,8 +837,6 @@ void ipoib_mcast_restart_task(struct work_struct *work)
 
        ipoib_dbg_mcast(priv, "restarting multicast task\n");
 
-       ipoib_mcast_stop_thread(dev, 0);
-
        local_irq_save(flags);
        netif_addr_lock(dev);
        spin_lock(&priv->lock);
@@ -880,14 +921,38 @@ void ipoib_mcast_restart_task(struct work_struct *work)
        netif_addr_unlock(dev);
        local_irq_restore(flags);
 
-       /* We have to cancel outside of the spinlock */
+       /*
+        * make sure the in-flight joins have finished before we attempt
+        * to leave
+        */
+       list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
+               if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+                       wait_for_completion(&mcast->done);
+
+       /*
+        * We have to cancel outside of the spinlock, but we have to
+        * take the rtnl lock or else we race with the removal of
+        * entries from the remove list in mcast_dev_flush as part
+        * of ipoib_stop().  We detect the drop of the ADMIN_UP flag
+        * to signal that we have hit this particular race, and we
+        * return since we know we don't need to do anything else
+        * anyway.
+        */
+       while (!rtnl_trylock()) {
+               if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+                       return;
+               else
+                       msleep(20);
+       }
        list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
                ipoib_mcast_leave(mcast->dev, mcast);
                ipoib_mcast_free(mcast);
        }
-
-       if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
-               ipoib_mcast_start_thread(dev);
+       /*
+        * Restart our join task if needed
+        */
+       ipoib_mcast_start_thread(dev);
+       rtnl_unlock();
 }
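
The rtnl_trylock() loop above is the key to the comment: the task must not block on rtnl_lock() because ipoib_stop() can hold RTNL while waiting to flush this same workqueue, so it polls and bails out once ADMIN_UP drops. A hedged user-space model of that trylock-or-bail shape (names illustrative; pthreads and an atomic flag stand in for RTNL and the device flag):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <unistd.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;	/* stands in for RTNL */
static atomic_bool admin_up = true;				/* stands in for IPOIB_FLAG_ADMIN_UP */

/* Returns true with big_lock held, or false if the device went down,
 * in which case the path that cleared admin_up owns the cleanup. */
static bool trylock_or_bail(void)
{
	while (pthread_mutex_trylock(&big_lock) != 0) {
		if (!atomic_load(&admin_up))
			return false;
		usleep(20000);		/* roughly msleep(20) */
	}
	return true;
}

int main(void)
{
	if (trylock_or_bail())
		pthread_mutex_unlock(&big_lock);
	return 0;
}
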
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
index c56d5d44c53b3f11725b6d6da220ea2c440fe496..b72a753eb41dc3031608269c56434ed507b96f5f 100644 (file)
@@ -145,10 +145,20 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
        int ret, size;
        int i;
 
+       /*
+        * the various IPoIB tasks assume they will never race against
+        * themselves, so always use a single-threaded workqueue
+        */
+       priv->wq = create_singlethread_workqueue("ipoib_wq");
+       if (!priv->wq) {
+               printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
+               return -ENODEV;
+       }
+
        priv->pd = ib_alloc_pd(priv->ca);
        if (IS_ERR(priv->pd)) {
                printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
-               return -ENODEV;
+               goto out_free_wq;
        }
 
        priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
@@ -242,6 +252,10 @@ out_free_mr:
 
 out_free_pd:
        ib_dealloc_pd(priv->pd);
+
+out_free_wq:
+       destroy_workqueue(priv->wq);
+       priv->wq = NULL;
        return -ENODEV;
 }
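
Note the unwind order introduced here: priv->wq is created before the PD, so a later failure jumps to the new out_free_wq label and resources are released in reverse acquisition order. A tiny stand-alone sketch of that goto-ladder error handling (malloc stands in for the real allocators):

#include <stdlib.h>

static int dev_init(void)
{
	void *wq, *pd;

	wq = malloc(64);		/* create_singlethread_workqueue() stand-in */
	if (!wq)
		return -1;

	pd = malloc(64);		/* ib_alloc_pd() stand-in */
	if (!pd)
		goto out_free_wq;

	/* ... further setup would go here ... */
	free(pd);
	free(wq);
	return 0;

out_free_wq:
	free(wq);			/* reverse of acquisition order */
	return -1;
}

int main(void)
{
	return dev_init() ? 1 : 0;
}
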
 
@@ -270,6 +284,12 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
 
        if (ib_dealloc_pd(priv->pd))
                ipoib_warn(priv, "ib_dealloc_pd failed\n");
+
+       if (priv->wq) {
+               flush_workqueue(priv->wq);
+               destroy_workqueue(priv->wq);
+               priv->wq = NULL;
+       }
 }
 
 void ipoib_event(struct ib_event_handler *handler,
index 20ca6a619476f065af6dbef57905897f717faf54..6a594aac229008418f388433107a959186f96867 100644 (file)
@@ -97,7 +97,7 @@ module_param_named(pi_enable, iser_pi_enable, bool, 0644);
 MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
 
 module_param_named(pi_guard, iser_pi_guard, int, 0644);
-MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:IP_CSUM)");
+MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]");
 
 static struct workqueue_struct *release_wq;
 struct iser_global ig;
@@ -164,18 +164,42 @@ iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
        return 0;
 }
 
-int iser_initialize_task_headers(struct iscsi_task *task,
-                                               struct iser_tx_desc *tx_desc)
+/**
+ * iser_initialize_task_headers() - Initialize task headers
+ * @task:       iscsi task
+ * @tx_desc:    iser tx descriptor
+ *
+ * Notes:
+ * This routine may race with iser teardown flow for scsi
+ * error handling TMFs. So for TMF we should acquire the
+ * state mutex to avoid dereferencing the IB device which
+ * may have already been terminated.
+ */
+int
+iser_initialize_task_headers(struct iscsi_task *task,
+                            struct iser_tx_desc *tx_desc)
 {
-       struct iser_conn       *iser_conn   = task->conn->dd_data;
+       struct iser_conn *iser_conn = task->conn->dd_data;
        struct iser_device *device = iser_conn->ib_conn.device;
        struct iscsi_iser_task *iser_task = task->dd_data;
        u64 dma_addr;
+       const bool mgmt_task = !task->sc && !in_interrupt();
+       int ret = 0;
+
+       if (unlikely(mgmt_task))
+               mutex_lock(&iser_conn->state_mutex);
+
+       if (unlikely(iser_conn->state != ISER_CONN_UP)) {
+               ret = -ENODEV;
+               goto out;
+       }
 
        dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
                                ISER_HEADERS_LEN, DMA_TO_DEVICE);
-       if (ib_dma_mapping_error(device->ib_device, dma_addr))
-               return -ENOMEM;
+       if (ib_dma_mapping_error(device->ib_device, dma_addr)) {
+               ret = -ENOMEM;
+               goto out;
+       }
 
        tx_desc->dma_addr = dma_addr;
        tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
@@ -183,7 +207,11 @@ int iser_initialize_task_headers(struct iscsi_task *task,
        tx_desc->tx_sg[0].lkey   = device->mr->lkey;
 
        iser_task->iser_conn = iser_conn;
-       return 0;
+out:
+       if (unlikely(mgmt_task))
+               mutex_unlock(&iser_conn->state_mutex);
+
+       return ret;
 }
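
The mgmt_task test above decides whether locking is both needed and legal: TMF/management PDUs (no scsi_cmnd, process context) must serialize against teardown via state_mutex, while fast-path tasks can arrive in contexts that may not sleep. A condensed sketch of that conditional-locking shape (names illustrative, not the driver's):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
static int conn_state;			/* 0 means "up" in this sketch */

static int init_task_headers(bool mgmt_task)
{
	int ret = 0;

	if (mgmt_task)
		pthread_mutex_lock(&state_mutex);

	if (conn_state != 0) {		/* teardown already started */
		ret = -1;
		goto out;
	}
	/* ... DMA-map the headers here ... */
out:
	if (mgmt_task)
		pthread_mutex_unlock(&state_mutex);
	return ret;
}

int main(void)
{
	return init_task_headers(true);
}
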
 
 /**
@@ -199,9 +227,14 @@ static int
 iscsi_iser_task_init(struct iscsi_task *task)
 {
        struct iscsi_iser_task *iser_task = task->dd_data;
+       int ret;
 
-       if (iser_initialize_task_headers(task, &iser_task->desc))
-                       return -ENOMEM;
+       ret = iser_initialize_task_headers(task, &iser_task->desc);
+       if (ret) {
+               iser_err("Failed to init task %p, err = %d\n",
+                        iser_task, ret);
+               return ret;
+       }
 
        /* mgmt task */
        if (!task->sc)
@@ -508,8 +541,8 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
         */
        if (iser_conn) {
                mutex_lock(&iser_conn->state_mutex);
-               iscsi_conn_stop(cls_conn, flag);
                iser_conn_terminate(iser_conn);
+               iscsi_conn_stop(cls_conn, flag);
 
                /* unbind */
                iser_conn->iscsi_conn = NULL;
@@ -541,12 +574,13 @@ iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
 static inline unsigned int
 iser_dif_prot_caps(int prot_caps)
 {
-       return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION |
-                                                     SHOST_DIX_TYPE1_PROTECTION : 0) |
-              ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION |
-                                                     SHOST_DIX_TYPE2_PROTECTION : 0) |
-              ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION |
-                                                     SHOST_DIX_TYPE3_PROTECTION : 0);
+       return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ?
+               SHOST_DIF_TYPE1_PROTECTION | SHOST_DIX_TYPE0_PROTECTION |
+               SHOST_DIX_TYPE1_PROTECTION : 0) |
+              ((prot_caps & IB_PROT_T10DIF_TYPE_2) ?
+               SHOST_DIF_TYPE2_PROTECTION | SHOST_DIX_TYPE2_PROTECTION : 0) |
+              ((prot_caps & IB_PROT_T10DIF_TYPE_3) ?
+               SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE3_PROTECTION : 0);
 }
 
 /**
@@ -569,6 +603,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
        struct Scsi_Host *shost;
        struct iser_conn *iser_conn = NULL;
        struct ib_conn *ib_conn;
+       u16 max_cmds;
 
        shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
        if (!shost)
@@ -586,26 +621,41 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
         */
        if (ep) {
                iser_conn = ep->dd_data;
+               max_cmds = iser_conn->max_cmds;
+
+               mutex_lock(&iser_conn->state_mutex);
+               if (iser_conn->state != ISER_CONN_UP) {
+                       iser_err("iser conn %p already started teardown\n",
+                                iser_conn);
+                       mutex_unlock(&iser_conn->state_mutex);
+                       goto free_host;
+               }
+
                ib_conn = &iser_conn->ib_conn;
                if (ib_conn->pi_support) {
                        u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap;
 
                        scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps));
-                       if (iser_pi_guard)
-                               scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP);
-                       else
-                               scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC);
+                       scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP |
+                                                  SHOST_DIX_GUARD_CRC);
                }
-       }
 
-       if (iscsi_host_add(shost, ep ?
-                          ib_conn->device->ib_device->dma_device : NULL))
-               goto free_host;
+               if (iscsi_host_add(shost,
+                                  ib_conn->device->ib_device->dma_device)) {
+                       mutex_unlock(&iser_conn->state_mutex);
+                       goto free_host;
+               }
+               mutex_unlock(&iser_conn->state_mutex);
+       } else {
+               max_cmds = ISER_DEF_XMIT_CMDS_MAX;
+               if (iscsi_host_add(shost, NULL))
+                       goto free_host;
+       }
 
-       if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) {
+       if (cmds_max > max_cmds) {
                iser_info("cmds_max changed from %u to %u\n",
-                         cmds_max, ISER_DEF_XMIT_CMDS_MAX);
-               cmds_max = ISER_DEF_XMIT_CMDS_MAX;
+                         cmds_max, max_cmds);
+               cmds_max = max_cmds;
        }
 
        cls_session = iscsi_session_setup(&iscsi_iser_transport, shost,
index cd4174ca9a760dc70c0c369721407e1c2919a4c4..5ce26817e7e1d9b8d43126518188b3769e0c7f44 100644 (file)
 
 #define DRV_NAME       "iser"
 #define PFX            DRV_NAME ": "
-#define DRV_VER                "1.4.8"
+#define DRV_VER                "1.5"
 
 #define iser_dbg(fmt, arg...)                           \
        do {                                             \
-               if (iser_debug_level > 2)                \
+               if (unlikely(iser_debug_level > 2))      \
                        printk(KERN_DEBUG PFX "%s: " fmt,\
                                __func__ , ## arg);      \
        } while (0)
 
 #define iser_warn(fmt, arg...)                         \
        do {                                            \
-               if (iser_debug_level > 0)               \
+               if (unlikely(iser_debug_level > 0))     \
                        pr_warn(PFX "%s: " fmt,         \
                                __func__ , ## arg);     \
        } while (0)
 
 #define iser_info(fmt, arg...)                         \
        do {                                            \
-               if (iser_debug_level > 1)               \
+               if (unlikely(iser_debug_level > 1))     \
                        pr_info(PFX "%s: " fmt,         \
                                __func__ , ## arg);     \
        } while (0)
 
-#define iser_err(fmt, arg...)                          \
-       do {                                            \
-               printk(KERN_ERR PFX "%s: " fmt,         \
-                      __func__ , ## arg);              \
-       } while (0)
+#define iser_err(fmt, arg...) \
+       pr_err(PFX "%s: " fmt, __func__ , ## arg)
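
The conditional macros above wrap their bodies in do { } while (0) so they expand safely under an un-braced if/else, and the new unlikely() hints keep the disabled-by-default branches off the hot path. The same idiom outside the kernel:

#include <stdio.h>

#define unlikely(x)	__builtin_expect(!!(x), 0)	/* as the kernel defines it */

static int debug_level;

/* do { } while (0) makes the macro a single statement */
#define my_dbg(fmt, ...)					\
	do {							\
		if (unlikely(debug_level > 2))			\
			printf("%s: " fmt, __func__,		\
			       ##__VA_ARGS__);			\
	} while (0)

int main(void)
{
	if (debug_level >= 0)
		my_dbg("silent at level %d\n", debug_level);	/* safe in un-braced if */
	else
		printf("never\n");
	return 0;
}
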
 
 #define SHIFT_4K       12
 #define SIZE_4K        (1ULL << SHIFT_4K)
                                        ISER_MAX_TX_MISC_PDUS         + \
                                        ISER_MAX_RX_MISC_PDUS)
 
+#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr                      \
+                                        - ISER_MAX_TX_MISC_PDUS        \
+                                        - ISER_MAX_RX_MISC_PDUS) /     \
+                                        (1 + ISER_INFLIGHT_DATAOUTS))
+
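+
+ISER_GET_MAX_XMIT_CMDS() back-computes how many SCSI commands a send queue of a given depth can carry once the miscellaneous TX/RX PDUs are set aside, budgeting each command one send plus its in-flight DataOUTs. A quick sanity check, with the three constants assumed for illustration (their real values live elsewhere in this header):

#include <stdio.h>

/* assumed values, for illustration only */
#define ISER_MAX_TX_MISC_PDUS	6
#define ISER_MAX_RX_MISC_PDUS	4
#define ISER_INFLIGHT_DATAOUTS	8

#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr			\
					 - ISER_MAX_TX_MISC_PDUS	\
					 - ISER_MAX_RX_MISC_PDUS) /	\
					 (1 + ISER_INFLIGHT_DATAOUTS))

int main(void)
{
	printf("%d\n", ISER_GET_MAX_XMIT_CMDS(4096));	/* (4096 - 10) / 9 = 454 */
	return 0;
}
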
 #define ISER_WC_BATCH_COUNT   16
 #define ISER_SIGNAL_CMD_COUNT 32
 
@@ -247,7 +249,6 @@ struct iscsi_endpoint;
  * @va:           MR start address (buffer va)
  * @len:          MR length
  * @mem_h:        pointer to registration context (FMR/Fastreg)
- * @is_mr:        indicates weather we registered the buffer
  */
 struct iser_mem_reg {
        u32  lkey;
@@ -255,7 +256,6 @@ struct iser_mem_reg {
        u64  va;
        u64  len;
        void *mem_h;
-       int  is_mr;
 };
 
 /**
@@ -323,8 +323,6 @@ struct iser_rx_desc {
        char                         pad[ISER_RX_PAD_SIZE];
 } __attribute__((packed));
 
-#define ISER_MAX_CQ 4
-
 struct iser_conn;
 struct ib_conn;
 struct iscsi_iser_task;
@@ -375,7 +373,7 @@ struct iser_device {
        struct list_head             ig_list;
        int                          refcount;
        int                          comps_used;
-       struct iser_comp             comps[ISER_MAX_CQ];
+       struct iser_comp             *comps;
        int                          (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn,
                                                                unsigned cmds_max);
        void                         (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn);
@@ -432,6 +430,7 @@ struct fast_reg_descriptor {
  * @cma_id:              rdma_cm connection manager handle
  * @qp:                  Connection Queue-pair
  * @post_recv_buf_count: post receive counter
+ * @sig_count:           send work request signal count
  * @rx_wr:               receive work request for batch posts
  * @device:              reference to iser device
  * @comp:                iser completion context
@@ -452,6 +451,7 @@ struct ib_conn {
        struct rdma_cm_id           *cma_id;
        struct ib_qp                *qp;
        int                          post_recv_buf_count;
+       u8                           sig_count;
        struct ib_recv_wr            rx_wr[ISER_MIN_POSTED_RX];
        struct iser_device          *device;
        struct iser_comp            *comp;
@@ -482,6 +482,7 @@ struct ib_conn {
  *                    to max number of post recvs
  * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1)
  * @min_posted_rx:    (qp_max_recv_dtos >> 2)
+ * @max_cmds:         maximum cmds allowed for this connection
  * @name:             connection peer portal
  * @release_work:     deferred work for release job
  * @state_mutex:      protects iser connection state
@@ -507,6 +508,7 @@ struct iser_conn {
        unsigned                     qp_max_recv_dtos;
        unsigned                     qp_max_recv_dtos_mask;
        unsigned                     min_posted_rx;
+       u16                          max_cmds;
        char                         name[ISER_OBJECT_NAME_SIZE];
        struct work_struct           release_work;
        struct mutex                 state_mutex;
index 5a489ea63732c0166b695ce21fe16c6bb3206413..3821633f1065b15a9fcc1b1c36a74eda619aeae3 100644 (file)
@@ -369,7 +369,7 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
        return 0;
 }
 
-static inline bool iser_signal_comp(int sig_count)
+static inline bool iser_signal_comp(u8 sig_count)
 {
        return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0);
 }
@@ -388,7 +388,7 @@ int iser_send_command(struct iscsi_conn *conn,
        struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr;
        struct scsi_cmnd *sc  =  task->sc;
        struct iser_tx_desc *tx_desc = &iser_task->desc;
-       static unsigned sig_count;
+       u8 sig_count = ++iser_conn->ib_conn.sig_count;
 
        edtl = ntohl(hdr->data_length);
 
@@ -435,7 +435,7 @@ int iser_send_command(struct iscsi_conn *conn,
        iser_task->status = ISER_TASK_STATUS_STARTED;
 
        err = iser_post_send(&iser_conn->ib_conn, tx_desc,
-                            iser_signal_comp(++sig_count));
+                            iser_signal_comp(sig_count));
        if (!err)
                return 0;
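
With sig_count moved from a function-static into the per-connection ib_conn, two connections no longer race on one counter, and each still requests a signaled completion only every ISER_SIGNAL_CMD_COUNT sends to bound interrupt load. The arithmetic in isolation (batch size taken from iscsi_iser.h above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ISER_SIGNAL_CMD_COUNT 32	/* from iscsi_iser.h above */

static bool signal_comp(uint8_t sig_count)
{
	return (sig_count % ISER_SIGNAL_CMD_COUNT) == 0;
}

int main(void)
{
	uint8_t sig_count = 0;
	int i, signaled = 0;

	for (i = 0; i < 256; i++)
		if (signal_comp(++sig_count))
			signaled++;
	printf("%d of 256 sends signaled\n", signaled);	/* 8 */
	return 0;
}
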
 
index 6c5ce357fba6baa19fc62d39bd4c9b79be8c48a2..abce9339333f0a8551a2e52600d2409edd39461c 100644 (file)
@@ -73,7 +73,6 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
 
        if (cmd_dir == ISER_DIR_OUT) {
                /* copy the unaligned sg the buffer which is used for RDMA */
-               int i;
                char *p, *from;
 
                sgl = (struct scatterlist *)data->buf;
@@ -409,7 +408,6 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
                regd_buf->reg.rkey = device->mr->rkey;
                regd_buf->reg.len  = ib_sg_dma_len(ibdev, &sg[0]);
                regd_buf->reg.va   = ib_sg_dma_address(ibdev, &sg[0]);
-               regd_buf->reg.is_mr = 0;
 
                iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X  "
                         "va: 0x%08lX sz: %ld]\n",
@@ -440,13 +438,13 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
        return 0;
 }
 
-static inline void
+static void
 iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
                    struct ib_sig_domain *domain)
 {
        domain->sig_type = IB_SIG_TYPE_T10_DIF;
-       domain->sig.dif.pi_interval = sc->device->sector_size;
-       domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff;
+       domain->sig.dif.pi_interval = scsi_prot_interval(sc);
+       domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc);
        /*
         * At the moment we hard code those, but in the future
         * we will take them from sc.
@@ -454,8 +452,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
        domain->sig.dif.apptag_check_mask = 0xffff;
        domain->sig.dif.app_escape = true;
        domain->sig.dif.ref_escape = true;
-       if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 ||
-           scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2)
+       if (sc->prot_flags & SCSI_PROT_REF_INCREMENT)
                domain->sig.dif.ref_remap = true;
 };
 
@@ -473,26 +470,16 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
        case SCSI_PROT_WRITE_STRIP:
                sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
                iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
-               /*
-                * At the moment we use this modparam to tell what is
-                * the memory bg_type, in the future we will take it
-                * from sc.
-                */
-               sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
-                                                IB_T10DIF_CRC;
+               sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
+                                               IB_T10DIF_CSUM : IB_T10DIF_CRC;
                break;
        case SCSI_PROT_READ_PASS:
        case SCSI_PROT_WRITE_PASS:
                iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
                sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
                iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
-               /*
-                * At the moment we use this modparam to tell what is
-                * the memory bg_type, in the future we will take it
-                * from sc.
-                */
-               sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
-                                                IB_T10DIF_CRC;
+               sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
+                                               IB_T10DIF_CSUM : IB_T10DIF_CRC;
                break;
        default:
                iser_err("Unsupported PI operation %d\n",
@@ -503,26 +490,28 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
        return 0;
 }
 
-static int
+static inline void
 iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
 {
-       switch (scsi_get_prot_type(sc)) {
-       case SCSI_PROT_DIF_TYPE0:
-               break;
-       case SCSI_PROT_DIF_TYPE1:
-       case SCSI_PROT_DIF_TYPE2:
-               *mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG;
-               break;
-       case SCSI_PROT_DIF_TYPE3:
-               *mask = ISER_CHECK_GUARD;
-               break;
-       default:
-               iser_err("Unsupported protection type %d\n",
-                        scsi_get_prot_type(sc));
-               return -EINVAL;
-       }
+       *mask = 0;
+       if (sc->prot_flags & SCSI_PROT_REF_CHECK)
+               *mask |= ISER_CHECK_REFTAG;
+       if (sc->prot_flags & SCSI_PROT_GUARD_CHECK)
+               *mask |= ISER_CHECK_GUARD;
+}
 
-       return 0;
+static void
+iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
+{
+       u32 rkey;
+
+       memset(inv_wr, 0, sizeof(*inv_wr));
+       inv_wr->opcode = IB_WR_LOCAL_INV;
+       inv_wr->wr_id = ISER_FASTREG_LI_WRID;
+       inv_wr->ex.invalidate_rkey = mr->rkey;
+
+       rkey = ib_inc_rkey(mr->rkey);
+       ib_update_fast_reg_key(mr, rkey);
 }
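
iser_inv_rkey() folds the repeated LOCAL_INV setup and key bump into one helper; ib_inc_rkey() advances only the low eight "variant" bits of the rkey, so a stale remote key can no longer match the re-registered MR. A user-space model of that bump (mirroring the helper's documented behavior; treat it as illustrative):

#include <stdint.h>
#include <stdio.h>

/* Advance only the variant (low 8) bits, leaving the MR index intact. */
static uint32_t inc_rkey(uint32_t rkey)
{
	const uint32_t mask = 0x000000ff;

	return ((rkey + 1) & mask) | (rkey & ~mask);
}

int main(void)
{
	printf("0x%08x\n", inc_rkey(0x12345600));	/* 0x12345601 */
	printf("0x%08x\n", inc_rkey(0x123456ff));	/* 0x12345600: wraps in place */
	return 0;
}
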
 
 static int
@@ -536,26 +525,17 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
        struct ib_send_wr *bad_wr, *wr = NULL;
        struct ib_sig_attrs sig_attrs;
        int ret;
-       u32 key;
 
        memset(&sig_attrs, 0, sizeof(sig_attrs));
        ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs);
        if (ret)
                goto err;
 
-       ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
-       if (ret)
-               goto err;
+       iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
 
        if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) {
-               memset(&inv_wr, 0, sizeof(inv_wr));
-               inv_wr.opcode = IB_WR_LOCAL_INV;
-               inv_wr.wr_id = ISER_FASTREG_LI_WRID;
-               inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey;
+               iser_inv_rkey(&inv_wr, pi_ctx->sig_mr);
                wr = &inv_wr;
-               /* Bump the key */
-               key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF);
-               ib_update_fast_reg_key(pi_ctx->sig_mr, ++key);
        }
 
        memset(&sig_wr, 0, sizeof(sig_wr));
@@ -585,12 +565,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
 
        sig_sge->lkey = pi_ctx->sig_mr->lkey;
        sig_sge->addr = 0;
-       sig_sge->length = data_sge->length + prot_sge->length;
-       if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT ||
-           scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) {
-               sig_sge->length += (data_sge->length /
-                                  iser_task->sc->device->sector_size) * 8;
-       }
+       sig_sge->length = scsi_transfer_length(iser_task->sc);
 
        iser_dbg("sig_sge: addr: 0x%llx  length: %u lkey: 0x%x\n",
                 sig_sge->addr, sig_sge->length,
@@ -613,7 +588,6 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
        struct ib_fast_reg_page_list *frpl;
        struct ib_send_wr fastreg_wr, inv_wr;
        struct ib_send_wr *bad_wr, *wr = NULL;
-       u8 key;
        int ret, offset, size, plen;
 
        /* if there a single dma entry, dma mr suffices */
@@ -645,14 +619,8 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
        }
 
        if (!(desc->reg_indicators & ind)) {
-               memset(&inv_wr, 0, sizeof(inv_wr));
-               inv_wr.wr_id = ISER_FASTREG_LI_WRID;
-               inv_wr.opcode = IB_WR_LOCAL_INV;
-               inv_wr.ex.invalidate_rkey = mr->rkey;
+               iser_inv_rkey(&inv_wr, mr);
                wr = &inv_wr;
-               /* Bump the key */
-               key = (u8)(mr->rkey & 0x000000FF);
-               ib_update_fast_reg_key(mr, ++key);
        }
 
        /* Prepare FASTREG WR */
@@ -770,15 +738,11 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
                regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey;
                regd_buf->reg.va = sig_sge.addr;
                regd_buf->reg.len = sig_sge.length;
-               regd_buf->reg.is_mr = 1;
        } else {
-               if (desc) {
+               if (desc)
                        regd_buf->reg.rkey = desc->data_mr->rkey;
-                       regd_buf->reg.is_mr = 1;
-               } else {
+               else
                        regd_buf->reg.rkey = device->mr->rkey;
-                       regd_buf->reg.is_mr = 0;
-               }
 
                regd_buf->reg.lkey = data_sge.lkey;
                regd_buf->reg.va = data_sge.addr;
index 67225bb82bb50bc4b35ac8f350a750e5e3d96797..695a2704bd4380acafbc04c4ae7f0ecd96bc95a0 100644 (file)
@@ -76,7 +76,7 @@ static void iser_event_handler(struct ib_event_handler *handler,
 static int iser_create_device_ib_res(struct iser_device *device)
 {
        struct ib_device_attr *dev_attr = &device->dev_attr;
-       int ret, i;
+       int ret, i, max_cqe;
 
        ret = ib_query_device(device->ib_device, dev_attr);
        if (ret) {
@@ -104,11 +104,19 @@ static int iser_create_device_ib_res(struct iser_device *device)
                return -1;
        }
 
-       device->comps_used = min(ISER_MAX_CQ,
+       device->comps_used = min_t(int, num_online_cpus(),
                                 device->ib_device->num_comp_vectors);
-       iser_info("using %d CQs, device %s supports %d vectors\n",
+
+       device->comps = kcalloc(device->comps_used, sizeof(*device->comps),
+                               GFP_KERNEL);
+       if (!device->comps)
+               goto comps_err;
+
+       max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe);
+
+       iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n",
                  device->comps_used, device->ib_device->name,
-                 device->ib_device->num_comp_vectors);
+                 device->ib_device->num_comp_vectors, max_cqe);
 
        device->pd = ib_alloc_pd(device->ib_device);
        if (IS_ERR(device->pd))
@@ -122,7 +130,7 @@ static int iser_create_device_ib_res(struct iser_device *device)
                                        iser_cq_callback,
                                        iser_cq_event_callback,
                                        (void *)comp,
-                                       ISER_MAX_CQ_LEN, i);
+                                       max_cqe, i);
                if (IS_ERR(comp->cq)) {
                        comp->cq = NULL;
                        goto cq_err;
@@ -162,6 +170,8 @@ cq_err:
        }
        ib_dealloc_pd(device->pd);
 pd_err:
+       kfree(device->comps);
+comps_err:
        iser_err("failed to allocate an IB resource\n");
        return -1;
 }
@@ -187,6 +197,9 @@ static void iser_free_device_ib_res(struct iser_device *device)
        (void)ib_dereg_mr(device->mr);
        (void)ib_dealloc_pd(device->pd);
 
+       kfree(device->comps);
+       device->comps = NULL;
+
        device->mr = NULL;
        device->pd = NULL;
 }
@@ -425,7 +438,10 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
  */
 static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
 {
+       struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
+                                                  ib_conn);
        struct iser_device      *device;
+       struct ib_device_attr *dev_attr;
        struct ib_qp_init_attr  init_attr;
        int                     ret = -ENOMEM;
        int index, min_index = 0;
@@ -433,6 +449,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
        BUG_ON(ib_conn->device == NULL);
 
        device = ib_conn->device;
+       dev_attr = &device->dev_attr;
 
        memset(&init_attr, 0, sizeof init_attr);
 
@@ -460,8 +477,20 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
        if (ib_conn->pi_support) {
                init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
                init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+               iser_conn->max_cmds =
+                       ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
        } else {
-               init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS + 1;
+               if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
+                       init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS + 1;
+                       iser_conn->max_cmds =
+                               ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
+               } else {
+                       init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
+                       iser_conn->max_cmds =
+                               ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
+                       iser_dbg("device %s supports max_send_wr %d\n",
+                                device->ib_device->name, dev_attr->max_qp_wr);
+               }
        }
 
        ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
@@ -475,7 +504,11 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
        return ret;
 
 out_err:
+       mutex_lock(&ig.connlist_mutex);
+       ib_conn->comp->active_qps--;
+       mutex_unlock(&ig.connlist_mutex);
        iser_err("unable to alloc mem or create resource, err %d\n", ret);
+
        return ret;
 }
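
The new else branch caps max_send_wr at the HCA's reported dev_attr.max_qp_wr instead of failing QP creation on devices with shallow queues, then feeds whatever depth was granted back into ISER_GET_MAX_XMIT_CMDS() so max_cmds matches reality. The clamp in isolation (queue target assumed; names illustrative):

#include <stdio.h>

#define ISER_QP_MAX_REQ_DTOS 4096	/* assumed send-queue target */

static int pick_max_send_wr(int max_qp_wr)
{
	return max_qp_wr > ISER_QP_MAX_REQ_DTOS ?
	       ISER_QP_MAX_REQ_DTOS + 1 : max_qp_wr;
}

int main(void)
{
	printf("%d\n", pick_max_send_wr(16384));	/* deep HCA: 4097 */
	printf("%d\n", pick_max_send_wr(1024));		/* shallow HCA: 1024 */
	return 0;
}
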
 
@@ -610,9 +643,11 @@ void iser_conn_release(struct iser_conn *iser_conn)
        mutex_unlock(&ig.connlist_mutex);
 
        mutex_lock(&iser_conn->state_mutex);
-       if (iser_conn->state != ISER_CONN_DOWN)
+       if (iser_conn->state != ISER_CONN_DOWN) {
                iser_warn("iser conn %p state %d, expected state down.\n",
                          iser_conn, iser_conn->state);
+               iser_conn->state = ISER_CONN_DOWN;
+       }
        /*
         * In case we never got to bind stage, we still need to
         * release IB resources (which is safe to call more than once).
@@ -662,8 +697,10 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
 
                /* post an indication that all flush errors were consumed */
                err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr);
-               if (err)
+               if (err) {
                        iser_err("conn %p failed to post beacon", ib_conn);
+                       return 1;
+               }
 
                wait_for_completion(&ib_conn->flush_comp);
        }
@@ -846,20 +883,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
                break;
        case RDMA_CM_EVENT_DISCONNECTED:
        case RDMA_CM_EVENT_ADDR_CHANGE:
-               iser_disconnected_handler(cma_id);
+       case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+               iser_cleanup_handler(cma_id, false);
                break;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
                /*
                 * we *must* destroy the device as we cannot rely
                 * on iscsid to be around to initiate error handling.
-                * also implicitly destroy the cma_id.
+                * also, if we are not in state DOWN, implicitly destroy
+                * the cma_id.
                 */
                iser_cleanup_handler(cma_id, true);
-               iser_conn->ib_conn.cma_id = NULL;
-               ret = 1;
-               break;
-       case RDMA_CM_EVENT_TIMEWAIT_EXIT:
-               iser_cleanup_handler(cma_id, false);
+               if (iser_conn->state != ISER_CONN_DOWN) {
+                       iser_conn->ib_conn.cma_id = NULL;
+                       ret = 1;
+               }
                break;
        default:
                iser_err("Unexpected RDMA CM event (%d)\n", event->event);
@@ -981,7 +1019,6 @@ int iser_reg_page_vec(struct ib_conn *ib_conn,
        mem_reg->rkey  = mem->fmr->rkey;
        mem_reg->len   = page_vec->length * SIZE_4K;
        mem_reg->va    = io_addr;
-       mem_reg->is_mr = 1;
        mem_reg->mem_h = (void *)mem;
 
        mem_reg->va   += page_vec->offset;
@@ -1008,7 +1045,7 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
        struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
        int ret;
 
-       if (!reg->is_mr)
+       if (!reg->mem_h)
                return;
 
        iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h);
@@ -1028,11 +1065,10 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
        struct ib_conn *ib_conn = &iser_conn->ib_conn;
        struct fast_reg_descriptor *desc = reg->mem_h;
 
-       if (!reg->is_mr)
+       if (!desc)
                return;
 
        reg->mem_h = NULL;
-       reg->is_mr = 0;
        spin_lock_bh(&ib_conn->lock);
        list_add_tail(&desc->list, &ib_conn->fastreg.pool);
        spin_unlock_bh(&ib_conn->lock);
@@ -1049,7 +1085,7 @@ int iser_post_recvl(struct iser_conn *iser_conn)
        sge.length = ISER_RX_LOGIN_SIZE;
        sge.lkey   = ib_conn->device->mr->lkey;
 
-       rx_wr.wr_id   = (unsigned long)iser_conn->login_resp_buf;
+       rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
        rx_wr.sg_list = &sge;
        rx_wr.num_sge = 1;
        rx_wr.next    = NULL;
@@ -1073,7 +1109,7 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count)
 
        for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
                rx_desc         = &iser_conn->rx_descs[my_rx_head];
-               rx_wr->wr_id    = (unsigned long)rx_desc;
+               rx_wr->wr_id    = (uintptr_t)rx_desc;
                rx_wr->sg_list  = &rx_desc->rx_sg;
                rx_wr->num_sge  = 1;
                rx_wr->next     = rx_wr + 1;
@@ -1110,7 +1146,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
                                      DMA_TO_DEVICE);
 
        send_wr.next       = NULL;
-       send_wr.wr_id      = (unsigned long)tx_desc;
+       send_wr.wr_id      = (uintptr_t)tx_desc;
        send_wr.sg_list    = tx_desc->tx_sg;
        send_wr.num_sge    = tx_desc->num_sge;
        send_wr.opcode     = IB_WR_SEND;
@@ -1160,6 +1196,7 @@ static void
 iser_handle_comp_error(struct ib_conn *ib_conn,
                       struct ib_wc *wc)
 {
+       void *wr_id = (void *)(uintptr_t)wc->wr_id;
        struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
                                                   ib_conn);
 
@@ -1168,8 +1205,8 @@ iser_handle_comp_error(struct ib_conn *ib_conn,
                        iscsi_conn_failure(iser_conn->iscsi_conn,
                                           ISCSI_ERR_CONN_FAILED);
 
-       if (is_iser_tx_desc(iser_conn, (void *)wc->wr_id)) {
-               struct iser_tx_desc *desc = (struct iser_tx_desc *)wc->wr_id;
+       if (is_iser_tx_desc(iser_conn, wr_id)) {
+               struct iser_tx_desc *desc = wr_id;
 
                if (desc->type == ISCSI_TX_DATAOUT)
                        kmem_cache_free(ig.desc_cache, desc);
@@ -1193,14 +1230,14 @@ static void iser_handle_wc(struct ib_wc *wc)
        struct iser_rx_desc *rx_desc;
 
        ib_conn = wc->qp->qp_context;
-       if (wc->status == IB_WC_SUCCESS) {
+       if (likely(wc->status == IB_WC_SUCCESS)) {
                if (wc->opcode == IB_WC_RECV) {
-                       rx_desc = (struct iser_rx_desc *)wc->wr_id;
+                       rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
                        iser_rcv_completion(rx_desc, wc->byte_len,
                                            ib_conn);
                } else
                if (wc->opcode == IB_WC_SEND) {
-                       tx_desc = (struct iser_tx_desc *)wc->wr_id;
+                       tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
                        iser_snd_completion(tx_desc, ib_conn);
                } else {
                        iser_err("Unknown wc opcode %d\n", wc->opcode);
index 5461924c9f10ef9311a8f11a5091170c4597281a..db3c8c851af16cd22524f080d2db91581621b020 100644 (file)
@@ -2929,7 +2929,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
                return -ENOMEM;
 
        sep_opt = options;
-       while ((p = strsep(&sep_opt, ",")) != NULL) {
+       while ((p = strsep(&sep_opt, ",\n")) != NULL) {
                if (!*p)
                        continue;
 
index b935bf3d0bb347350bcd5479462b60eced814d5d..943cbd47d832bb98719e355a727e8451c68bfbbe 100644 (file)
@@ -171,9 +171,9 @@ int mlx4_check_port_params(struct mlx4_dev *dev,
 {
        int i;
 
-       for (i = 0; i < dev->caps.num_ports - 1; i++) {
-               if (port_type[i] != port_type[i + 1]) {
-                       if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+       if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+               for (i = 0; i < dev->caps.num_ports - 1; i++) {
+                       if (port_type[i] != port_type[i + 1]) {
                                mlx4_err(dev, "Only same port types supported on this HCA, aborting\n");
                                return -EINVAL;
                        }
index ab684463780b4fbd193b79eea5299a743e7a36cd..da82991239a8cb0df3f74e1a14d36037ed2d9a18 100644 (file)
@@ -157,6 +157,8 @@ static const char *eqe_type_str(u8 type)
                return "MLX5_EVENT_TYPE_CMD";
        case MLX5_EVENT_TYPE_PAGE_REQUEST:
                return "MLX5_EVENT_TYPE_PAGE_REQUEST";
+       case MLX5_EVENT_TYPE_PAGE_FAULT:
+               return "MLX5_EVENT_TYPE_PAGE_FAULT";
        default:
                return "Unrecognized event";
        }
@@ -279,6 +281,11 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
                        }
                        break;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+               case MLX5_EVENT_TYPE_PAGE_FAULT:
+                       mlx5_eq_pagefault(dev, eqe);
+                       break;
+#endif
 
                default:
                        mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n",
@@ -446,8 +453,12 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev)
 int mlx5_start_eqs(struct mlx5_core_dev *dev)
 {
        struct mlx5_eq_table *table = &dev->priv.eq_table;
+       u32 async_event_mask = MLX5_ASYNC_EVENT_MASK;
        int err;
 
+       if (dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
+               async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT);
+
        err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
                                 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
                                 "mlx5_cmd_eq", &dev->priv.uuari.uars[0]);
@@ -459,7 +470,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
        mlx5_cmd_use_events(dev);
 
        err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
-                                MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK,
+                                MLX5_NUM_ASYNC_EQE, async_event_mask,
                                 "mlx5_async_eq", &dev->priv.uuari.uars[0]);
        if (err) {
                mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
index 087c4c797debdc90a7cec13aacf3b78f5cf04190..06f9036acd836196424b294ea47ce4dc624cbe09 100644 (file)
@@ -69,6 +69,46 @@ int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps)
        return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR);
 }
 
+int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *caps)
+{
+       u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)];
+       int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+       void *out;
+       int err;
+
+       if (!(dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
+               return -ENOTSUPP;
+
+       memset(in, 0, sizeof(in));
+       out = kzalloc(out_sz, GFP_KERNEL);
+       if (!out)
+               return -ENOMEM;
+       MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+       MLX5_SET(query_hca_cap_in, in, op_mod, HCA_CAP_OPMOD_GET_ODP_CUR);
+       err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
+       if (err)
+               goto out;
+
+       err = mlx5_cmd_status_to_err_v2(out);
+       if (err) {
+               mlx5_core_warn(dev, "query cur hca ODP caps failed, %d\n", err);
+               goto out;
+       }
+
+       memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct),
+              sizeof(*caps));
+
+       mlx5_core_dbg(dev, "on-demand paging capabilities:\nrc: %08x\nuc: %08x\nud: %08x\n",
+               be32_to_cpu(caps->per_transport_caps.rc_odp_caps),
+               be32_to_cpu(caps->per_transport_caps.uc_odp_caps),
+               be32_to_cpu(caps->per_transport_caps.ud_odp_caps));
+
+out:
+       kfree(out);
+       return err;
+}
+EXPORT_SYMBOL(mlx5_query_odp_caps);
+
 int mlx5_cmd_init_hca(struct mlx5_core_dev *dev)
 {
        struct mlx5_cmd_init_hca_mbox_in in;
index 5261a2b0da436b3a4359787c11dbc3686ee70689..575d853dbe05d1e9e8e3255761a673ff72dc5c30 100644 (file)
@@ -88,6 +88,95 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
        mlx5_core_put_rsc(common);
 }
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
+{
+       struct mlx5_eqe_page_fault *pf_eqe = &eqe->data.page_fault;
+       int qpn = be32_to_cpu(pf_eqe->flags_qpn) & MLX5_QPN_MASK;
+       struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, qpn);
+       struct mlx5_core_qp *qp =
+               container_of(common, struct mlx5_core_qp, common);
+       struct mlx5_pagefault pfault;
+
+       if (!qp) {
+               mlx5_core_warn(dev, "ODP event for non-existent QP %06x\n",
+                              qpn);
+               return;
+       }
+
+       pfault.event_subtype = eqe->sub_type;
+       pfault.flags = (be32_to_cpu(pf_eqe->flags_qpn) >> MLX5_QPN_BITS) &
+               (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA);
+       pfault.bytes_committed = be32_to_cpu(
+               pf_eqe->bytes_committed);
+
+       mlx5_core_dbg(dev,
+                     "PAGE_FAULT: subtype: 0x%02x, flags: 0x%02x,\n",
+                     eqe->sub_type, pfault.flags);
+
+       switch (eqe->sub_type) {
+       case MLX5_PFAULT_SUBTYPE_RDMA:
+               /* RDMA based event */
+               pfault.rdma.r_key =
+                       be32_to_cpu(pf_eqe->rdma.r_key);
+               pfault.rdma.packet_size =
+                       be16_to_cpu(pf_eqe->rdma.packet_length);
+               pfault.rdma.rdma_op_len =
+                       be32_to_cpu(pf_eqe->rdma.rdma_op_len);
+               pfault.rdma.rdma_va =
+                       be64_to_cpu(pf_eqe->rdma.rdma_va);
+               mlx5_core_dbg(dev,
+                             "PAGE_FAULT: qpn: 0x%06x, r_key: 0x%08x,\n",
+                             qpn, pfault.rdma.r_key);
+               mlx5_core_dbg(dev,
+                             "PAGE_FAULT: rdma_op_len: 0x%08x,\n",
+                             pfault.rdma.rdma_op_len);
+               mlx5_core_dbg(dev,
+                             "PAGE_FAULT: rdma_va: 0x%016llx,\n",
+                             pfault.rdma.rdma_va);
+               mlx5_core_dbg(dev,
+                             "PAGE_FAULT: bytes_committed: 0x%06x\n",
+                             pfault.bytes_committed);
+               break;
+
+       case MLX5_PFAULT_SUBTYPE_WQE:
+               /* WQE based event */
+               pfault.wqe.wqe_index =
+                       be16_to_cpu(pf_eqe->wqe.wqe_index);
+               pfault.wqe.packet_size =
+                       be16_to_cpu(pf_eqe->wqe.packet_length);
+               mlx5_core_dbg(dev,
+                             "PAGE_FAULT: qpn: 0x%06x, wqe_index: 0x%04x,\n",
+                             qpn, pfault.wqe.wqe_index);
+               mlx5_core_dbg(dev,
+                             "PAGE_FAULT: bytes_committed: 0x%06x\n",
+                             pfault.bytes_committed);
+               break;
+
+       default:
+               mlx5_core_warn(dev,
+                              "Unsupported page fault event sub-type: 0x%02hhx, QP %06x\n",
+                              eqe->sub_type, qpn);
+               /* Unsupported page faults should still be resolved by the
+                * page fault handler
+                */
+       }
+
+       if (qp->pfault_handler) {
+               qp->pfault_handler(qp, &pfault);
+       } else {
+               mlx5_core_err(dev,
+                             "ODP event for QP %08x, without a fault handler in QP\n",
+                             qpn);
+               /* Page fault will remain unresolved. QP will hang until it is
+                * destroyed
+                */
+       }
+
+       mlx5_core_put_rsc(common);
+}
+#endif
+
 int mlx5_core_create_qp(struct mlx5_core_dev *dev,
                        struct mlx5_core_qp *qp,
                        struct mlx5_create_qp_mbox_in *in,
@@ -322,3 +411,33 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn)
        return err;
 }
 EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
+                               u8 flags, int error)
+{
+       struct mlx5_page_fault_resume_mbox_in in;
+       struct mlx5_page_fault_resume_mbox_out out;
+       int err;
+
+       memset(&in, 0, sizeof(in));
+       memset(&out, 0, sizeof(out));
+       in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_PAGE_FAULT_RESUME);
+       in.hdr.opmod = 0;
+       flags &= (MLX5_PAGE_FAULT_RESUME_REQUESTOR |
+                 MLX5_PAGE_FAULT_RESUME_WRITE     |
+                 MLX5_PAGE_FAULT_RESUME_RDMA);
+       flags |= (error ? MLX5_PAGE_FAULT_RESUME_ERROR : 0);
+       in.flags_qpn = cpu_to_be32((qpn & MLX5_QPN_MASK) |
+                                  (flags << MLX5_QPN_BITS));
+       err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+       if (err)
+               return err;
+
+       if (out.hdr.status)
+               err = mlx5_cmd_status_to_err(&out.hdr);
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
+#endif
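
Both mlx5_eq_pagefault() above and mlx5_core_page_fault_resume() pack a 24-bit QPN and the fault flags into one 32-bit word: QPN in bits 0-23, flags shifted up by MLX5_QPN_BITS (the masks appear in qp.h further down in this diff). A self-contained round-trip of that layout:

#include <stdint.h>
#include <stdio.h>

#define MLX5_QPN_BITS	24
#define MLX5_QPN_MASK	((1u << MLX5_QPN_BITS) - 1)

static uint32_t pack_flags_qpn(uint32_t qpn, uint8_t flags)
{
	return (qpn & MLX5_QPN_MASK) | ((uint32_t)flags << MLX5_QPN_BITS);
}

int main(void)
{
	uint32_t w = pack_flags_qpn(0xabcdef, 0x5);

	printf("qpn   = 0x%06x\n", (unsigned)(w & MLX5_QPN_MASK));	/* 0xabcdef */
	printf("flags = 0x%02x\n", (unsigned)(w >> MLX5_QPN_BITS));	/* 0x05 */
	return 0;
}
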
index ea4f1c46f761127bf01c87b5d8933bd09819d1b7..4e5bd813bb9a7d9142866857ff3c4be73f956e6e 100644 (file)
@@ -119,6 +119,15 @@ enum {
        MLX5_MAX_LOG_PKEY_TABLE  = 5,
 };
 
+enum {
+       MLX5_MKEY_INBOX_PG_ACCESS = 1 << 31
+};
+
+enum {
+       MLX5_PFAULT_SUBTYPE_WQE = 0,
+       MLX5_PFAULT_SUBTYPE_RDMA = 1,
+};
+
 enum {
        MLX5_PERM_LOCAL_READ    = 1 << 2,
        MLX5_PERM_LOCAL_WRITE   = 1 << 3,
@@ -180,6 +189,19 @@ enum {
        MLX5_MKEY_MASK_FREE             = 1ull << 29,
 };
 
+enum {
+       MLX5_UMR_TRANSLATION_OFFSET_EN  = (1 << 4),
+
+       MLX5_UMR_CHECK_NOT_FREE         = (1 << 5),
+       MLX5_UMR_CHECK_FREE             = (2 << 5),
+
+       MLX5_UMR_INLINE                 = (1 << 7),
+};
+
+#define MLX5_UMR_MTT_ALIGNMENT 0x40
+#define MLX5_UMR_MTT_MASK      (MLX5_UMR_MTT_ALIGNMENT - 1)
+#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT
+
 enum mlx5_event {
        MLX5_EVENT_TYPE_COMP               = 0x0,
 
@@ -206,6 +228,8 @@ enum mlx5_event {
 
        MLX5_EVENT_TYPE_CMD                = 0x0a,
        MLX5_EVENT_TYPE_PAGE_REQUEST       = 0xb,
+
+       MLX5_EVENT_TYPE_PAGE_FAULT         = 0xc,
 };
 
 enum {
@@ -225,6 +249,7 @@ enum {
        MLX5_DEV_CAP_FLAG_APM           = 1LL << 17,
        MLX5_DEV_CAP_FLAG_ATOMIC        = 1LL << 18,
        MLX5_DEV_CAP_FLAG_BLOCK_MCAST   = 1LL << 23,
+       MLX5_DEV_CAP_FLAG_ON_DMND_PG    = 1LL << 24,
        MLX5_DEV_CAP_FLAG_CQ_MODER      = 1LL << 29,
        MLX5_DEV_CAP_FLAG_RESIZE_CQ     = 1LL << 30,
        MLX5_DEV_CAP_FLAG_DCT           = 1LL << 37,
@@ -290,6 +315,8 @@ enum {
 enum {
        HCA_CAP_OPMOD_GET_MAX   = 0,
        HCA_CAP_OPMOD_GET_CUR   = 1,
+       HCA_CAP_OPMOD_GET_ODP_MAX = 4,
+       HCA_CAP_OPMOD_GET_ODP_CUR = 5
 };
 
 struct mlx5_inbox_hdr {
@@ -319,6 +346,23 @@ struct mlx5_cmd_query_adapter_mbox_out {
        u8                      vsd_psid[16];
 };
 
+enum mlx5_odp_transport_cap_bits {
+       MLX5_ODP_SUPPORT_SEND    = 1 << 31,
+       MLX5_ODP_SUPPORT_RECV    = 1 << 30,
+       MLX5_ODP_SUPPORT_WRITE   = 1 << 29,
+       MLX5_ODP_SUPPORT_READ    = 1 << 28,
+};
+
+struct mlx5_odp_caps {
+       char reserved[0x10];
+       struct {
+               __be32                  rc_odp_caps;
+               __be32                  uc_odp_caps;
+               __be32                  ud_odp_caps;
+       } per_transport_caps;
+       char reserved2[0xe4];
+};
+
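+
+Each per-transport word in mlx5_odp_caps carries the MLX5_ODP_SUPPORT_* bits defined just above; after the be32_to_cpu() swap seen in mlx5_query_odp_caps(), callers test them as plain host-order flags. For instance:

#include <stdint.h>
#include <stdio.h>

#define MLX5_ODP_SUPPORT_SEND	(1u << 31)
#define MLX5_ODP_SUPPORT_RECV	(1u << 30)
#define MLX5_ODP_SUPPORT_WRITE	(1u << 29)
#define MLX5_ODP_SUPPORT_READ	(1u << 28)

int main(void)
{
	uint32_t rc_odp_caps = MLX5_ODP_SUPPORT_SEND | MLX5_ODP_SUPPORT_RECV;

	printf("rc send: %s\n", rc_odp_caps & MLX5_ODP_SUPPORT_SEND ? "yes" : "no");
	printf("rc read: %s\n", rc_odp_caps & MLX5_ODP_SUPPORT_READ ? "yes" : "no");
	return 0;
}
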
 struct mlx5_cmd_init_hca_mbox_in {
        struct mlx5_inbox_hdr   hdr;
        u8                      rsvd0[2];
@@ -439,6 +483,27 @@ struct mlx5_eqe_page_req {
        __be32          rsvd1[5];
 };
 
+struct mlx5_eqe_page_fault {
+       __be32 bytes_committed;
+       union {
+               struct {
+                       u16     reserved1;
+                       __be16  wqe_index;
+                       u16     reserved2;
+                       __be16  packet_length;
+                       u8      reserved3[12];
+               } __packed wqe;
+               struct {
+                       __be32  r_key;
+                       u16     reserved1;
+                       __be16  packet_length;
+                       __be32  rdma_op_len;
+                       __be64  rdma_va;
+               } __packed rdma;
+       } __packed;
+       __be32 flags_qpn;
+} __packed;
+
 union ev_data {
        __be32                          raw[7];
        struct mlx5_eqe_cmd             cmd;
@@ -450,6 +515,7 @@ union ev_data {
        struct mlx5_eqe_congestion      cong;
        struct mlx5_eqe_stall_vl        stall_vl;
        struct mlx5_eqe_page_req        req_pages;
+       struct mlx5_eqe_page_fault      page_fault;
 } __packed;
 
 struct mlx5_eqe {
@@ -776,6 +842,10 @@ struct mlx5_query_eq_mbox_out {
        struct mlx5_eq_context  ctx;
 };
 
+enum {
+       MLX5_MKEY_STATUS_FREE = 1 << 6,
+};
+
 struct mlx5_mkey_seg {
        /* This is a two bit field occupying bits 31-30.
         * bit 31 is always 0,
@@ -812,7 +882,7 @@ struct mlx5_query_special_ctxs_mbox_out {
 struct mlx5_create_mkey_mbox_in {
        struct mlx5_inbox_hdr   hdr;
        __be32                  input_mkey_index;
-       u8                      rsvd0[4];
+       __be32                  flags;
        struct mlx5_mkey_seg    seg;
        u8                      rsvd1[16];
        __be32                  xlat_oct_act_size;
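
The last hunk above turns four reserved bytes of mlx5_create_mkey_mbox_in into a big-endian flags word; MLX5_MKEY_INBOX_PG_ACCESS (bit 31, added earlier in this file) is the flag that travels in it when an ODP-capable memory key is created. A minimal stand-alone sketch of building that word, with userspace htobe32() standing in for the kernel's cpu_to_be32() and a hypothetical odp_requested decision:

        /*
         * Sketch only: the MLX5_MKEY_INBOX_PG_ACCESS value mirrors the hunk
         * above; everything else here is illustrative.
         */
        #include <stdint.h>
        #include <stdio.h>
        #include <endian.h>

        #define MLX5_MKEY_INBOX_PG_ACCESS (1u << 31)

        int main(void)
        {
                uint32_t flags = 0;
                int odp_requested = 1;  /* hypothetical caller decision */

                if (odp_requested)
                        flags |= MLX5_MKEY_INBOX_PG_ACCESS;

                /* The mailbox field is big-endian on the wire. */
                uint32_t wire = htobe32(flags);
                printf("flags on the wire: 0x%08x\n", (unsigned)be32toh(wire));
                return 0;
        }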
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b1bf41556b3245fc9970174f5be872202a58eec5..166d9315fe4b565bdd8487da66559f6cbfe19462 100644
@@ -113,6 +113,13 @@ enum {
        MLX5_REG_HOST_ENDIANNESS = 0x7004,
 };
 
+enum mlx5_page_fault_resume_flags {
+       MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0,
+       MLX5_PAGE_FAULT_RESUME_WRITE     = 1 << 1,
+       MLX5_PAGE_FAULT_RESUME_RDMA      = 1 << 2,
+       MLX5_PAGE_FAULT_RESUME_ERROR     = 1 << 7,
+};
+
 enum dbg_rsc_type {
        MLX5_DBG_RSC_QP,
        MLX5_DBG_RSC_EQ,
@@ -467,7 +474,7 @@ struct mlx5_priv {
        struct workqueue_struct *pg_wq;
        struct rb_root          page_root;
        int                     fw_pages;
-       int                     reg_pages;
+       atomic_t                reg_pages;
        struct list_head        free_list;
 
        struct mlx5_core_health health;
@@ -703,6 +710,9 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev);
 void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
 void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
 void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
+#endif
 void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type);
 struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn);
 void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector);
@@ -740,6 +750,8 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
                         int npsvs, u32 *sig_index);
 int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num);
 void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common);
+int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
+                       struct mlx5_odp_caps *odp_caps);
 
 static inline u32 mlx5_mkey_to_idx(u32 mkey)
 {
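
The reg_pages change above (int to atomic_t) reflects that MR registration and teardown paths, now including ODP invalidation, update the counter without a shared lock. A stand-alone sketch of the same accounting pattern, with C11 atomics standing in for the kernel's atomic_t helpers and hypothetical mr_* functions:

        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_int reg_pages;    /* mirrors mlx5_priv.reg_pages */

        static void mr_registered(int npages)
        {
                atomic_fetch_add(&reg_pages, npages);   /* kernel: atomic_add() */
        }

        static void mr_deregistered(int npages)
        {
                atomic_fetch_sub(&reg_pages, npages);   /* kernel: atomic_sub() */
        }

        int main(void)
        {
                mr_registered(512);
                mr_deregistered(128);
                printf("pages registered: %d\n", atomic_load(&reg_pages));
                return 0;
        }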
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 3fa075daeb1d12bbf7eb06394232018f993b32e4..61f7a342d1bfd1cc4f102d1b75be3d8a977e28d0 100644
@@ -50,6 +50,9 @@
 #define MLX5_BSF_APPTAG_ESCAPE 0x1
 #define MLX5_BSF_APPREF_ESCAPE 0x2
 
+#define MLX5_QPN_BITS          24
+#define MLX5_QPN_MASK          ((1 << MLX5_QPN_BITS) - 1)
+
 enum mlx5_qp_optpar {
        MLX5_QP_OPTPAR_ALT_ADDR_PATH            = 1 << 0,
        MLX5_QP_OPTPAR_RRE                      = 1 << 1,
@@ -189,6 +192,14 @@ struct mlx5_wqe_ctrl_seg {
        __be32                  imm;
 };
 
+#define MLX5_WQE_CTRL_DS_MASK 0x3f
+#define MLX5_WQE_CTRL_QPN_MASK 0xffffff00
+#define MLX5_WQE_CTRL_QPN_SHIFT 8
+#define MLX5_WQE_DS_UNITS 16
+#define MLX5_WQE_CTRL_OPCODE_MASK 0xff
+#define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00
+#define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8
+
 struct mlx5_wqe_xrc_seg {
        __be32                  xrc_srqn;
        u8                      rsvd[12];
@@ -292,6 +303,8 @@ struct mlx5_wqe_signature_seg {
        u8      rsvd1[11];
 };
 
+#define MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK 0x3ff
+
 struct mlx5_wqe_inline_seg {
        __be32  byte_count;
 };
@@ -360,9 +373,46 @@ struct mlx5_stride_block_ctrl_seg {
        __be16          num_entries;
 };
 
+enum mlx5_pagefault_flags {
+       MLX5_PFAULT_REQUESTOR = 1 << 0,
+       MLX5_PFAULT_WRITE     = 1 << 1,
+       MLX5_PFAULT_RDMA      = 1 << 2,
+};
+
+/* Contains the details of a pagefault. */
+struct mlx5_pagefault {
+       u32                     bytes_committed;
+       u8                      event_subtype;
+       enum mlx5_pagefault_flags flags;
+       union {
+               /* Initiator or send message responder pagefault details. */
+               struct {
+                       /* Received packet size, only valid for responders. */
+                       u32     packet_size;
+                       /*
+                        * WQE index. Refers to either the send queue or
+                        * receive queue, according to event_subtype.
+                        */
+                       u16     wqe_index;
+               } wqe;
+               /* RDMA responder pagefault details */
+               struct {
+                       u32     r_key;
+                       /*
+                        * Received packet size; the minimal page-fault
+                        * resolution required for forward progress.
+                        */
+                       u32     packet_size;
+                       u32     rdma_op_len;
+                       u64     rdma_va;
+               } rdma;
+       };
+};
+
 struct mlx5_core_qp {
        struct mlx5_core_rsc_common     common; /* must be first */
        void (*event)           (struct mlx5_core_qp *, int);
+       void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *);
        int                     qpn;
        struct mlx5_rsc_debug   *dbg;
        int                     pid;
@@ -530,6 +580,17 @@ static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u
        return radix_tree_lookup(&dev->priv.mr_table.tree, key);
 }
 
+struct mlx5_page_fault_resume_mbox_in {
+       struct mlx5_inbox_hdr   hdr;
+       __be32                  flags_qpn;
+       u8                      reserved[4];
+};
+
+struct mlx5_page_fault_resume_mbox_out {
+       struct mlx5_outbox_hdr  hdr;
+       u8                      rsvd[8];
+};
+
 int mlx5_core_create_qp(struct mlx5_core_dev *dev,
                        struct mlx5_core_qp *qp,
                        struct mlx5_create_qp_mbox_in *in,
@@ -549,6 +610,10 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev);
 void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev);
 int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
 void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
+                               u8 context, int error);
+#endif
 
 static inline const char *mlx5_qp_type_str(int type)
 {
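
The MLX5_WQE_CTRL_* masks above exist so the page-fault code can pick apart a WQE control segment that userspace wrote. A stand-alone sketch of the decoding, using the mask values from the hunk; the sample words are hypothetical and assumed already converted from __be32 with be32_to_cpu():

        #include <stdint.h>
        #include <stdio.h>

        #define MLX5_WQE_CTRL_DS_MASK         0x3f
        #define MLX5_WQE_CTRL_QPN_MASK        0xffffff00
        #define MLX5_WQE_CTRL_QPN_SHIFT       8
        #define MLX5_WQE_DS_UNITS             16
        #define MLX5_WQE_CTRL_OPCODE_MASK     0xff
        #define MLX5_WQE_CTRL_WQE_INDEX_MASK  0x00ffff00
        #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8

        int main(void)
        {
                uint32_t opmod_idx_opcode = 0x00012a08; /* hypothetical sample */
                uint32_t qpn_ds           = 0x00abcd04; /* hypothetical sample */

                unsigned opcode  = opmod_idx_opcode & MLX5_WQE_CTRL_OPCODE_MASK;
                unsigned wqe_idx = (opmod_idx_opcode & MLX5_WQE_CTRL_WQE_INDEX_MASK)
                                   >> MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
                unsigned ds      = qpn_ds & MLX5_WQE_CTRL_DS_MASK;
                unsigned qpn     = (qpn_ds & MLX5_WQE_CTRL_QPN_MASK)
                                   >> MLX5_WQE_CTRL_QPN_SHIFT;

                printf("opcode 0x%x, wqe index %u, qpn 0x%x, wqe size %u bytes\n",
                       opcode, wqe_idx, qpn, ds * MLX5_WQE_DS_UNITS);
                return 0;
        }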
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index a2bf41e0bde969c0ea0090a6af6a11087f51fa2f..2d83cfd7e6ce20da3b1606cfe4335fada792fb51 100644
 #include <linux/workqueue.h>
 
 struct ib_ucontext;
+struct ib_umem_odp;
 
 struct ib_umem {
        struct ib_ucontext     *context;
        size_t                  length;
-       int                     offset;
+       unsigned long           address;
        int                     page_size;
        int                     writable;
        int                     hugetlb;
@@ -50,17 +51,43 @@ struct ib_umem {
        struct pid             *pid;
        struct mm_struct       *mm;
        unsigned long           diff;
+       struct ib_umem_odp     *odp_data;
        struct sg_table sg_head;
        int             nmap;
        int             npages;
 };
 
+/* Returns the offset of the umem start relative to the first page. */
+static inline int ib_umem_offset(struct ib_umem *umem)
+{
+       return umem->address & ((unsigned long)umem->page_size - 1);
+}
+
+/* Returns the address of the first page of an ODP umem. */
+static inline unsigned long ib_umem_start(struct ib_umem *umem)
+{
+       return umem->address - ib_umem_offset(umem);
+}
+
+/* Returns the address of the page after the last one of an ODP umem. */
+static inline unsigned long ib_umem_end(struct ib_umem *umem)
+{
+       return PAGE_ALIGN(umem->address + umem->length);
+}
+
+static inline size_t ib_umem_num_pages(struct ib_umem *umem)
+{
+       return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT;
+}
+
 #ifdef CONFIG_INFINIBAND_USER_MEM
 
 struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
                            size_t size, int access, int dmasync);
 void ib_umem_release(struct ib_umem *umem);
 int ib_umem_page_count(struct ib_umem *umem);
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+                     size_t length);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -73,7 +100,10 @@ static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
 }
 static inline void ib_umem_release(struct ib_umem *umem) { }
 static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; }
-
+static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+                                   size_t length) {
+       return -EINVAL;
+}
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 
 #endif /* IB_UMEM_H */
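
With ib_umem now storing the full start address instead of a page offset, the new inline helpers recover the offset and the page-aligned bounds from it. A stand-alone worked example of the same arithmetic, assuming 4 KiB pages and a hypothetical mapping:

        #include <stdio.h>

        #define PAGE_SIZE     4096UL
        #define PAGE_SHIFT    12
        #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

        int main(void)
        {
                unsigned long address = 0x10234; /* hypothetical umem start */
                unsigned long length  = 0x3000;  /* 12 KiB */

                unsigned long offset = address & (PAGE_SIZE - 1);    /* 0x234   */
                unsigned long start  = address - offset;             /* 0x10000 */
                unsigned long end    = PAGE_ALIGN(address + length); /* 0x14000 */

                printf("offset=%#lx start=%#lx end=%#lx npages=%lu\n",
                       offset, start, end, (end - start) >> PAGE_SHIFT);
                return 0;
        }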
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
new file mode 100644
index 0000000..3da0b16
--- /dev/null
+++ b/include/rdma/ib_umem_odp.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_UMEM_ODP_H
+#define IB_UMEM_ODP_H
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_verbs.h>
+#include <linux/interval_tree.h>
+
+struct umem_odp_node {
+       u64 __subtree_last;
+       struct rb_node rb;
+};
+
+struct ib_umem_odp {
+       /*
+        * An array of the pages included in the on-demand paging umem.
+        * Entries for pages that are not currently mapped into the device
+        * are NULL.
+        */
+       struct page             **page_list;
+       /*
+        * An array of the same size as page_list, with DMA addresses mapped
+        * for the pages in page_list. The lower two bits designate
+        * access permissions. See ODP_READ_ALLOWED_BIT and
+        * ODP_WRITE_ALLOWED_BIT.
+        */
+       dma_addr_t              *dma_list;
+       /*
+        * The umem_mutex protects the page_list and dma_list fields of an ODP
+        * umem, allowing only a single thread to map/unmap pages. The mutex
+        * also protects access to the mmu notifier counters.
+        */
+       struct mutex            umem_mutex;
+       void                    *private; /* for the HW driver to use. */
+
+       /* When false, use the notifier counter in the ucontext struct. */
+       bool mn_counters_active;
+       int notifiers_seq;
+       int notifiers_count;
+
+       /* A linked list of umems that don't have private mmu notifier
+        * counters yet. */
+       struct list_head no_private_counters;
+       struct ib_umem          *umem;
+
+       /* Tree tracking */
+       struct umem_odp_node    interval_tree;
+
+       struct completion       notifier_completion;
+       int                     dying;
+};
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem);
+
+void ib_umem_odp_release(struct ib_umem *umem);
+
+/*
+ * The lower 2 bits of the DMA address signal the R/W permissions for
+ * the entry. To upgrade the permissions, provide the appropriate
+ * bitmask to the map_dma_pages function.
+ *
+ * Be aware that upgrading a mapped address might result in change of
+ * the DMA address for the page.
+ */
+#define ODP_READ_ALLOWED_BIT  (1<<0ULL)
+#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
+
+#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
+
+int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt,
+                             u64 access_mask, unsigned long current_seq);
+
+void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset,
+                                u64 bound);
+
+void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root);
+void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root);
+typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
+                             void *cookie);
+/*
+ * Call the callback on each ib_umem in the range. Returns the logical or of
+ * the return values of the functions called.
+ */
+int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end,
+                                 umem_call_back cb, void *cookie);
+
+struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root,
+                                            u64 start, u64 last);
+struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node,
+                                           u64 start, u64 last);
+
+static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item,
+                                            unsigned long mmu_seq)
+{
+       /*
+        * This code is strongly based on the KVM code from
+        * mmu_notifier_retry. Should be called with
+        * the relevant locks taken (item->odp_data->umem_mutex
+        * and the ucontext umem_rwsem locked for read).
+        */
+
+       /* Do not allow page faults while the new ib_umem hasn't seen a state
+        * with zero notifiers yet, and doesn't have its own valid set of
+        * private counters. */
+       if (!item->odp_data->mn_counters_active)
+               return 1;
+
+       if (unlikely(item->odp_data->notifiers_count))
+               return 1;
+       if (item->odp_data->notifiers_seq != mmu_seq)
+               return 1;
+       return 0;
+}
+
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
+static inline int ib_umem_odp_get(struct ib_ucontext *context,
+                                 struct ib_umem *umem)
+{
+       return -EINVAL;
+}
+
+static inline void ib_umem_odp_release(struct ib_umem *umem) {}
+
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
+#endif /* IB_UMEM_ODP_H */
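
ib_umem_mmu_notifier_retry() is meant to be used KVM-style: the page-fault path samples notifiers_seq, does the expensive fault-in and DMA mapping, then re-checks under the lock and redoes the work if an invalidation ran meanwhile. A stand-alone simulation of that retry loop; the *_sim names and the single injected invalidation are hypothetical:

        #include <stdio.h>

        struct odp_sim {
                int notifiers_seq;
                int notifiers_count;
                int mn_counters_active;
        };

        static int mmu_notifier_retry_sim(struct odp_sim *o, int seq)
        {
                if (!o->mn_counters_active)
                        return 1;       /* no valid private counters yet */
                if (o->notifiers_count)
                        return 1;       /* an invalidation is in flight */
                if (o->notifiers_seq != seq)
                        return 1;       /* the range changed since we sampled */
                return 0;
        }

        int main(void)
        {
                struct odp_sim o = { .notifiers_seq = 7, .mn_counters_active = 1 };
                int seq, tries = 0;

                do {
                        seq = o.notifiers_seq;  /* sample before mapping */
                        /* ... fault in and DMA-map the pages here ... */
                        if (++tries == 1)
                                o.notifiers_seq++; /* inject one concurrent invalidation */
                } while (mmu_notifier_retry_sim(&o, seq));

                printf("mapping committed after %d attempt(s)\n", tries);
                return 0;
        }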
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 470a011d6fa49d0e44c512fb927000b1d69bf8ba..0d74f1de99aa89dee233ed408815459e2ad66d5f 100644
@@ -51,6 +51,7 @@
 #include <uapi/linux/if_ether.h>
 
 #include <linux/atomic.h>
+#include <linux/mmu_notifier.h>
 #include <asm/uaccess.h>
 
 extern struct workqueue_struct *ib_wq;
@@ -123,7 +124,8 @@ enum ib_device_cap_flags {
        IB_DEVICE_MEM_WINDOW_TYPE_2A    = (1<<23),
        IB_DEVICE_MEM_WINDOW_TYPE_2B    = (1<<24),
        IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29),
-       IB_DEVICE_SIGNATURE_HANDOVER    = (1<<30)
+       IB_DEVICE_SIGNATURE_HANDOVER    = (1<<30),
+       IB_DEVICE_ON_DEMAND_PAGING      = (1<<31),
 };
 
 enum ib_signature_prot_cap {
@@ -143,6 +145,27 @@ enum ib_atomic_cap {
        IB_ATOMIC_GLOB
 };
 
+enum ib_odp_general_cap_bits {
+       IB_ODP_SUPPORT = 1 << 0,
+};
+
+enum ib_odp_transport_cap_bits {
+       IB_ODP_SUPPORT_SEND     = 1 << 0,
+       IB_ODP_SUPPORT_RECV     = 1 << 1,
+       IB_ODP_SUPPORT_WRITE    = 1 << 2,
+       IB_ODP_SUPPORT_READ     = 1 << 3,
+       IB_ODP_SUPPORT_ATOMIC   = 1 << 4,
+};
+
+struct ib_odp_caps {
+       uint64_t general_caps;
+       struct {
+               uint32_t  rc_odp_caps;
+               uint32_t  uc_odp_caps;
+               uint32_t  ud_odp_caps;
+       } per_transport_caps;
+};
+
 struct ib_device_attr {
        u64                     fw_ver;
        __be64                  sys_image_guid;
@@ -186,6 +209,7 @@ struct ib_device_attr {
        u8                      local_ca_ack_delay;
        int                     sig_prot_cap;
        int                     sig_guard_cap;
+       struct ib_odp_caps      odp_caps;
 };
 
 enum ib_mtu {
@@ -1073,7 +1097,8 @@ enum ib_access_flags {
        IB_ACCESS_REMOTE_READ   = (1<<2),
        IB_ACCESS_REMOTE_ATOMIC = (1<<3),
        IB_ACCESS_MW_BIND       = (1<<4),
-       IB_ZERO_BASED           = (1<<5)
+       IB_ZERO_BASED           = (1<<5),
+       IB_ACCESS_ON_DEMAND     = (1<<6),
 };
 
 struct ib_phys_buf {
@@ -1115,6 +1140,8 @@ struct ib_fmr_attr {
        u8      page_shift;
 };
 
+struct ib_umem;
+
 struct ib_ucontext {
        struct ib_device       *device;
        struct list_head        pd_list;
@@ -1127,6 +1154,24 @@ struct ib_ucontext {
        struct list_head        xrcd_list;
        struct list_head        rule_list;
        int                     closing;
+
+       struct pid             *tgid;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       struct rb_root      umem_tree;
+       /*
+        * Protects .umem_tree, as well as odp_mrs_count and
+        * mmu notifier registration.
+        */
+       struct rw_semaphore     umem_rwsem;
+       void (*invalidate_range)(struct ib_umem *umem,
+                                unsigned long start, unsigned long end);
+
+       struct mmu_notifier     mn;
+       atomic_t                notifier_count;
+       /* A list of umems that don't have private mmu notifier counters yet. */
+       struct list_head        no_private_counters;
+       int                     odp_mrs_count;
+#endif
 };
 
 struct ib_uobject {
@@ -1662,7 +1707,10 @@ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t
 
 static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
 {
-       return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
+       size_t copy_sz;
+
+       copy_sz = min_t(size_t, len, udata->outlen);
+       return copy_to_user(udata->outbuf, src, copy_sz) ? -EFAULT : 0;
 }
 
 /**
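
Putting the verbs-layer pieces together, a consumer would gate IB_ACCESS_ON_DEMAND on three levels of capability: the IB_DEVICE_ON_DEMAND_PAGING device flag, the IB_ODP_SUPPORT general bit, and the per-transport bit for the operation it plans to run. A stand-alone sketch of that check; odp_caps_sim is a hypothetical, trimmed stand-in for ib_device_attr:

        #include <stdint.h>
        #include <stdio.h>

        #define IB_DEVICE_ON_DEMAND_PAGING (1u << 31)
        #define IB_ODP_SUPPORT             (1 << 0)
        #define IB_ODP_SUPPORT_SEND        (1 << 0)
        #define IB_ACCESS_LOCAL_WRITE      (1 << 0)
        #define IB_ACCESS_ON_DEMAND        (1 << 6)

        struct odp_caps_sim {
                uint32_t device_cap_flags;
                uint64_t general_caps;
                uint32_t rc_odp_caps;
        };

        static int mr_access_for_rc_send(const struct odp_caps_sim *a)
        {
                int access = IB_ACCESS_LOCAL_WRITE;

                if ((a->device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING) &&
                    (a->general_caps & IB_ODP_SUPPORT) &&
                    (a->rc_odp_caps & IB_ODP_SUPPORT_SEND))
                        access |= IB_ACCESS_ON_DEMAND; /* non-pinned MR is OK */
                return access;
        }

        int main(void)
        {
                struct odp_caps_sim attr = {
                        .device_cap_flags = IB_DEVICE_ON_DEMAND_PAGING,
                        .general_caps     = IB_ODP_SUPPORT,
                        .rc_odp_caps      = IB_ODP_SUPPORT_SEND,
                };

                printf("access flags: 0x%x\n", mr_access_for_rc_send(&attr));
                return 0;
        }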
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 26daf55ff76ead65f47801620d4697b10e17151c..4275b961bf60f65ec9d93a2dbe680e66d184b6a2 100644
@@ -90,8 +90,9 @@ enum {
 };
 
 enum {
+       IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE,
        IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
-       IB_USER_VERBS_EX_CMD_DESTROY_FLOW
+       IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
 };
 
 /*
@@ -201,6 +202,32 @@ struct ib_uverbs_query_device_resp {
        __u8  reserved[4];
 };
 
+enum {
+       IB_USER_VERBS_EX_QUERY_DEVICE_ODP =             1ULL << 0,
+};
+
+struct ib_uverbs_ex_query_device {
+       __u32 comp_mask;
+       __u32 reserved;
+};
+
+struct ib_uverbs_odp_caps {
+       __u64 general_caps;
+       struct {
+               __u32 rc_odp_caps;
+               __u32 uc_odp_caps;
+               __u32 ud_odp_caps;
+       } per_transport_caps;
+       __u32 reserved;
+};
+
+struct ib_uverbs_ex_query_device_resp {
+       struct ib_uverbs_query_device_resp base;
+       __u32 comp_mask;
+       __u32 reserved;
+       struct ib_uverbs_odp_caps odp_caps;
+};
+
 struct ib_uverbs_query_port {
        __u64 response;
        __u8  port_num;
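
The extended query-device response above embeds the legacy response first and gates everything after it behind comp_mask, so userspace must test IB_USER_VERBS_EX_QUERY_DEVICE_ODP before reading odp_caps. A stand-alone sketch of that consumer-side check; the base struct is elided and the *_sim names are hypothetical:

        #include <stdint.h>
        #include <stdio.h>

        #define IB_USER_VERBS_EX_QUERY_DEVICE_ODP (1ULL << 0)

        struct uverbs_odp_caps_sim {
                uint64_t general_caps;
                struct { uint32_t rc, uc, ud; } per_transport_caps;
                uint32_t reserved;
        };

        struct ex_query_device_resp_sim {
                /* struct ib_uverbs_query_device_resp base; -- elided */
                uint32_t comp_mask;
                uint32_t reserved;
                struct uverbs_odp_caps_sim odp_caps;
        };

        int main(void)
        {
                struct ex_query_device_resp_sim resp = {
                        .comp_mask = IB_USER_VERBS_EX_QUERY_DEVICE_ODP,
                        .odp_caps  = { .general_caps = 1 },
                };

                if (resp.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP)
                        printf("ODP caps valid: general_caps=%llu\n",
                               (unsigned long long)resp.odp_caps.general_caps);
                else
                        printf("kernel did not report ODP caps\n");
                return 0;
        }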