IB/mlx5: Add advise_mr() support
author     Moni Shoua <monis@mellanox.com>
           Tue, 11 Dec 2018 11:37:53 +0000 (13:37 +0200)
committer  Jason Gunthorpe <jgg@mellanox.com>
           Tue, 18 Dec 2018 22:26:19 +0000 (15:26 -0700)
The verb advise_mr() is used to give the kernel advice about an address
range that belongs to an MR.  Implement the verb and register it on the
device.  The current implementation supports the only advice known to
date, prefetch.
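
For illustration, a minimal userspace sketch of how this verb might be
exercised through the matching rdma-core call, ibv_advise_mr().  This assumes
a libibverbs build that exposes the advise-MR API; prefetch_range() is a
hypothetical helper, and pd/mr/addr/length are placeholders taken from an
earlier ibv_alloc_pd()/ibv_reg_mr() done with IBV_ACCESS_ON_DEMAND:

#include <stdint.h>
#include <infiniband/verbs.h>

/* Prefetch (pre-fault) one ODP-registered range so the first RDMA access
 * does not take a page fault.  IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE requires
 * the MR to have been registered writable, matching the check added to
 * pagefault_mr() below.
 */
static int prefetch_range(struct ibv_pd *pd, struct ibv_mr *mr,
			  uint64_t addr, uint32_t length)
{
	struct ibv_sge sge = {
		.addr   = addr,
		.length = length,
		.lkey   = mr->lkey,
	};

	/* IBV_ADVISE_MR_FLAG_FLUSH makes the call synchronous; without it
	 * the prefetch is queued and performed asynchronously.
	 */
	return ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
			     IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
}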

Signed-off-by: Moni Shoua <monis@mellanox.com>
Reviewed-by: Guy Levi <guyle@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/odp.c

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5a4e23105b0c32ee91e821b390ad2fcd22196567..d7b56222fea376679aaa9057b96e2c693cfa7e2a 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5712,6 +5712,8 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
        mlx5_ib_cleanup_multiport_master(dev);
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        cleanup_srcu_struct(&dev->mr_srcu);
+       drain_workqueue(dev->advise_mr_wq);
+       destroy_workqueue(dev->advise_mr_wq);
 #endif
        kfree(dev->port);
 }
@@ -5766,6 +5768,12 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
        dev->memic.dev = mdev;
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+       dev->advise_mr_wq = alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
+       if (!dev->advise_mr_wq) {
+               err = -ENOMEM;
+               goto err_free_port;
+       }
+
        err = init_srcu_struct(&dev->mr_srcu);
        if (err)
                goto err_free_port;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 1285ac11bb704f511b03b9788ef0ebba05c21433..f245b5d8a3bcd36c38ffb7108b315d9a9f65c6e6 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -923,6 +923,7 @@ struct mlx5_ib_dev {
         */
        struct srcu_struct      mr_srcu;
        u32                     null_mkey;
+       struct workqueue_struct *advise_mr_wq;
 #endif
        struct mlx5_ib_flow_db  *flow_db;
        /* protect resources needed as part of reset flow */
@@ -1085,6 +1086,12 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                  u64 virt_addr, int access_flags,
                                  struct ib_udata *udata);
+int mlx5_ib_advise_mr(struct ib_pd *pd,
+                     enum ib_uverbs_advise_mr_advice advice,
+                     u32 flags,
+                     struct ib_sge *sg_list,
+                     u32 num_sge,
+                     struct uverbs_attr_bundle *attrs);
 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
                               struct ib_udata *udata);
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
@@ -1182,6 +1189,10 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
                           size_t nentries, struct mlx5_ib_mr *mr, int flags);
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+                              enum ib_uverbs_advise_mr_advice advice,
+                              u32 flags, struct ib_sge *sg_list, u32 num_sge);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -1197,6 +1208,13 @@ static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
                                         size_t nentries, struct mlx5_ib_mr *mr,
                                         int flags) {}
 
+static int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+                                     enum ib_uverbs_advise_mr_advice advice,
+                                     u32 flags, struct ib_sge *sg_list,
+                                     u32 num_sge)
+{
+       return -EOPNOTSUPP;
+}
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 /* Needed for rep profile */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8e072f84a323ca56a25615e4fc32849b6bb468a8..fd6ea1f75085ee0590a1eb0508689cba3d358e71 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1280,6 +1280,21 @@ err_free:
        return ERR_PTR(err);
 }
 
+int mlx5_ib_advise_mr(struct ib_pd *pd,
+                     enum ib_uverbs_advise_mr_advice advice,
+                     u32 flags,
+                     struct ib_sge *sg_list,
+                     u32 num_sge,
+                     struct uverbs_attr_bundle *attrs)
+{
+       if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
+           advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE)
+               return -EOPNOTSUPP;
+
+       return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
+                                        sg_list, num_sge);
+}
+
 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
                                struct ib_dm_mr_attr *attr,
                                struct uverbs_attr_bundle *attrs)
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 4ead8c0fff5ad46d6d68f676194badda71938f4f..80fa2438db8f0de3a6dab517ab9dc5b89823e401 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -549,10 +549,15 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
        wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
 }
 
+#define MLX5_PF_FLAGS_PREFETCH  BIT(0)
+#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
 static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
-                       u64 io_virt, size_t bcnt, u32 *bytes_mapped)
+                       u64 io_virt, size_t bcnt, u32 *bytes_mapped,
+                       u32 flags)
 {
        struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
+       bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
+       bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
        u64 access_mask = ODP_READ_ALLOWED_BIT;
        int npages = 0, page_shift, np;
        u64 start_idx, page_mask;
@@ -579,7 +584,15 @@ next_mr:
        page_mask = ~(BIT(page_shift) - 1);
        start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
 
-       if (mr->umem->writable)
+       if (prefetch && !downgrade && !mr->umem->writable) {
+               /* prefetch with write-access must
+                * be supported by the MR
+                */
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (mr->umem->writable && !downgrade)
                access_mask |= ODP_WRITE_ALLOWED_BIT;
 
        current_seq = READ_ONCE(odp->notifiers_seq);
@@ -684,12 +697,13 @@ struct pf_frame {
  * -EFAULT when there's an error mapping the requested pages. The caller will
  *  abort the page fault handling.
  */
-static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
-                                        u32 key, u64 io_virt, size_t bcnt,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key,
+                                        u64 io_virt, size_t bcnt,
                                         u32 *bytes_committed,
-                                        u32 *bytes_mapped)
+                                        u32 *bytes_mapped, u32 flags)
 {
        int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
+       bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
        struct pf_frame *head = NULL, *frame;
        struct mlx5_core_mkey *mmkey;
        struct mlx5_ib_mw *mw;
@@ -711,6 +725,12 @@ next_mr:
                goto srcu_unlock;
        }
 
+       if (prefetch && mmkey->type != MLX5_MKEY_MR) {
+               mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
+               ret = -EINVAL;
+               goto srcu_unlock;
+       }
+
        switch (mmkey->type) {
        case MLX5_MKEY_MR:
                mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
@@ -720,6 +740,11 @@ next_mr:
                        goto srcu_unlock;
                }
 
+               if (prefetch && !mr->umem->is_odp) {
+                       ret = -EINVAL;
+                       goto srcu_unlock;
+               }
+
                if (!mr->umem->is_odp) {
                        mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
                                    key);
@@ -729,7 +754,7 @@ next_mr:
                        goto srcu_unlock;
                }
 
-               ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
+               ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
                if (ret < 0)
                        goto srcu_unlock;
 
@@ -906,7 +931,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
 
                ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
                                                    &pfault->bytes_committed,
-                                                   bytes_mapped);
+                                                   bytes_mapped, 0);
                if (ret < 0)
                        break;
                npages += ret;
@@ -1217,7 +1242,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
        }
 
        ret = pagefault_single_data_segment(dev, rkey, address, length,
-                                           &pfault->bytes_committed, NULL);
+                                           &pfault->bytes_committed, NULL,
+                                           0);
        if (ret == -EAGAIN) {
                /* We're racing with an invalidation, don't prefetch */
                prefetch_activated = 0;
@@ -1244,7 +1270,8 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 
                ret = pagefault_single_data_segment(dev, rkey, address,
                                                    prefetch_len,
-                                                   &bytes_committed, NULL);
+                                                   &bytes_committed, NULL,
+                                                   0);
                if (ret < 0 && ret != -EAGAIN) {
                        mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
                                    ret, pfault->token, address, prefetch_len);
@@ -1493,10 +1520,17 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
        }
 }
 
+static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
+       .advise_mr = mlx5_ib_advise_mr,
+};
+
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
        int ret = 0;
 
+       if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
+               ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
+
        if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
                ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
                if (ret) {
@@ -1528,3 +1562,76 @@ int mlx5_ib_odp_init(void)
 
        return 0;
 }
+
+struct prefetch_mr_work {
+       struct work_struct work;
+       struct mlx5_ib_dev *dev;
+       u32 pf_flags;
+       u32 num_sge;
+       struct ib_sge sg_list[0];
+};
+
+static int mlx5_ib_prefetch_sg_list(struct mlx5_ib_dev *dev, u32 pf_flags,
+                                   struct ib_sge *sg_list, u32 num_sge)
+{
+       int i;
+
+       for (i = 0; i < num_sge; ++i) {
+               struct ib_sge *sg = &sg_list[i];
+               int bytes_committed = 0;
+               int ret;
+
+               ret = pagefault_single_data_segment(dev, sg->lkey, sg->addr,
+                                                   sg->length,
+                                                   &bytes_committed, NULL,
+                                                   pf_flags);
+               if (ret < 0)
+                       return ret;
+       }
+       return 0;
+}
+
+static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
+{
+       struct prefetch_mr_work *w =
+               container_of(work, struct prefetch_mr_work, work);
+
+       if (w->dev->ib_dev.reg_state == IB_DEV_REGISTERED)
+               mlx5_ib_prefetch_sg_list(w->dev, w->pf_flags, w->sg_list,
+                                        w->num_sge);
+
+       kfree(w);
+}
+
+int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+                              enum ib_uverbs_advise_mr_advice advice,
+                              u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
+       struct prefetch_mr_work *work;
+
+       if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
+               pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
+
+       if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
+               return mlx5_ib_prefetch_sg_list(dev, pf_flags, sg_list,
+                                               num_sge);
+
+       if (dev->ib_dev.reg_state != IB_DEV_REGISTERED)
+               return -ENODEV;
+
+       work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
+       if (!work)
+               return -ENOMEM;
+
+       memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
+
+       work->dev = dev;
+       work->pf_flags = pf_flags;
+       work->num_sge = num_sge;
+
+       INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+       schedule_work(&work->work);
+       return 0;
+}