Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 23 Feb 2017 19:27:49 +0000 (11:27 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 23 Feb 2017 19:27:49 +0000 (11:27 -0800)
Pull Mellanox rdma updates from Doug Ledford:
 "Mellanox specific updates for 4.11 merge window

  Because the Mellanox code required being based on a net-next tree, I
  kept it separate from the remainder of the RDMA stack submission that
  is based on 4.10-rc3.

  This branch contains:

   - Various mlx4 and mlx5 fixes and minor changes

   - Support for adding a tag match rule to flow specs

   - Support for cvlan offload operation for raw ethernet QPs

   - A change to the core IB code to recognize raw eth capabilities and
     enumerate them (touches non-Mellanox code)

   - Implicit On-Demand Paging memory registration support"
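
The tag-match rule mentioned in the summary adds an IB_FLOW_SPEC_ACTION_TAG flow spec, which the mlx5 hunks below parse in parse_flow_attr() and reject when the tag does not fit in 24 bits or is attached to a leftover (ALL_DEFAULT/MC_DEFAULT) rule. A minimal caller-side sketch of filling such a spec, assuming the usual type/size/tag_id layout of ib_flow_spec structs (only type and tag_id are visible in this diff):

	/* Hypothetical sketch; only type and tag_id appear in this merge,
	 * the .size field is assumed from the common flow-spec layout. */
	struct ib_flow_spec_action_tag tag_spec = {
		.type   = IB_FLOW_SPEC_ACTION_TAG,
		.size   = sizeof(tag_spec),
		.tag_id = 0x1234,	/* must be < BIT(24), see parse_flow_attr() */
	};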

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (40 commits)
  IB/mlx5: Fix configuration of port capabilities
  IB/mlx4: Take source GID by index from HW GID table
  IB/mlx5: Fix blue flame buffer size calculation
  IB/mlx4: Remove unused variable from function declaration
  IB: Query ports via the core instead of direct into the driver
  IB: Add protocol for USNIC
  IB/mlx4: Support raw packet protocol
  IB/mlx5: Support raw packet protocol
  IB/core: Add raw packet protocol
  IB/mlx5: Add implicit MR support
  IB/mlx5: Expose MR cache for mlx5_ib
  IB/mlx5: Add null_mkey access
  IB/umem: Indicate that process is being terminated
  IB/umem: Update on demand page (ODP) support
  IB/core: Add implicit MR flag
  IB/mlx5: Support creation of a WQ with scatter FCS offload
  IB/mlx5: Enable QP creation with cvlan offload
  IB/mlx5: Enable WQ creation and modification with cvlan offload
  IB/mlx5: Expose vlan offloads capabilities
  IB/uverbs: Enable QP creation with cvlan offload
  ...
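
The cvlan offload entries above are driven by two new flags: IB_QP_CREATE_CVLAN_STRIPPING for raw packet QPs (validated in create_qp_common() further down) and IB_WQ_FLAGS_CVLAN_STRIPPING for WQ creation and modification, with the capability reported through IB_RAW_PACKET_CAP_CVLAN_STRIPPING. A hedged sketch of requesting it at QP creation time (CQs and caps omitted; not taken from this diff):

	struct ib_qp_init_attr init_attr = {
		.qp_type      = IB_QPT_RAW_PACKET,
		.create_flags = IB_QP_CREATE_CVLAN_STRIPPING,
		/* send_cq, recv_cq, cap, ... set as usual */
	};
	/* mlx5 rejects the flag with -EOPNOTSUPP unless eth_net_offloads
	 * and vlan_cap are set and qp_type is IB_QPT_RAW_PACKET. */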

22 files changed:
drivers/infiniband/core/umem.c
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb4/provider.c
drivers/infiniband/hw/hfi1/verbs.c
drivers/infiniband/hw/hns/hns_roce_main.c
drivers/infiniband/hw/i40iw/i40iw_verbs.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/nes/nes_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/qedr/verbs.c
drivers/infiniband/hw/qib/qib_verbs.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
drivers/infiniband/sw/rxe/rxe_verbs.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
include/linux/mlx5/driver.h
include/linux/mlx5/mlx5_ifc.h
include/rdma/ib_verbs.h
include/uapi/rdma/ib_user_verbs.h

index 4609b921f899c9d7481b86825f18fe076a6f732c,9f9630b1bc7bd27cef7cb4c5f8709af81de1af98..446b56a5260b73f1994355403411d3c971a06e25
@@@ -99,9 -99,6 +99,6 @@@ struct ib_umem *ib_umem_get(struct ib_u
        if (dmasync)
                dma_attrs |= DMA_ATTR_WRITE_BARRIER;
  
-       if (!size)
-               return ERR_PTR(-EINVAL);
        /*
         * If the combination of the addr and size requested for this memory
         * region causes an integer overflow, return error.
                 IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
  
        if (access & IB_ACCESS_ON_DEMAND) {
 +              put_pid(umem->pid);
                ret = ib_umem_odp_get(context, umem);
                if (ret) {
                        kfree(umem);
  
        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list) {
 +              put_pid(umem->pid);
                kfree(umem);
                return ERR_PTR(-ENOMEM);
        }
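
The ODP changes in this file pair with the implicit MR support added to mlx5 later in the series. From userspace the feature is conventionally requested by registering an MR that spans the whole address space; a hedged, libibverbs-style sketch of that convention (not part of this kernel diff):

	#include <stdint.h>
	#include <infiniband/verbs.h>

	/* Implicit ODP: a NULL address and SIZE_MAX length with on-demand
	 * access ask the driver to cover the whole address space. */
	static struct ibv_mr *reg_implicit_odp(struct ibv_pd *pd)
	{
		return ibv_reg_mr(pd, NULL, SIZE_MAX, IBV_ACCESS_ON_DEMAND);
	}
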
index 6262dc035f3cea4c9613d96f67ec13e76a18643e,9e39252a570a3c0380a0491b7c395bc2fd28a45b..48649f93258a41e8ecf8b4214d4d518a28a07318
@@@ -1133,9 -1133,18 +1133,9 @@@ static int iwch_query_port(struct ib_de
        dev = to_iwch_dev(ibdev);
        netdev = dev->rdev.port_info.lldevs[port-1];
  
-       memset(props, 0, sizeof(struct ib_port_attr));
+       /* props being zeroed by the caller, avoid zeroing it here */
        props->max_mtu = IB_MTU_4096;
 -      if (netdev->mtu >= 4096)
 -              props->active_mtu = IB_MTU_4096;
 -      else if (netdev->mtu >= 2048)
 -              props->active_mtu = IB_MTU_2048;
 -      else if (netdev->mtu >= 1024)
 -              props->active_mtu = IB_MTU_1024;
 -      else if (netdev->mtu >= 512)
 -              props->active_mtu = IB_MTU_512;
 -      else
 -              props->active_mtu = IB_MTU_256;
 +      props->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
  
        if (!netif_carrier_ok(netdev))
                props->state = IB_PORT_DOWN;
@@@ -1329,13 -1338,14 +1329,14 @@@ static int iwch_port_immutable(struct i
        struct ib_port_attr attr;
        int err;
  
-       err = iwch_query_port(ibdev, port_num, &attr);
+       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+       err = ib_query_port(ibdev, port_num, &attr);
        if (err)
                return err;
  
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
  
        return 0;
  }
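
The same netdev->mtu if/else ladder is deleted from cxgb4 and i40iw below; all of these drivers now call ib_mtu_int_to_enum() instead. Reconstructed from the ladder removed above, the helper is equivalent to the following sketch (the real definition lives in include/rdma/ib_verbs.h):

	static inline enum ib_mtu ib_mtu_int_to_enum(int mtu)
	{
		if (mtu >= 4096)
			return IB_MTU_4096;
		else if (mtu >= 2048)
			return IB_MTU_2048;
		else if (mtu >= 1024)
			return IB_MTU_1024;
		else if (mtu >= 512)
			return IB_MTU_512;
		else
			return IB_MTU_256;
	}
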
index 3345e1c312f771cfaa8e31858624ca9892267467,5b00b50a484fa57cfbbc546954a2c63caed5e35f..bdf7de571d838d824dd5fc57d8e357a35ddfe84a
@@@ -93,28 -93,17 +93,28 @@@ static int c4iw_process_mad(struct ib_d
        return -ENOSYS;
  }
  
 -static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
 +void _c4iw_free_ucontext(struct kref *kref)
  {
 -      struct c4iw_dev *rhp = to_c4iw_dev(context->device);
 -      struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
 +      struct c4iw_ucontext *ucontext;
 +      struct c4iw_dev *rhp;
        struct c4iw_mm_entry *mm, *tmp;
  
 -      PDBG("%s context %p\n", __func__, context);
 +      ucontext = container_of(kref, struct c4iw_ucontext, kref);
 +      rhp = to_c4iw_dev(ucontext->ibucontext.device);
 +
 +      PDBG("%s ucontext %p\n", __func__, ucontext);
        list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
                kfree(mm);
        c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
        kfree(ucontext);
 +}
 +
 +static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
 +{
 +      struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
 +
 +      PDBG("%s context %p\n", __func__, context);
 +      c4iw_put_ucontext(ucontext);
        return 0;
  }
  
@@@ -138,7 -127,6 +138,7 @@@ static struct ib_ucontext *c4iw_alloc_u
        c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
        INIT_LIST_HEAD(&context->mmaps);
        spin_lock_init(&context->mmap_lock);
 +      kref_init(&context->kref);
  
        if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) {
                if (!warned++)
@@@ -370,10 -358,18 +370,9 @@@ static int c4iw_query_port(struct ib_de
  
        dev = to_c4iw_dev(ibdev);
        netdev = dev->rdev.lldi.ports[port-1];
-       memset(props, 0, sizeof(struct ib_port_attr));
+       /* props being zeroed by the caller, avoid zeroing it here */
        props->max_mtu = IB_MTU_4096;
 -      if (netdev->mtu >= 4096)
 -              props->active_mtu = IB_MTU_4096;
 -      else if (netdev->mtu >= 2048)
 -              props->active_mtu = IB_MTU_2048;
 -      else if (netdev->mtu >= 1024)
 -              props->active_mtu = IB_MTU_1024;
 -      else if (netdev->mtu >= 512)
 -              props->active_mtu = IB_MTU_512;
 -      else
 -              props->active_mtu = IB_MTU_256;
 +      props->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
  
        if (!netif_carrier_ok(netdev))
                props->state = IB_PORT_DOWN;
@@@ -508,13 -504,14 +507,14 @@@ static int c4iw_port_immutable(struct i
        struct ib_port_attr attr;
        int err;
  
-       err = c4iw_query_port(ibdev, port_num, &attr);
+       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+       err = ib_query_port(ibdev, port_num, &attr);
        if (err)
                return err;
  
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
  
        return 0;
  }
@@@ -610,6 -607,8 +610,6 @@@ int c4iw_register_device(struct c4iw_de
        dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
        dev->ibdev.get_port_immutable = c4iw_port_immutable;
        dev->ibdev.get_dev_fw_str = get_dev_fw_str;
 -      dev->ibdev.drain_sq = c4iw_drain_sq;
 -      dev->ibdev.drain_rq = c4iw_drain_rq;
  
        dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
        if (!dev->ibdev.iwcm)
index 5ba4c0dec3488429de0a6a0d83fc9f89e9a8b17d,f2d4e042872599f8e17fa4156e79a332b7be7e48..33f00f0719c561acec89667bed0566fe8432bb71
@@@ -291,7 -291,7 +291,7 @@@ static void wss_insert(void *address
  /*
   * Is the working set larger than the threshold?
   */
 -static inline int wss_exceeds_threshold(void)
 +static inline bool wss_exceeds_threshold(void)
  {
        return atomic_read(&wss.total_count) >= wss.threshold;
  }
@@@ -419,19 -419,18 +419,19 @@@ __be64 ib_hfi1_sys_image_guid
   * @ss: the SGE state
   * @data: the data to copy
   * @length: the length of the data
 + * @release: boolean to release MR
   * @copy_last: do a separate copy of the last 8 bytes
   */
  void hfi1_copy_sge(
        struct rvt_sge_state *ss,
        void *data, u32 length,
 -      int release,
 -      int copy_last)
 +      bool release,
 +      bool copy_last)
  {
        struct rvt_sge *sge = &ss->sge;
 -      int in_last = 0;
        int i;
 -      int cacheless_copy = 0;
 +      bool in_last = false;
 +      bool cacheless_copy = false;
  
        if (sge_copy_mode == COPY_CACHELESS) {
                cacheless_copy = length >= PAGE_SIZE;
                if (length > 8) {
                        length -= 8;
                } else {
 -                      copy_last = 0;
 -                      in_last = 1;
 +                      copy_last = false;
 +                      in_last = true;
                }
        }
  
  again:
        while (length) {
 -              u32 len = sge->length;
 +              u32 len = rvt_get_sge_length(sge, length);
  
 -              if (len > length)
 -                      len = length;
 -              if (len > sge->sge_length)
 -                      len = sge->sge_length;
                WARN_ON_ONCE(len == 0);
                if (unlikely(in_last)) {
                        /* enforce byte transfer ordering */
                } else {
                        memcpy(sge->vaddr, data, len);
                }
 -              sge->vaddr += len;
 -              sge->length -= len;
 -              sge->sge_length -= len;
 -              if (sge->sge_length == 0) {
 -                      if (release)
 -                              rvt_put_mr(sge->mr);
 -                      if (--ss->num_sge)
 -                              *sge = *ss->sg_list++;
 -              } else if (sge->length == 0 && sge->mr->lkey) {
 -                      if (++sge->n >= RVT_SEGSZ) {
 -                              if (++sge->m >= sge->mr->mapsz)
 -                                      break;
 -                              sge->n = 0;
 -                      }
 -                      sge->vaddr =
 -                              sge->mr->map[sge->m]->segs[sge->n].vaddr;
 -                      sge->length =
 -                              sge->mr->map[sge->m]->segs[sge->n].length;
 -              }
 +              rvt_update_sge(ss, len, release);
                data += len;
                length -= len;
        }
  
        if (copy_last) {
 -              copy_last = 0;
 -              in_last = 1;
 +              copy_last = false;
 +              in_last = true;
                length = 8;
                goto again;
        }
  }
  
 -/**
 - * hfi1_skip_sge - skip over SGE memory
 - * @ss: the SGE state
 - * @length: the number of bytes to skip
 - */
 -void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
 -{
 -      struct rvt_sge *sge = &ss->sge;
 -
 -      while (length) {
 -              u32 len = sge->length;
 -
 -              if (len > length)
 -                      len = length;
 -              if (len > sge->sge_length)
 -                      len = sge->sge_length;
 -              WARN_ON_ONCE(len == 0);
 -              sge->vaddr += len;
 -              sge->length -= len;
 -              sge->sge_length -= len;
 -              if (sge->sge_length == 0) {
 -                      if (release)
 -                              rvt_put_mr(sge->mr);
 -                      if (--ss->num_sge)
 -                              *sge = *ss->sg_list++;
 -              } else if (sge->length == 0 && sge->mr->lkey) {
 -                      if (++sge->n >= RVT_SEGSZ) {
 -                              if (++sge->m >= sge->mr->mapsz)
 -                                      break;
 -                              sge->n = 0;
 -                      }
 -                      sge->vaddr =
 -                              sge->mr->map[sge->m]->segs[sge->n].vaddr;
 -                      sge->length =
 -                              sge->mr->map[sge->m]->segs[sge->n].length;
 -              }
 -              length -= len;
 -      }
 -}
 -
  /*
   * Make sure the QP is ready and able to accept the given opcode.
   */
@@@ -515,7 -576,7 +515,7 @@@ void hfi1_ib_rcv(struct hfi1_packet *pa
        struct ib_header *hdr = packet->hdr;
        u32 tlen = packet->tlen;
        struct hfi1_pportdata *ppd = rcd->ppd;
 -      struct hfi1_ibport *ibp = &ppd->ibport_data;
 +      struct hfi1_ibport *ibp = rcd_to_iport(rcd);
        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
        opcode_handler packet_handler;
        unsigned long flags;
@@@ -628,6 -689,27 +628,6 @@@ static void mem_timer(unsigned long dat
                hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
  }
  
 -void update_sge(struct rvt_sge_state *ss, u32 length)
 -{
 -      struct rvt_sge *sge = &ss->sge;
 -
 -      sge->vaddr += length;
 -      sge->length -= length;
 -      sge->sge_length -= length;
 -      if (sge->sge_length == 0) {
 -              if (--ss->num_sge)
 -                      *sge = *ss->sg_list++;
 -      } else if (sge->length == 0 && sge->mr->lkey) {
 -              if (++sge->n >= RVT_SEGSZ) {
 -                      if (++sge->m >= sge->mr->mapsz)
 -                              return;
 -                      sge->n = 0;
 -              }
 -              sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
 -              sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
 -      }
 -}
 -
  /*
   * This is called with progress side lock held.
   */
@@@ -716,7 -798,7 +716,7 @@@ static noinline int build_verbs_ulp_pay
                        len);
                if (ret)
                        goto bail_txadd;
 -              update_sge(ss, len);
 +              rvt_update_sge(ss, len, false);
                length -= len;
        }
        return ret;
@@@ -991,7 -1073,7 +991,7 @@@ int hfi1_verbs_send_pio(struct rvt_qp *
  
                                if (slen > len)
                                        slen = len;
 -                              update_sge(ss, slen);
 +                              rvt_update_sge(ss, slen, false);
                                seg_pio_copy_mid(pbuf, addr, slen);
                                len -= slen;
                        }
@@@ -1302,6 -1384,7 +1302,7 @@@ static int query_port(struct rvt_dev_in
        struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
        u16 lid = ppd->lid;
  
+       /* props being zeroed by the caller, avoid zeroing it here */
        props->lid = lid ? lid : 0;
        props->lmc = ppd->lmc;
        /* OPA logical states match IB logical states */
@@@ -1536,7 -1619,7 +1537,7 @@@ static int cntr_names_initialized
   * external strings.
   */
  static int init_cntr_names(const char *names_in,
 -                         const int names_len,
 +                         const size_t names_len,
                           int num_extra_names,
                           int *num_cntrs,
                           const char ***cntr_names)
@@@ -1763,7 -1846,6 +1764,7 @@@ int hfi1_register_ib_device(struct hfi1
        dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
        dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
        dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
 +      dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
        dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
  
        /* completeion queue */
@@@ -1829,7 -1911,7 +1830,7 @@@ void hfi1_unregister_ib_device(struct h
  
  void hfi1_cnp_rcv(struct hfi1_packet *packet)
  {
 -      struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
 +      struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
        struct ib_header *hdr = packet->hdr;
        struct rvt_qp *qp = packet->qp;
index cf14679664ca84e50c0d15a6dd0c4df074631432,7be8158f15eed70a6e56ecdcd98f49935d054519..6843409fba298abf1d593d0990c49054333cf43d
@@@ -32,7 -32,6 +32,7 @@@
   */
  #include <linux/acpi.h>
  #include <linux/of_platform.h>
 +#include <linux/module.h>
  #include <rdma/ib_addr.h>
  #include <rdma/ib_smi.h>
  #include <rdma/ib_user_verbs.h>
@@@ -250,7 -249,7 +250,7 @@@ static int hns_roce_query_port(struct i
        assert(port_num > 0);
        port = port_num - 1;
  
-       memset(props, 0, sizeof(*props));
+       /* props being zeroed by the caller, avoid zeroing it here */
  
        props->max_mtu = hr_dev->caps.max_mtu;
        props->gid_tbl_len = hr_dev->caps.gid_table_len[port];
@@@ -401,14 -400,15 +401,15 @@@ static int hns_roce_port_immutable(stru
        struct ib_port_attr attr;
        int ret;
  
-       ret = hns_roce_query_port(ib_dev, port_num, &attr);
+       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
+       ret = ib_query_port(ib_dev, port_num, &attr);
        if (ret)
                return ret;
  
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
  
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
        immutable->max_mad_size = IB_MGMT_MAD_SIZE;
  
        return 0;
index 4c000d60d5c6f865ae17aa28654497e3dbbb913c,3e830486417b02b515b3a36fb014d89c62b25db2..5f695bf232a8f0d7bf332f65441db6a195d9092e
@@@ -97,10 -97,18 +97,9 @@@ static int i40iw_query_port(struct ib_d
        struct i40iw_device *iwdev = to_iwdev(ibdev);
        struct net_device *netdev = iwdev->netdev;
  
-       memset(props, 0, sizeof(*props));
+       /* props being zeroed by the caller, avoid zeroing it here */
        props->max_mtu = IB_MTU_4096;
 -      if (netdev->mtu >= 4096)
 -              props->active_mtu = IB_MTU_4096;
 -      else if (netdev->mtu >= 2048)
 -              props->active_mtu = IB_MTU_2048;
 -      else if (netdev->mtu >= 1024)
 -              props->active_mtu = IB_MTU_1024;
 -      else if (netdev->mtu >= 512)
 -              props->active_mtu = IB_MTU_512;
 -      else
 -              props->active_mtu = IB_MTU_256;
 +      props->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
  
        props->lid = 1;
        if (netif_carrier_ok(iwdev->netdev))
@@@ -2497,14 -2505,15 +2496,15 @@@ static int i40iw_port_immutable(struct 
        struct ib_port_attr attr;
        int err;
  
-       err = i40iw_query_port(ibdev, port_num, &attr);
+       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+       err = ib_query_port(ibdev, port_num, &attr);
  
        if (err)
                return err;
  
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
  
        return 0;
  }
index 7d76f769233cfb5a58a2ce78717e758e8492363e,4f50b96fe605d71e18f436dd8b41b4545c07a960..c34eebc7db65a9bbf4deb0ce611e6ec61e88af7f
@@@ -76,6 -76,10 +76,6 @@@ enum 
        MLX4_IB_LSO_HEADER_SPARE        = 128,
  };
  
 -enum {
 -      MLX4_IB_IBOE_ETHERTYPE          = 0x8915
 -};
 -
  struct mlx4_ib_sqp {
        struct mlx4_ib_qp       qp;
        int                     pkey_index;
@@@ -2420,11 -2424,31 +2420,31 @@@ static u8 sl_to_vl(struct mlx4_ib_dev *
        return vl;
  }
  
+ static int fill_gid_by_hw_index(struct mlx4_ib_dev *ibdev, u8 port_num,
+                               int index, union ib_gid *gid,
+                               enum ib_gid_type *gid_type)
+ {
+       struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+       struct mlx4_port_gid_table *port_gid_table;
+       unsigned long flags;
+       port_gid_table = &iboe->gids[port_num - 1];
+       spin_lock_irqsave(&iboe->lock, flags);
+       memcpy(gid, &port_gid_table->gids[index].gid, sizeof(*gid));
+       *gid_type = port_gid_table->gids[index].gid_type;
+       spin_unlock_irqrestore(&iboe->lock, flags);
+       if (!memcmp(gid, &zgid, sizeof(*gid)))
+               return -ENOENT;
+       return 0;
+ }
  #define MLX4_ROCEV2_QP1_SPORT 0xC000
  static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
                            void *wqe, unsigned *mlx_seg_len)
  {
        struct ib_device *ib_dev = sqp->qp.ibqp.device;
+       struct mlx4_ib_dev *ibdev = to_mdev(ib_dev);
        struct mlx4_wqe_mlx_seg *mlx = wqe;
        struct mlx4_wqe_ctrl_seg *ctrl = wqe;
        struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
        is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
        is_grh = mlx4_ib_ah_grh_present(ah);
        if (is_eth) {
-               struct ib_gid_attr gid_attr;
+               enum ib_gid_type gid_type;
                if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
                        /* When multi-function is enabled, the ib_core gid
                         * indexes don't necessarily match the hw ones, so
                        if (err)
                                return err;
                } else  {
-                       err = ib_get_cached_gid(ib_dev,
-                                               be32_to_cpu(ah->av.ib.port_pd) >> 24,
-                                               ah->av.ib.gid_index, &sgid,
-                                               &gid_attr);
-                       if (!err) {
-                               if (gid_attr.ndev)
-                                       dev_put(gid_attr.ndev);
-                               if (!memcmp(&sgid, &zgid, sizeof(sgid)))
-                                       err = -ENOENT;
-                       }
+                       err = fill_gid_by_hw_index(ibdev, sqp->qp.port,
+                                           ah->av.ib.gid_index,
+                                           &sgid, &gid_type);
                        if (!err) {
-                               is_udp = gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
+                               is_udp = gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
                                if (is_udp) {
                                        if (ipv6_addr_v4mapped((struct in6_addr *)&sgid))
                                                ip_version = 4;
                u16 ether_type;
                u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
  
 -              ether_type = (!is_udp) ? MLX4_IB_IBOE_ETHERTYPE :
 +              ether_type = (!is_udp) ? ETH_P_IBOE:
                        (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6);
  
                mlx->sched_prio = cpu_to_be16(pcp);
@@@ -2951,21 -2967,17 +2963,17 @@@ int mlx4_ib_post_send(struct ib_qp *ibq
  
                if (sqp->roce_v2_gsi) {
                        struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah);
-                       struct ib_gid_attr gid_attr;
+                       enum ib_gid_type gid_type;
                        union ib_gid gid;
  
-                       if (!ib_get_cached_gid(ibqp->device,
-                                              be32_to_cpu(ah->av.ib.port_pd) >> 24,
-                                              ah->av.ib.gid_index, &gid,
-                                              &gid_attr)) {
-                               if (gid_attr.ndev)
-                                       dev_put(gid_attr.ndev);
-                               qp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ?
-                                       to_mqp(sqp->roce_v2_gsi) : qp;
-                       } else {
+                       if (!fill_gid_by_hw_index(mdev, sqp->qp.port,
+                                          ah->av.ib.gid_index,
+                                          &gid, &gid_type))
+                               qp = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ?
+                                               to_mqp(sqp->roce_v2_gsi) : qp;
+                       else
                                pr_err("Failed to get gid at index %d. RoCEv2 will not work properly\n",
                                       ah->av.ib.gid_index);
-                       }
                }
        }
  
index 9d8535385bb8bded0e6dc5161396034cdf524b9b,4a043cf35b9ac7137772da4802dc6669423a1544..6a8498c052a5c3a164ab47a1525f88835634ab17
@@@ -53,7 -53,6 +53,7 @@@
  #include <linux/in.h>
  #include <linux/etherdevice.h>
  #include <linux/mlx5/fs.h>
 +#include <linux/mlx5/vport.h>
  #include "mlx5_ib.h"
  
  #define DRIVER_NAME "mlx5_ib"
@@@ -65,10 -64,6 +65,6 @@@ MODULE_DESCRIPTION("Mellanox Connect-I
  MODULE_LICENSE("Dual BSD/GPL");
  MODULE_VERSION(DRIVER_VERSION);
  
- static int deprecated_prof_sel = 2;
- module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
- MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");
  static char mlx5_version[] =
        DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
        DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
@@@ -175,7 -170,7 +171,7 @@@ static int mlx5_query_port_roce(struct 
        enum ib_mtu ndev_ib_mtu;
        u16 qkey_viol_cntr;
  
-       memset(props, 0, sizeof(*props));
+       /* props being zeroed by the caller, avoid zeroing it here */
  
        props->port_cap_flags  |= IB_PORT_CM_SUP;
        props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;
@@@ -326,6 -321,27 +322,27 @@@ __be16 mlx5_get_roce_udp_sport(struct m
        return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
  }
  
+ int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num,
+                          int index, enum ib_gid_type *gid_type)
+ {
+       struct ib_gid_attr attr;
+       union ib_gid gid;
+       int ret;
+       ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr);
+       if (ret)
+               return ret;
+       if (!attr.ndev)
+               return -ENODEV;
+       dev_put(attr.ndev);
+       *gid_type = attr.gid_type;
+       return 0;
+ }
  static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
  {
        if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
@@@ -565,8 -581,15 +582,15 @@@ static int mlx5_ib_query_device(struct 
                props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
  
        if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
-               if (MLX5_CAP_ETH(mdev, csum_cap))
+               if (MLX5_CAP_ETH(mdev, csum_cap)) {
+                       /* Legacy bit to support old userspace libraries */
                        props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
+                       props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
+               }
+               if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
+                       props->raw_packet_caps |=
+                               IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
  
                if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
                        max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
        }
  
        if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
-           MLX5_CAP_ETH(dev->mdev, scatter_fcs))
+           MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
+               /* Legacy bit to support old userspace libraries */
                props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
+               props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
+       }
  
        if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
                props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
@@@ -831,7 -857,7 +858,7 @@@ static int mlx5_query_hca_port(struct i
                goto out;
        }
  
-       memset(props, 0, sizeof(*props));
+       /* props being zeroed by the caller, avoid zeroing it here */
  
        err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
        if (err)
@@@ -969,6 -995,31 +996,31 @@@ static int mlx5_ib_modify_device(struc
        return err;
  }
  
+ static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
+                               u32 value)
+ {
+       struct mlx5_hca_vport_context ctx = {};
+       int err;
+       err = mlx5_query_hca_vport_context(dev->mdev, 0,
+                                          port_num, 0, &ctx);
+       if (err)
+               return err;
+       if (~ctx.cap_mask1_perm & mask) {
+               mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n",
+                            mask, ctx.cap_mask1_perm);
+               return -EINVAL;
+       }
+       ctx.cap_mask1 = value;
+       ctx.cap_mask1_perm = mask;
+       err = mlx5_core_modify_hca_vport_context(dev->mdev, 0,
+                                                port_num, 0, &ctx);
+       return err;
+ }
  static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
                               struct ib_port_modify *props)
  {
        struct ib_port_attr attr;
        u32 tmp;
        int err;
+       u32 change_mask;
+       u32 value;
+       bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
+                     IB_LINK_LAYER_INFINIBAND);
+       if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
+               change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
+               value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
+               return set_port_caps_atomic(dev, port, change_mask, value);
+       }
  
        mutex_lock(&dev->cap_mask_mutex);
  
-       err = mlx5_ib_query_port(ibdev, port, &attr);
+       err = ib_query_port(ibdev, port, &attr);
        if (err)
                goto out;
  
@@@ -1203,14 -1264,6 +1265,14 @@@ static struct ib_ucontext *mlx5_ib_allo
                resp.response_length += sizeof(resp.cmds_supp_uhw);
        }
  
 +      if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) {
 +              if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
 +                      mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
 +                      resp.eth_min_inline++;
 +              }
 +              resp.response_length += sizeof(resp.eth_min_inline);
 +      }
 +
        /*
         * We don't want to expose information from the PCI bar that is located
         * after 4096 bytes, so if the arch only supports larger pages, let's
@@@ -1661,6 -1714,7 +1723,7 @@@ static void set_tos(void *outer_c, voi
  #define LAST_IPV6_FIELD traffic_class
  #define LAST_TCP_UDP_FIELD src_port
  #define LAST_TUNNEL_FIELD tunnel_id
+ #define LAST_FLOW_TAG_FIELD tag_id
  
  /* Field is the last supported field */
  #define FIELDS_NOT_SUPPORTED(filter, field)\
                   sizeof(filter.field))
  
  static int parse_flow_attr(u32 *match_c, u32 *match_v,
-                          const union ib_flow_spec *ib_spec)
+                          const union ib_flow_spec *ib_spec, u32 *tag_id)
  {
        void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
                                           misc_parameters);
        switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
        case IB_FLOW_SPEC_ETH:
                if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
-                       return -ENOTSUPP;
+                       return -EOPNOTSUPP;
  
                ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
                                             dmac_47_16),
  
                if (ib_spec->eth.mask.vlan_tag) {
                        MLX5_SET(fte_match_set_lyr_2_4, headers_c,
 -                               vlan_tag, 1);
 +                               cvlan_tag, 1);
                        MLX5_SET(fte_match_set_lyr_2_4, headers_v,
 -                               vlan_tag, 1);
 +                               cvlan_tag, 1);
  
                        MLX5_SET(fte_match_set_lyr_2_4, headers_c,
                                 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
                break;
        case IB_FLOW_SPEC_IPV4:
                if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
-                       return -ENOTSUPP;
+                       return -EOPNOTSUPP;
  
                MLX5_SET(fte_match_set_lyr_2_4, headers_c,
                         ethertype, 0xffff);
                break;
        case IB_FLOW_SPEC_IPV6:
                if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
-                       return -ENOTSUPP;
+                       return -EOPNOTSUPP;
  
                MLX5_SET(fte_match_set_lyr_2_4, headers_c,
                         ethertype, 0xffff);
        case IB_FLOW_SPEC_TCP:
                if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
                                         LAST_TCP_UDP_FIELD))
-                       return -ENOTSUPP;
+                       return -EOPNOTSUPP;
  
                MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
                         0xff);
        case IB_FLOW_SPEC_UDP:
                if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
                                         LAST_TCP_UDP_FIELD))
-                       return -ENOTSUPP;
+                       return -EOPNOTSUPP;
  
                MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
                         0xff);
        case IB_FLOW_SPEC_VXLAN_TUNNEL:
                if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
                                         LAST_TUNNEL_FIELD))
-                       return -ENOTSUPP;
+                       return -EOPNOTSUPP;
  
                MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
                         ntohl(ib_spec->tunnel.mask.tunnel_id));
                MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
                         ntohl(ib_spec->tunnel.val.tunnel_id));
                break;
+       case IB_FLOW_SPEC_ACTION_TAG:
+               if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
+                                        LAST_FLOW_TAG_FIELD))
+                       return -EOPNOTSUPP;
+               if (ib_spec->flow_tag.tag_id >= BIT(24))
+                       return -EINVAL;
+               *tag_id = ib_spec->flow_tag.tag_id;
+               break;
        default:
                return -EINVAL;
        }
@@@ -2046,6 -2109,7 +2118,7 @@@ static struct mlx5_ib_flow_handler *cre
        struct mlx5_flow_spec *spec;
        const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
        unsigned int spec_index;
+       u32 flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
        int err = 0;
  
        if (!is_valid_attr(flow_attr))
  
        for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
                err = parse_flow_attr(spec->match_criteria,
-                                     spec->match_value, ib_flow);
+                                     spec->match_value, ib_flow, &flow_tag);
                if (err < 0)
                        goto free;
  
        spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
        flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
                MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
-       flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
+       if (flow_tag != MLX5_FS_DEFAULT_FLOW_TAG &&
+           (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
+            flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
+               mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
+                            flow_tag, flow_attr->type);
+               err = -EINVAL;
+               goto free;
+       }
+       flow_act.flow_tag = flow_tag;
        handler->rule = mlx5_add_flow_rules(ft, spec,
                                            &flow_act,
                                            dst, 1);
@@@ -2542,6 -2615,35 +2624,35 @@@ static void mlx5_ib_event(struct mlx5_c
                ibdev->ib_active = false;
  }
  
+ static int set_has_smi_cap(struct mlx5_ib_dev *dev)
+ {
+       struct mlx5_hca_vport_context vport_ctx;
+       int err;
+       int port;
+       for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
+               dev->mdev->port_caps[port - 1].has_smi = false;
+               if (MLX5_CAP_GEN(dev->mdev, port_type) ==
+                   MLX5_CAP_PORT_TYPE_IB) {
+                       if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
+                               err = mlx5_query_hca_vport_context(dev->mdev, 0,
+                                                                  port, 0,
+                                                                  &vport_ctx);
+                               if (err) {
+                                       mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
+                                                   port, err);
+                                       return err;
+                               }
+                               dev->mdev->port_caps[port - 1].has_smi =
+                                       vport_ctx.has_smi;
+                       } else {
+                               dev->mdev->port_caps[port - 1].has_smi = true;
+                       }
+               }
+       }
+       return 0;
+ }
  static void get_ext_port_caps(struct mlx5_ib_dev *dev)
  {
        int port;
@@@ -2566,6 -2668,10 +2677,10 @@@ static int get_port_caps(struct mlx5_ib
        if (!dprops)
                goto out;
  
+       err = set_has_smi_cap(dev);
+       if (err)
+               goto out;
        err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
        if (err) {
                mlx5_ib_warn(dev, "query_device failed %d\n", err);
        }
  
        for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
+               memset(pprops, 0, sizeof(*pprops));
                err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
                if (err) {
                        mlx5_ib_warn(dev, "query_port %d failed %d\n",
@@@ -2867,11 -2974,13 +2983,13 @@@ static u32 get_core_cap_flags(struct ib
        if (ll == IB_LINK_LAYER_INFINIBAND)
                return RDMA_CORE_PORT_IBA_IB;
  
+       ret = RDMA_CORE_PORT_RAW_PACKET;
        if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
-               return 0;
+               return ret;
  
        if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
-               return 0;
+               return ret;
  
        if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
                ret |= RDMA_CORE_PORT_IBA_ROCE;
@@@ -2890,7 -2999,9 +3008,9 @@@ static int mlx5_port_immutable(struct i
        enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
        int err;
  
-       err = mlx5_ib_query_port(ibdev, port_num, &attr);
+       immutable->core_cap_flags = get_core_cap_flags(ibdev);
+       err = ib_query_port(ibdev, port_num, &attr);
        if (err)
                return err;
  
@@@ -3011,13 -3122,102 +3131,102 @@@ static void mlx5_disable_eth(struct mlx
                mlx5_nic_vport_disable_roce(dev->mdev);
  }
  
+ struct mlx5_ib_q_counter {
+       const char *name;
+       size_t offset;
+ };
+ #define INIT_Q_COUNTER(_name)         \
+       { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
+ static const struct mlx5_ib_q_counter basic_q_cnts[] = {
+       INIT_Q_COUNTER(rx_write_requests),
+       INIT_Q_COUNTER(rx_read_requests),
+       INIT_Q_COUNTER(rx_atomic_requests),
+       INIT_Q_COUNTER(out_of_buffer),
+ };
+ static const struct mlx5_ib_q_counter out_of_seq_q_cnts[] = {
+       INIT_Q_COUNTER(out_of_sequence),
+ };
+ static const struct mlx5_ib_q_counter retrans_q_cnts[] = {
+       INIT_Q_COUNTER(duplicate_request),
+       INIT_Q_COUNTER(rnr_nak_retry_err),
+       INIT_Q_COUNTER(packet_seq_err),
+       INIT_Q_COUNTER(implied_nak_seq_err),
+       INIT_Q_COUNTER(local_ack_timeout_err),
+ };
  static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
  {
        unsigned int i;
  
-       for (i = 0; i < dev->num_ports; i++)
+       for (i = 0; i < dev->num_ports; i++) {
                mlx5_core_dealloc_q_counter(dev->mdev,
-                                           dev->port[i].q_cnt_id);
+                                           dev->port[i].q_cnts.set_id);
+               kfree(dev->port[i].q_cnts.names);
+               kfree(dev->port[i].q_cnts.offsets);
+       }
+ }
+ static int __mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev,
+                                     const char ***names,
+                                     size_t **offsets,
+                                     u32 *num)
+ {
+       u32 num_counters;
+       num_counters = ARRAY_SIZE(basic_q_cnts);
+       if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
+               num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
+       if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
+               num_counters += ARRAY_SIZE(retrans_q_cnts);
+       *names = kcalloc(num_counters, sizeof(**names), GFP_KERNEL);
+       if (!*names)
+               return -ENOMEM;
+       *offsets = kcalloc(num_counters, sizeof(**offsets), GFP_KERNEL);
+       if (!*offsets)
+               goto err_names;
+       *num = num_counters;
+       return 0;
+ err_names:
+       kfree(*names);
+       return -ENOMEM;
+ }
+ static void mlx5_ib_fill_q_counters(struct mlx5_ib_dev *dev,
+                                   const char **names,
+                                   size_t *offsets)
+ {
+       int i;
+       int j = 0;
+       for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
+               names[j] = basic_q_cnts[i].name;
+               offsets[j] = basic_q_cnts[i].offset;
+       }
+       if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
+               for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
+                       names[j] = out_of_seq_q_cnts[i].name;
+                       offsets[j] = out_of_seq_q_cnts[i].offset;
+               }
+       }
+       if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
+               for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
+                       names[j] = retrans_q_cnts[i].name;
+                       offsets[j] = retrans_q_cnts[i].offset;
+               }
+       }
  }
  
  static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
        int ret;
  
        for (i = 0; i < dev->num_ports; i++) {
+               struct mlx5_ib_port *port = &dev->port[i];
                ret = mlx5_core_alloc_q_counter(dev->mdev,
-                                               &dev->port[i].q_cnt_id);
+                                               &port->q_cnts.set_id);
                if (ret) {
                        mlx5_ib_warn(dev,
                                     "couldn't allocate queue counter for port %d, err %d\n",
                                     i + 1, ret);
                        goto dealloc_counters;
                }
+               ret = __mlx5_ib_alloc_q_counters(dev,
+                                                &port->q_cnts.names,
+                                                &port->q_cnts.offsets,
+                                                &port->q_cnts.num_counters);
+               if (ret)
+                       goto dealloc_counters;
+               mlx5_ib_fill_q_counters(dev, port->q_cnts.names,
+                                       port->q_cnts.offsets);
        }
  
        return 0;
  dealloc_counters:
        while (--i >= 0)
                mlx5_core_dealloc_q_counter(dev->mdev,
-                                           dev->port[i].q_cnt_id);
+                                           dev->port[i].q_cnts.set_id);
  
        return ret;
  }
  
- static const char * const names[] = {
-       "rx_write_requests",
-       "rx_read_requests",
-       "rx_atomic_requests",
-       "out_of_buffer",
-       "out_of_sequence",
-       "duplicate_request",
-       "rnr_nak_retry_err",
-       "packet_seq_err",
-       "implied_nak_seq_err",
-       "local_ack_timeout_err",
- };
- static const size_t stats_offsets[] = {
-       MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests),
-       MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests),
-       MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests),
-       MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer),
-       MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence),
-       MLX5_BYTE_OFF(query_q_counter_out, duplicate_request),
-       MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err),
-       MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err),
-       MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err),
-       MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err),
- };
  static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
                                                    u8 port_num)
  {
-       BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets));
+       struct mlx5_ib_dev *dev = to_mdev(ibdev);
+       struct mlx5_ib_port *port = &dev->port[port_num - 1];
  
        /* We support only per port stats */
        if (port_num == 0)
                return NULL;
  
-       return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names),
+       return rdma_alloc_hw_stats_struct(port->q_cnts.names,
+                                         port->q_cnts.num_counters,
                                          RDMA_HW_STATS_DEFAULT_LIFESPAN);
  }
  
  static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
                                struct rdma_hw_stats *stats,
-                               u8 port, int index)
+                               u8 port_num, int index)
  {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
+       struct mlx5_ib_port *port = &dev->port[port_num - 1];
        int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
        void *out;
        __be32 val;
        int ret;
        int i;
  
-       if (!port || !stats)
+       if (!stats)
                return -ENOSYS;
  
        out = mlx5_vzalloc(outlen);
                return -ENOMEM;
  
        ret = mlx5_core_query_q_counter(dev->mdev,
-                                       dev->port[port - 1].q_cnt_id, 0,
+                                       port->q_cnts.set_id, 0,
                                        out, outlen);
        if (ret)
                goto free;
  
-       for (i = 0; i < ARRAY_SIZE(names); i++) {
-               val = *(__be32 *)(out + stats_offsets[i]);
+       for (i = 0; i < port->q_cnts.num_counters; i++) {
+               val = *(__be32 *)(out + port->q_cnts.offsets[i]);
                stats->value[i] = (u64)be32_to_cpu(val);
        }
  free:
        kvfree(out);
-       return ARRAY_SIZE(names);
+       return port->q_cnts.num_counters;
  }
  
  static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
                        (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
        }
  
-       if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
-           MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
+       if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
                dev->ib_dev.get_hw_stats        = mlx5_ib_get_hw_stats;
                dev->ib_dev.alloc_hw_stats      = mlx5_ib_alloc_hw_stats;
        }
        if (err)
                goto err_rsrc;
  
-       err = mlx5_ib_alloc_q_counters(dev);
-       if (err)
-               goto err_odp;
+       if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
+               err = mlx5_ib_alloc_q_counters(dev);
+               if (err)
+                       goto err_odp;
+       }
  
        dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
        if (!dev->mdev->priv.uar)
@@@ -3373,7 -3564,8 +3573,8 @@@ err_uar_page
        mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
  
  err_q_cnt:
-       mlx5_ib_dealloc_q_counters(dev);
+       if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
+               mlx5_ib_dealloc_q_counters(dev);
  
  err_odp:
        mlx5_ib_odp_remove_one(dev);
@@@ -3406,7 -3598,8 +3607,8 @@@ static void mlx5_ib_remove(struct mlx5_
        mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
        mlx5_free_bfreg(dev->mdev, &dev->bfreg);
        mlx5_put_uars_page(dev->mdev, mdev->priv.uar);
-       mlx5_ib_dealloc_q_counters(dev);
+       if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
+               mlx5_ib_dealloc_q_counters(dev);
        destroy_umrc_res(dev);
        mlx5_ib_odp_remove_one(dev);
        destroy_dev_resources(&dev->devr);
@@@ -3430,8 -3623,7 +3632,7 @@@ static int __init mlx5_ib_init(void
  {
        int err;
  
-       if (deprecated_prof_sel != 2)
-               pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
+       mlx5_ib_odp_init();
  
        err = mlx5_register_interface(&mlx5_ib_interface);
  
index e31bf11ae64fccdda3bb85863ea4da0da902b9bb,69a1604a887c4a91f6c6ff3cf61f17790a668c72..ad8a2638e339b4bf0d7e866cfbf0fa08b8f13a1b
@@@ -905,7 -905,10 +905,10 @@@ static int create_kernel_qp(struct mlx5
        else
                qp->bf.bfreg = &dev->bfreg;
  
-       qp->bf.buf_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
+       /* We need to divide by two since each register is comprised of
+        * two buffers of identical size, namely odd and even
+        */
+       qp->bf.buf_size = (1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size)) / 2;
        uar_index = qp->bf.bfreg->index;
  
        err = calc_sq_size(dev, init_attr, qp);
@@@ -1141,7 -1144,8 +1144,8 @@@ static int create_raw_packet_qp_rq(stru
                return -ENOMEM;
  
        rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
-       MLX5_SET(rqc, rqc, vsd, 1);
+       if (!(rq->flags & MLX5_IB_RQ_CVLAN_STRIPPING))
+               MLX5_SET(rqc, rqc, vsd, 1);
        MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE);
        MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
        MLX5_SET(rqc, rqc, flush_in_error_en, 1);
@@@ -1238,6 -1242,8 +1242,8 @@@ static int create_raw_packet_qp(struct 
        if (qp->rq.wqe_cnt) {
                rq->base.container_mibqp = qp;
  
+               if (qp->flags & MLX5_IB_QP_CVLAN_STRIPPING)
+                       rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING;
                err = create_raw_packet_qp_rq(dev, rq, in);
                if (err)
                        goto err_destroy_sq;
@@@ -1559,6 -1565,14 +1565,14 @@@ static int create_qp_common(struct mlx5
        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
                qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
  
+       if (init_attr->create_flags & IB_QP_CREATE_CVLAN_STRIPPING) {
+               if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
+                     MLX5_CAP_ETH(dev->mdev, vlan_cap)) ||
+                   (init_attr->qp_type != IB_QPT_RAW_PACKET))
+                       return -EOPNOTSUPP;
+               qp->flags |= MLX5_IB_QP_CVLAN_STRIPPING;
+       }
        if (pd && pd->uobject) {
                if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
                        mlx5_ib_dbg(dev, "copy failed\n");
@@@ -2198,6 -2212,7 +2212,7 @@@ static int mlx5_set_path(struct mlx5_ib
  {
        enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port);
        int err;
+       enum ib_gid_type gid_type;
  
        if (attr_mask & IB_QP_PKEY_INDEX)
                path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index :
        if (ll == IB_LINK_LAYER_ETHERNET) {
                if (!(ah->ah_flags & IB_AH_GRH))
                        return -EINVAL;
+               err = mlx5_get_roce_gid_type(dev, port, ah->grh.sgid_index,
+                                            &gid_type);
+               if (err)
+                       return err;
                memcpy(path->rmac, ah->dmac, sizeof(ah->dmac));
                path->udp_sport = mlx5_get_roce_udp_sport(dev, port,
                                                          ah->grh.sgid_index);
                path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4;
+               if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+                       path->ecn_dscp = (ah->grh.traffic_class >> 2) & 0x3f;
        } else {
                path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
                path->fl_free_ar |=
@@@ -2422,7 -2443,7 +2443,7 @@@ static int modify_raw_packet_qp_rq(stru
        if (raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) {
                if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) {
                        MLX5_SET64(modify_rq_in, in, modify_bitmask,
-                                  MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID);
+                                  MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID);
                        MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id);
                } else
                        pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n",
@@@ -2777,7 -2798,7 +2798,7 @@@ static int __mlx5_ib_modify_qp(struct i
                               qp->port) - 1;
                mibport = &dev->port[port_num];
                context->qp_counter_set_usr_page |=
-                       cpu_to_be32((u32)(mibport->q_cnt_id) << 24);
+                       cpu_to_be32((u32)(mibport->q_cnts.set_id) << 24);
        }
  
        if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
  
                raw_qp_param.operation = op;
                if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
-                       raw_qp_param.rq_q_ctr_id = mibport->q_cnt_id;
+                       raw_qp_param.rq_q_ctr_id = mibport->q_cnts.set_id;
                        raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID;
                }
  
@@@ -2984,20 -3005,20 +3005,20 @@@ static void *set_eth_seg(struct mlx5_wq
  
        if (wr->opcode == IB_WR_LSO) {
                struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr);
 -              int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start);
 +              int size_of_inl_hdr_start = sizeof(eseg->inline_hdr.start);
                u64 left, leftlen, copysz;
                void *pdata = ud_wr->header;
  
                left = ud_wr->hlen;
                eseg->mss = cpu_to_be16(ud_wr->mss);
 -              eseg->inline_hdr_sz = cpu_to_be16(left);
 +              eseg->inline_hdr.sz = cpu_to_be16(left);
  
                /*
                 * check if there is space till the end of queue, if yes,
                 * copy all in one shot, otherwise copy till the end of queue,
                 * rollback and than the copy the left
                 */
 -              leftlen = qend - (void *)eseg->inline_hdr_start;
 +              leftlen = qend - (void *)eseg->inline_hdr.start;
                copysz = min_t(u64, leftlen, left);
  
                memcpy(seg - size_of_inl_hdr_start, pdata, copysz);
@@@ -3637,8 -3658,9 +3658,9 @@@ static int set_psv_wr(struct ib_sig_dom
                psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag);
                break;
        default:
-               pr_err("Bad signature type given.\n");
-               return 1;
+               pr_err("Bad signature type (%d) is given.\n",
+                      domain->sig_type);
+               return -EINVAL;
        }
  
        *seg += sizeof(*psv_seg);
@@@ -3978,6 -4000,12 +4000,12 @@@ int mlx5_ib_post_send(struct ib_qp *ibq
                        break;
  
                case IB_QPT_SMI:
+                       if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) {
+                               mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n");
+                               err = -EPERM;
+                               *bad_wr = wr;
+                               goto out;
+                       }
                case MLX5_IB_QPT_HW_GSI:
                        set_datagram_seg(seg, wr);
                        seg += sizeof(struct mlx5_wqe_datagram_seg);
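Note that the new IB_QPT_SMI case deliberately has no break: after the has_smi check it falls through to the MLX5_IB_QPT_HW_GSI handling, so both MAD QP types share the datagram segment setup. Condensed:

    case IB_QPT_SMI:
            if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) {
                    err = -EPERM;           /* SMP MADs not allowed on this port */
                    *bad_wr = wr;
                    goto out;
            }
            /* fall through */
    case MLX5_IB_QPT_HW_GSI:
            set_datagram_seg(seg, wr);      /* shared UD address handling */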
@@@ -4579,6 -4607,7 +4607,7 @@@ static int  create_rq(struct mlx5_ib_rw
                      struct ib_wq_init_attr *init_attr)
  {
        struct mlx5_ib_dev *dev;
+       int has_net_offloads;
        __be64 *rq_pas0;
        void *in;
        void *rqc;
        MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size);
        MLX5_SET(wq, wq, wq_signature, rwq->wq_sig);
        MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma);
+       has_net_offloads = MLX5_CAP_GEN(dev->mdev, eth_net_offloads);
+       if (init_attr->create_flags & IB_WQ_FLAGS_CVLAN_STRIPPING) {
+               if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, vlan_cap))) {
+                       mlx5_ib_dbg(dev, "VLAN offloads are not supported\n");
+                       err = -EOPNOTSUPP;
+                       goto out;
+               }
+       } else {
+               MLX5_SET(rqc, rqc, vsd, 1);
+       }
+       if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS) {
+               if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, scatter_fcs))) {
+                       mlx5_ib_dbg(dev, "Scatter FCS is not supported\n");
+                       err = -EOPNOTSUPP;
+                       goto out;
+               }
+               MLX5_SET(rqc, rqc, scatter_fcs, 1);
+       }
        rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
        mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0);
        err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp);
+ out:
        kvfree(in);
        return err;
  }
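The create_rq() additions follow one pattern: a requested offload flag is honoured only if eth_net_offloads plus the matching ETH capability are present, otherwise creation fails with -EOPNOTSUPP; when CVLAN stripping is not requested the RQ is created with vsd (VLAN strip disable) set. Reduced to its core:

    int has = MLX5_CAP_GEN(dev->mdev, eth_net_offloads);

    if (flags & IB_WQ_FLAGS_CVLAN_STRIPPING) {
            if (!(has && MLX5_CAP_ETH(dev->mdev, vlan_cap)))
                    return -EOPNOTSUPP;     /* HW cannot strip the CVLAN tag */
    } else {
            MLX5_SET(rqc, rqc, vsd, 1);     /* leave the VLAN tag in the packet */
    }
    if (flags & IB_WQ_FLAGS_SCATTER_FCS) {
            if (!(has && MLX5_CAP_ETH(dev->mdev, scatter_fcs)))
                    return -EOPNOTSUPP;     /* HW cannot scatter the FCS */
            MLX5_SET(rqc, rqc, scatter_fcs, 1);
    }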
@@@ -4896,10 -4944,37 +4944,37 @@@ int mlx5_ib_modify_wq(struct ib_wq *wq
        MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state);
        MLX5_SET(rqc, rqc, state, wq_state);
  
+       if (wq_attr_mask & IB_WQ_FLAGS) {
+               if (wq_attr->flags_mask & IB_WQ_FLAGS_CVLAN_STRIPPING) {
+                       if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
+                             MLX5_CAP_ETH(dev->mdev, vlan_cap))) {
+                               mlx5_ib_dbg(dev, "VLAN offloads are not "
+                                           "supported\n");
+                               err = -EOPNOTSUPP;
+                               goto out;
+                       }
+                       MLX5_SET64(modify_rq_in, in, modify_bitmask,
+                                  MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD);
+                       MLX5_SET(rqc, rqc, vsd,
+                                (wq_attr->flags & IB_WQ_FLAGS_CVLAN_STRIPPING) ? 0 : 1);
+               }
+       }
+       if (curr_wq_state == IB_WQS_RESET && wq_state == IB_WQS_RDY) {
+               if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) {
+                       MLX5_SET64(modify_rq_in, in, modify_bitmask,
+                                  MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID);
+                       MLX5_SET(rqc, rqc, counter_set_id, dev->port->q_cnts.set_id);
+               } else
+                       pr_info_once("%s: Receive WQ counters are not supported on current FW\n",
+                                    dev->ib_dev.name);
+       }
        err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen);
-       kvfree(in);
        if (!err)
                rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state;
  
+ out:
+       kvfree(in);
        return err;
  }
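With the new early exits in mlx5_ib_modify_wq(), kvfree(in) moves behind a single out: label so every path frees the command buffer exactly once. The resulting shape, elided to the relevant lines:

    if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
          MLX5_CAP_ETH(dev->mdev, vlan_cap))) {
            err = -EOPNOTSUPP;
            goto out;               /* still frees the command buffer */
    }
    ...
    err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen);
    out:
            kvfree(in);             /* single exit point */
            return err;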
index 5a31f3c6a4211d507cc4634c49df53021bba505b,b7179f4ac3a6312d9436503122cc8202495db70f..d3eae2f3e9f504957305e4bda59f837327bc69f7
@@@ -475,10 -475,20 +475,10 @@@ static int nes_query_port(struct ib_dev
        struct nes_vnic *nesvnic = to_nesvnic(ibdev);
        struct net_device *netdev = nesvnic->netdev;
  
-       memset(props, 0, sizeof(*props));
+       /* props being zeroed by the caller, avoid zeroing it here */
  
        props->max_mtu = IB_MTU_4096;
 -
 -      if (netdev->mtu  >= 4096)
 -              props->active_mtu = IB_MTU_4096;
 -      else if (netdev->mtu  >= 2048)
 -              props->active_mtu = IB_MTU_2048;
 -      else if (netdev->mtu  >= 1024)
 -              props->active_mtu = IB_MTU_1024;
 -      else if (netdev->mtu  >= 512)
 -              props->active_mtu = IB_MTU_512;
 -      else
 -              props->active_mtu = IB_MTU_256;
 +      props->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
  
        props->lid = 1;
        props->lmc = 0;
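ib_mtu_int_to_enum() is added to include/rdma/ib_verbs.h by this series and is equivalent to the open-coded ladder deleted here: it maps a byte MTU to the largest IB_MTU_* value that still fits. Roughly:

    static inline enum ib_mtu ib_mtu_int_to_enum(int mtu)
    {
            if (mtu >= 4096)
                    return IB_MTU_4096;
            else if (mtu >= 2048)
                    return IB_MTU_2048;
            else if (mtu >= 1024)
                    return IB_MTU_1024;
            else if (mtu >= 512)
                    return IB_MTU_512;
            else
                    return IB_MTU_256;
    }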
@@@ -3660,13 -3670,14 +3660,14 @@@ static int nes_port_immutable(struct ib
        struct ib_port_attr attr;
        int err;
  
+       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
        err = nes_query_port(ibdev, port_num, &attr);
        if (err)
                return err;
  
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
  
        return 0;
  }
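Moving the core_cap_flags assignment above the query is part of the "Query ports via the core" change: drivers that now call ib_query_port() from their get_port_immutable hook (qedr and pvrdma below) need the protocol flags populated first, because the core's query path looks at them, and nes is kept in the same shape for consistency. The common pattern, with a generic driver prefix:

    static int xxx_port_immutable(struct ib_device *ibdev, u8 port_num,
                                  struct ib_port_immutable *immutable)
    {
            struct ib_port_attr attr;
            int err;

            immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; /* set before querying */
            err = ib_query_port(ibdev, port_num, &attr);
            if (err)
                    return err;

            immutable->pkey_tbl_len = attr.pkey_tbl_len;
            immutable->gid_tbl_len = attr.gid_tbl_len;
            return 0;
    }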
index e06ad72509636414414eb393076d425fe7e32c9e,013d15c7e59393a6990908931574af17891f8a39..bc9fb144e57b8e3a05863223156f25b7f17e6a34
@@@ -210,6 -210,7 +210,7 @@@ int ocrdma_query_port(struct ib_device 
        struct ocrdma_dev *dev;
        struct net_device *netdev;
  
+       /* props being zeroed by the caller, avoid zeroing it here */
        dev = get_ocrdma_dev(ibdev);
        if (port > 1) {
                pr_err("%s(%d) invalid_port=0x%x\n", __func__,
@@@ -1170,7 -1171,8 +1171,7 @@@ int ocrdma_destroy_cq(struct ib_cq *ibc
  
        dev->cq_tbl[cq->id] = NULL;
        indx = ocrdma_get_eq_table_index(dev, cq->eqn);
 -      if (indx == -EINVAL)
 -              BUG();
 +      BUG_ON(indx == -EINVAL);
  
        eq = &dev->eq_tbl[indx];
        irq = ocrdma_get_irq(dev, eq);
@@@ -1740,7 -1742,8 +1741,7 @@@ static void ocrdma_discard_cqes(struct 
                                wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >>
                                        OCRDMA_CQE_BUFTAG_SHIFT) &
                                        qp->srq->rq.max_wqe_idx;
 -                              if (wqe_idx < 1)
 -                                      BUG();
 +                              BUG_ON(wqe_idx < 1);
                                spin_lock_irqsave(&qp->srq->q_lock, flags);
                                ocrdma_hwq_inc_tail(&qp->srq->rq);
                                ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1);
@@@ -2386,13 -2389,15 +2387,13 @@@ static int ocrdma_srq_get_idx(struct oc
                if (srq->idx_bit_fields[row]) {
                        indx = ffs(srq->idx_bit_fields[row]);
                        indx = (row * 32) + (indx - 1);
 -                      if (indx >= srq->rq.max_cnt)
 -                              BUG();
 +                      BUG_ON(indx >= srq->rq.max_cnt);
                        ocrdma_srq_toggle_bit(srq, indx);
                        break;
                }
        }
  
 -      if (row == srq->bit_fields_len)
 -              BUG();
 +      BUG_ON(row == srq->bit_fields_len);
        return indx + 1; /* Use from index 1 */
  }
  
@@@ -2750,7 -2755,8 +2751,7 @@@ static void ocrdma_update_free_srq_cqe(
        srq = get_ocrdma_srq(qp->ibqp.srq);
        wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >>
                OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx;
 -      if (wqe_idx < 1)
 -              BUG();
 +      BUG_ON(wqe_idx < 1);
  
        ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx];
        spin_lock_irqsave(&srq->q_lock, flags);
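The ocrdma hunks are mechanical: an open-coded "if (cond) BUG();" becomes "BUG_ON(cond);", the idiomatic form that also keeps the condition visible in the resulting oops. For example:

    /* before */
    if (wqe_idx < 1)
            BUG();

    /* after */
    BUG_ON(wqe_idx < 1);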
index 0c51657af151c02e8cae67bdedca882232d05fff,91a16d38d1fc2afcbde05ef8659720744e0dee96..6b3bb32803bd8661d9efebd14dcefff0b601f6f3
@@@ -238,8 -238,8 +238,8 @@@ int qedr_query_port(struct ib_device *i
        }
  
        rdma_port = dev->ops->rdma_query_port(dev->rdma_ctx);
-       memset(attr, 0, sizeof(*attr));
  
+       /* *attr being zeroed by the caller, avoid zeroing it here */
        if (rdma_port->port_state == QED_RDMA_PORT_UP) {
                attr->state = IB_PORT_ACTIVE;
                attr->phys_state = 5;
@@@ -471,6 -471,8 +471,6 @@@ struct ib_pd *qedr_alloc_pd(struct ib_d
                            struct ib_ucontext *context, struct ib_udata *udata)
  {
        struct qedr_dev *dev = get_qedr_dev(ibdev);
 -      struct qedr_ucontext *uctx = NULL;
 -      struct qedr_alloc_pd_uresp uresp;
        struct qedr_pd *pd;
        u16 pd_id;
        int rc;
        if (!pd)
                return ERR_PTR(-ENOMEM);
  
 -      dev->ops->rdma_alloc_pd(dev->rdma_ctx, &pd_id);
 +      rc = dev->ops->rdma_alloc_pd(dev->rdma_ctx, &pd_id);
 +      if (rc)
 +              goto err;
  
 -      uresp.pd_id = pd_id;
        pd->pd_id = pd_id;
  
        if (udata && context) {
 +              struct qedr_alloc_pd_uresp uresp;
 +
 +              uresp.pd_id = pd_id;
 +
                rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 -              if (rc)
 +              if (rc) {
                        DP_ERR(dev, "copy error pd_id=0x%x.\n", pd_id);
 -              uctx = get_qedr_ucontext(context);
 -              uctx->pd = pd;
 -              pd->uctx = uctx;
 +                      dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd_id);
 +                      goto err;
 +              }
 +
 +              pd->uctx = get_qedr_ucontext(context);
 +              pd->uctx->pd = pd;
        }
  
        return &pd->ibpd;
 +
 +err:
 +      kfree(pd);
 +      return ERR_PTR(rc);
  }
  
  int qedr_dealloc_pd(struct ib_pd *ibpd)
@@@ -771,10 -761,8 +771,10 @@@ static inline int qedr_init_user_queue(
                goto err0;
  
        q->pbl_tbl = qedr_alloc_pbl_tbl(dev, &q->pbl_info, GFP_KERNEL);
 -      if (IS_ERR_OR_NULL(q->pbl_tbl))
 +      if (IS_ERR(q->pbl_tbl)) {
 +              rc = PTR_ERR(q->pbl_tbl);
                goto err0;
 +      }
  
        qedr_populate_pbls(dev, q->umem, q->pbl_tbl, &q->pbl_info);
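qedr_alloc_pbl_tbl() reports failure with an ERR_PTR-encoded pointer rather than NULL (the matching init_mr_info() change later in this file makes the same switch), so the IS_ERR_OR_NULL() test was misleading and the real errno was being dropped. The corrected pattern:

    q->pbl_tbl = qedr_alloc_pbl_tbl(dev, &q->pbl_info, GFP_KERNEL);
    if (IS_ERR(q->pbl_tbl)) {
            rc = PTR_ERR(q->pbl_tbl);       /* propagate the real error code */
            goto err0;
    }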
  
@@@ -1088,6 -1076,30 +1088,6 @@@ static inline int get_gid_info_from_tab
        return 0;
  }
  
 -static void qedr_cleanup_user_sq(struct qedr_dev *dev, struct qedr_qp *qp)
 -{
 -      qedr_free_pbl(dev, &qp->usq.pbl_info, qp->usq.pbl_tbl);
 -      ib_umem_release(qp->usq.umem);
 -}
 -
 -static void qedr_cleanup_user_rq(struct qedr_dev *dev, struct qedr_qp *qp)
 -{
 -      qedr_free_pbl(dev, &qp->urq.pbl_info, qp->urq.pbl_tbl);
 -      ib_umem_release(qp->urq.umem);
 -}
 -
 -static void qedr_cleanup_kernel_sq(struct qedr_dev *dev, struct qedr_qp *qp)
 -{
 -      dev->ops->common->chain_free(dev->cdev, &qp->sq.pbl);
 -      kfree(qp->wqe_wr_id);
 -}
 -
 -static void qedr_cleanup_kernel_rq(struct qedr_dev *dev, struct qedr_qp *qp)
 -{
 -      dev->ops->common->chain_free(dev->cdev, &qp->rq.pbl);
 -      kfree(qp->rqe_wr_id);
 -}
 -
  static int qedr_check_qp_attrs(struct ib_pd *ibpd, struct qedr_dev *dev,
                               struct ib_qp_init_attr *attrs)
  {
@@@ -1176,13 -1188,15 +1176,13 @@@ static int qedr_copy_qp_uresp(struct qe
        return rc;
  }
  
 -static void qedr_set_qp_init_params(struct qedr_dev *dev,
 -                                  struct qedr_qp *qp,
 -                                  struct qedr_pd *pd,
 -                                  struct ib_qp_init_attr *attrs)
 +static void qedr_set_common_qp_params(struct qedr_dev *dev,
 +                                    struct qedr_qp *qp,
 +                                    struct qedr_pd *pd,
 +                                    struct ib_qp_init_attr *attrs)
  {
 -      qp->pd = pd;
 -
        spin_lock_init(&qp->q_lock);
 -
 +      qp->pd = pd;
        qp->qp_type = attrs->qp_type;
        qp->max_inline_data = attrs->cap.max_inline_data;
        qp->sq.max_sges = attrs->cap.max_send_sge;
        qp->sq_cq = get_qedr_cq(attrs->send_cq);
        qp->rq_cq = get_qedr_cq(attrs->recv_cq);
        qp->dev = dev;
 +      qp->rq.max_sges = attrs->cap.max_recv_sge;
  
 +      DP_DEBUG(dev, QEDR_MSG_QP,
 +               "RQ params:\trq_max_sges = %d, rq_cq_id = %d\n",
 +               qp->rq.max_sges, qp->rq_cq->icid);
        DP_DEBUG(dev, QEDR_MSG_QP,
                 "QP params:\tpd = %d, qp_type = %d, max_inline_data = %d, state = %d, signaled = %d, use_srq=%d\n",
                 pd->pd_id, qp->qp_type, qp->max_inline_data,
        DP_DEBUG(dev, QEDR_MSG_QP,
                 "SQ params:\tsq_max_sges = %d, sq_cq_id = %d\n",
                 qp->sq.max_sges, qp->sq_cq->icid);
 -      qp->rq.max_sges = attrs->cap.max_recv_sge;
 -      DP_DEBUG(dev, QEDR_MSG_QP,
 -               "RQ params:\trq_max_sges = %d, rq_cq_id = %d\n",
 -               qp->rq.max_sges, qp->rq_cq->icid);
 -}
 -
 -static inline void
 -qedr_init_qp_user_params(struct qed_rdma_create_qp_in_params *params,
 -                       struct qedr_create_qp_ureq *ureq)
 -{
 -      /* QP handle to be written in CQE */
 -      params->qp_handle_lo = ureq->qp_handle_lo;
 -      params->qp_handle_hi = ureq->qp_handle_hi;
  }
  
 -static inline void
 -qedr_init_qp_kernel_doorbell_sq(struct qedr_dev *dev, struct qedr_qp *qp)
 +static void qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
  {
        qp->sq.db = dev->db_addr +
                    DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD);
        qp->sq.db_data.data.icid = qp->icid + 1;
 +      qp->rq.db = dev->db_addr +
 +                  DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD);
 +      qp->rq.db_data.data.icid = qp->icid;
  }
  
  static inline void
 -qedr_init_qp_kernel_doorbell_rq(struct qedr_dev *dev, struct qedr_qp *qp)
 +qedr_init_common_qp_in_params(struct qedr_dev *dev,
 +                            struct qedr_pd *pd,
 +                            struct qedr_qp *qp,
 +                            struct ib_qp_init_attr *attrs,
 +                            bool fmr_and_reserved_lkey,
 +                            struct qed_rdma_create_qp_in_params *params)
  {
 -      qp->rq.db = dev->db_addr +
 -                  DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD);
 -      qp->rq.db_data.data.icid = qp->icid;
 +      /* QP handle to be written in an async event */
 +      params->qp_handle_async_lo = lower_32_bits((uintptr_t) qp);
 +      params->qp_handle_async_hi = upper_32_bits((uintptr_t) qp);
 +
 +      params->signal_all = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR);
 +      params->fmr_and_reserved_lkey = fmr_and_reserved_lkey;
 +      params->pd = pd->pd_id;
 +      params->dpi = pd->uctx ? pd->uctx->dpi : dev->dpi;
 +      params->sq_cq_id = get_qedr_cq(attrs->send_cq)->icid;
 +      params->stats_queue = 0;
 +      params->rq_cq_id = get_qedr_cq(attrs->recv_cq)->icid;
 +      params->srq_id = 0;
 +      params->use_srq = false;
  }
  
 -static inline int
 -qedr_init_qp_kernel_params_rq(struct qedr_dev *dev,
 -                            struct qedr_qp *qp, struct ib_qp_init_attr *attrs)
 +static inline void qedr_qp_user_print(struct qedr_dev *dev, struct qedr_qp *qp)
  {
 -      /* Allocate driver internal RQ array */
 -      qp->rqe_wr_id = kcalloc(qp->rq.max_wr, sizeof(*qp->rqe_wr_id),
 -                              GFP_KERNEL);
 -      if (!qp->rqe_wr_id)
 -              return -ENOMEM;
 +      DP_DEBUG(dev, QEDR_MSG_QP, "create qp: successfully created user QP. "
 +               "qp=%p. "
 +               "sq_addr=0x%llx, "
 +               "sq_len=%zd, "
 +               "rq_addr=0x%llx, "
 +               "rq_len=%zd"
 +               "\n",
 +               qp,
 +               qp->usq.buf_addr,
 +               qp->usq.buf_len, qp->urq.buf_addr, qp->urq.buf_len);
 +}
  
 -      DP_DEBUG(dev, QEDR_MSG_QP, "RQ max_wr set to %d.\n", qp->rq.max_wr);
 +static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp)
 +{
 +      if (qp->usq.umem)
 +              ib_umem_release(qp->usq.umem);
 +      qp->usq.umem = NULL;
  
 -      return 0;
 +      if (qp->urq.umem)
 +              ib_umem_release(qp->urq.umem);
 +      qp->urq.umem = NULL;
  }
  
 -static inline int
 -qedr_init_qp_kernel_params_sq(struct qedr_dev *dev,
 -                            struct qedr_qp *qp,
 -                            struct ib_qp_init_attr *attrs,
 -                            struct qed_rdma_create_qp_in_params *params)
 +static int qedr_create_user_qp(struct qedr_dev *dev,
 +                             struct qedr_qp *qp,
 +                             struct ib_pd *ibpd,
 +                             struct ib_udata *udata,
 +                             struct ib_qp_init_attr *attrs)
  {
 -      u32 temp_max_wr;
 +      struct qed_rdma_create_qp_in_params in_params;
 +      struct qed_rdma_create_qp_out_params out_params;
 +      struct qedr_pd *pd = get_qedr_pd(ibpd);
 +      struct ib_ucontext *ib_ctx = NULL;
 +      struct qedr_ucontext *ctx = NULL;
 +      struct qedr_create_qp_ureq ureq;
 +      int rc = -EINVAL;
  
 -      /* Allocate driver internal SQ array */
 -      temp_max_wr = attrs->cap.max_send_wr * dev->wq_multiplier;
 -      temp_max_wr = min_t(u32, temp_max_wr, dev->attr.max_sqe);
 +      ib_ctx = ibpd->uobject->context;
 +      ctx = get_qedr_ucontext(ib_ctx);
  
 -      /* temp_max_wr < attr->max_sqe < u16 so the casting is safe */
 -      qp->sq.max_wr = (u16)temp_max_wr;
 -      qp->wqe_wr_id = kcalloc(qp->sq.max_wr, sizeof(*qp->wqe_wr_id),
 -                              GFP_KERNEL);
 -      if (!qp->wqe_wr_id)
 -              return -ENOMEM;
 +      memset(&ureq, 0, sizeof(ureq));
 +      rc = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
 +      if (rc) {
 +              DP_ERR(dev, "Problem copying data from user space\n");
 +              return rc;
 +      }
 +
 +      /* SQ - read access only (0), dma sync not required (0) */
 +      rc = qedr_init_user_queue(ib_ctx, dev, &qp->usq, ureq.sq_addr,
 +                                ureq.sq_len, 0, 0);
 +      if (rc)
 +              return rc;
  
 -      DP_DEBUG(dev, QEDR_MSG_QP, "SQ max_wr set to %d.\n", qp->sq.max_wr);
 +      /* RQ - read access only (0), dma sync not required (0) */
 +      rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq.rq_addr,
 +                                ureq.rq_len, 0, 0);
  
 -      /* QP handle to be written in CQE */
 -      params->qp_handle_lo = lower_32_bits((uintptr_t)qp);
 -      params->qp_handle_hi = upper_32_bits((uintptr_t)qp);
 +      if (rc)
 +              return rc;
 +
 +      memset(&in_params, 0, sizeof(in_params));
 +      qedr_init_common_qp_in_params(dev, pd, qp, attrs, false, &in_params);
 +      in_params.qp_handle_lo = ureq.qp_handle_lo;
 +      in_params.qp_handle_hi = ureq.qp_handle_hi;
 +      in_params.sq_num_pages = qp->usq.pbl_info.num_pbes;
 +      in_params.sq_pbl_ptr = qp->usq.pbl_tbl->pa;
 +      in_params.rq_num_pages = qp->urq.pbl_info.num_pbes;
 +      in_params.rq_pbl_ptr = qp->urq.pbl_tbl->pa;
 +
 +      qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx,
 +                                            &in_params, &out_params);
 +
 +      if (!qp->qed_qp) {
 +              rc = -ENOMEM;
 +              goto err1;
 +      }
 +
 +      qp->qp_id = out_params.qp_id;
 +      qp->icid = out_params.icid;
 +
 +      rc = qedr_copy_qp_uresp(dev, qp, udata);
 +      if (rc)
 +              goto err;
 +
 +      qedr_qp_user_print(dev, qp);
  
        return 0;
 +err:
 +      rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
 +      if (rc)
 +              DP_ERR(dev, "create qp: fatal fault. rc=%d", rc);
 +
 +err1:
 +      qedr_cleanup_user(dev, qp);
 +      return rc;
  }
  
 -static inline int qedr_init_qp_kernel_sq(struct qedr_dev *dev,
 -                                       struct qedr_qp *qp,
 -                                       struct ib_qp_init_attr *attrs)
 +static int
 +qedr_roce_create_kernel_qp(struct qedr_dev *dev,
 +                         struct qedr_qp *qp,
 +                         struct qed_rdma_create_qp_in_params *in_params,
 +                         u32 n_sq_elems, u32 n_rq_elems)
  {
 -      u32 n_sq_elems, n_sq_entries;
 +      struct qed_rdma_create_qp_out_params out_params;
        int rc;
  
 -      /* A single work request may take up to QEDR_MAX_SQ_WQE_SIZE elements in
 -       * the ring. The ring should allow at least a single WR, even if the
 -       * user requested none, due to allocation issues.
 -       */
 -      n_sq_entries = attrs->cap.max_send_wr;
 -      n_sq_entries = min_t(u32, n_sq_entries, dev->attr.max_sqe);
 -      n_sq_entries = max_t(u32, n_sq_entries, 1);
 -      n_sq_elems = n_sq_entries * QEDR_MAX_SQE_ELEMENTS_PER_SQE;
        rc = dev->ops->common->chain_alloc(dev->cdev,
                                           QED_CHAIN_USE_TO_PRODUCE,
                                           QED_CHAIN_MODE_PBL,
                                           n_sq_elems,
                                           QEDR_SQE_ELEMENT_SIZE,
                                           &qp->sq.pbl);
 -      if (rc) {
 -              DP_ERR(dev, "failed to allocate QP %p SQ\n", qp);
 -              return rc;
 -      }
  
 -      DP_DEBUG(dev, QEDR_MSG_SQ,
 -               "SQ Pbl base addr = %llx max_send_wr=%d max_wr=%d capacity=%d, rc=%d\n",
 -               qed_chain_get_pbl_phys(&qp->sq.pbl), attrs->cap.max_send_wr,
 -               n_sq_entries, qed_chain_get_capacity(&qp->sq.pbl), rc);
 -      return 0;
 -}
 +      if (rc)
 +              return rc;
  
 -static inline int qedr_init_qp_kernel_rq(struct qedr_dev *dev,
 -                                       struct qedr_qp *qp,
 -                                       struct ib_qp_init_attr *attrs)
 -{
 -      u32 n_rq_elems, n_rq_entries;
 -      int rc;
 +      in_params->sq_num_pages = qed_chain_get_page_cnt(&qp->sq.pbl);
 +      in_params->sq_pbl_ptr = qed_chain_get_pbl_phys(&qp->sq.pbl);
  
 -      /* A single work request may take up to QEDR_MAX_RQ_WQE_SIZE elements in
 -       * the ring. There ring should allow at least a single WR, even if the
 -       * user requested none, due to allocation issues.
 -       */
 -      n_rq_entries = max_t(u32, attrs->cap.max_recv_wr, 1);
 -      n_rq_elems = n_rq_entries * QEDR_MAX_RQE_ELEMENTS_PER_RQE;
        rc = dev->ops->common->chain_alloc(dev->cdev,
                                           QED_CHAIN_USE_TO_CONSUME_PRODUCE,
                                           QED_CHAIN_MODE_PBL,
                                           n_rq_elems,
                                           QEDR_RQE_ELEMENT_SIZE,
                                           &qp->rq.pbl);
 +      if (rc)
 +              return rc;
  
 -      if (rc) {
 -              DP_ERR(dev, "failed to allocate memory for QP %p RQ\n", qp);
 -              return -ENOMEM;
 -      }
 -
 -      DP_DEBUG(dev, QEDR_MSG_RQ,
 -               "RQ Pbl base addr = %llx max_recv_wr=%d max_wr=%d capacity=%d, rc=%d\n",
 -               qed_chain_get_pbl_phys(&qp->rq.pbl), attrs->cap.max_recv_wr,
 -               n_rq_entries, qed_chain_get_capacity(&qp->rq.pbl), rc);
 +      in_params->rq_num_pages = qed_chain_get_page_cnt(&qp->rq.pbl);
 +      in_params->rq_pbl_ptr = qed_chain_get_pbl_phys(&qp->rq.pbl);
  
 -      /* n_rq_entries < u16 so the casting is safe */
 -      qp->rq.max_wr = (u16)n_rq_entries;
 +      qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx,
 +                                            in_params, &out_params);
  
 -      return 0;
 -}
 +      if (!qp->qed_qp)
 +              return -EINVAL;
  
 -static inline void
 -qedr_init_qp_in_params_sq(struct qedr_dev *dev,
 -                        struct qedr_pd *pd,
 -                        struct qedr_qp *qp,
 -                        struct ib_qp_init_attr *attrs,
 -                        struct ib_udata *udata,
 -                        struct qed_rdma_create_qp_in_params *params)
 -{
 -      /* QP handle to be written in an async event */
 -      params->qp_handle_async_lo = lower_32_bits((uintptr_t)qp);
 -      params->qp_handle_async_hi = upper_32_bits((uintptr_t)qp);
 +      qp->qp_id = out_params.qp_id;
 +      qp->icid = out_params.icid;
  
 -      params->signal_all = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR);
 -      params->fmr_and_reserved_lkey = !udata;
 -      params->pd = pd->pd_id;
 -      params->dpi = pd->uctx ? pd->uctx->dpi : dev->dpi;
 -      params->sq_cq_id = get_qedr_cq(attrs->send_cq)->icid;
 -      params->max_sq_sges = 0;
 -      params->stats_queue = 0;
 +      qedr_set_roce_db_info(dev, qp);
  
 -      if (udata) {
 -              params->sq_num_pages = qp->usq.pbl_info.num_pbes;
 -              params->sq_pbl_ptr = qp->usq.pbl_tbl->pa;
 -      } else {
 -              params->sq_num_pages = qed_chain_get_page_cnt(&qp->sq.pbl);
 -              params->sq_pbl_ptr = qed_chain_get_pbl_phys(&qp->sq.pbl);
 -      }
 +      return 0;
  }
  
 -static inline void
 -qedr_init_qp_in_params_rq(struct qedr_qp *qp,
 -                        struct ib_qp_init_attr *attrs,
 -                        struct ib_udata *udata,
 -                        struct qed_rdma_create_qp_in_params *params)
 +static void qedr_cleanup_kernel(struct qedr_dev *dev, struct qedr_qp *qp)
  {
 -      params->rq_cq_id = get_qedr_cq(attrs->recv_cq)->icid;
 -      params->srq_id = 0;
 -      params->use_srq = false;
 +      dev->ops->common->chain_free(dev->cdev, &qp->sq.pbl);
 +      kfree(qp->wqe_wr_id);
  
 -      if (udata) {
 -              params->rq_num_pages = qp->urq.pbl_info.num_pbes;
 -              params->rq_pbl_ptr = qp->urq.pbl_tbl->pa;
 -      } else {
 -              params->rq_num_pages = qed_chain_get_page_cnt(&qp->rq.pbl);
 -              params->rq_pbl_ptr = qed_chain_get_pbl_phys(&qp->rq.pbl);
 -      }
 +      dev->ops->common->chain_free(dev->cdev, &qp->rq.pbl);
 +      kfree(qp->rqe_wr_id);
  }
  
 -static inline void qedr_qp_user_print(struct qedr_dev *dev, struct qedr_qp *qp)
 +static int qedr_create_kernel_qp(struct qedr_dev *dev,
 +                               struct qedr_qp *qp,
 +                               struct ib_pd *ibpd,
 +                               struct ib_qp_init_attr *attrs)
  {
 -      DP_DEBUG(dev, QEDR_MSG_QP,
 -               "create qp: successfully created user QP. qp=%p, sq_addr=0x%llx, sq_len=%zd, rq_addr=0x%llx, rq_len=%zd\n",
 -               qp, qp->usq.buf_addr, qp->usq.buf_len, qp->urq.buf_addr,
 -               qp->urq.buf_len);
 -}
 +      struct qed_rdma_create_qp_in_params in_params;
 +      struct qedr_pd *pd = get_qedr_pd(ibpd);
 +      int rc = -EINVAL;
 +      u32 n_rq_elems;
 +      u32 n_sq_elems;
 +      u32 n_sq_entries;
  
 -static inline int qedr_init_user_qp(struct ib_ucontext *ib_ctx,
 -                                  struct qedr_dev *dev,
 -                                  struct qedr_qp *qp,
 -                                  struct qedr_create_qp_ureq *ureq)
 -{
 -      int rc;
 +      memset(&in_params, 0, sizeof(in_params));
  
 -      /* SQ - read access only (0), dma sync not required (0) */
 -      rc = qedr_init_user_queue(ib_ctx, dev, &qp->usq, ureq->sq_addr,
 -                                ureq->sq_len, 0, 0);
 -      if (rc)
 -              return rc;
 +      /* A single work request may take up to QEDR_MAX_SQ_WQE_SIZE elements in
 +       * the ring. The ring should allow at least a single WR, even if the
 +       * user requested none, due to allocation issues.
 +       * We should add an extra WR since the prod and cons indices of
 +       * wqe_wr_id are managed in such a way that the WQ is considered full
 +       * when (prod+1)%max_wr==cons. We currently don't do that because we
 +       * double the number of entries due an iSER issue that pushes far more
 +       * double the number of entries due to an iSER issue that pushes far more
 +       * error prints in the dmesg we'd like to avoid.
 +       */
 +      qp->sq.max_wr = min_t(u32, attrs->cap.max_send_wr * dev->wq_multiplier,
 +                            dev->attr.max_sqe);
  
 -      /* RQ - read access only (0), dma sync not required (0) */
 -      rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq->rq_addr,
 -                                ureq->rq_len, 0, 0);
 +      qp->wqe_wr_id = kzalloc(qp->sq.max_wr * sizeof(*qp->wqe_wr_id),
 +                              GFP_KERNEL);
 +      if (!qp->wqe_wr_id) {
 +              DP_ERR(dev, "create qp: failed SQ shadow memory allocation\n");
 +              return -ENOMEM;
 +      }
  
 -      if (rc)
 -              qedr_cleanup_user_sq(dev, qp);
 -      return rc;
 -}
 +      /* QP handle to be written in CQE */
 +      in_params.qp_handle_lo = lower_32_bits((uintptr_t) qp);
 +      in_params.qp_handle_hi = upper_32_bits((uintptr_t) qp);
  
 -static inline int
 -qedr_init_kernel_qp(struct qedr_dev *dev,
 -                  struct qedr_qp *qp,
 -                  struct ib_qp_init_attr *attrs,
 -                  struct qed_rdma_create_qp_in_params *params)
 -{
 -      int rc;
 +      /* A single work request may take up to QEDR_MAX_RQ_WQE_SIZE elements in
 +       * the ring. The ring should allow at least a single WR, even if the
 +       * user requested none, due to allocation issues.
 +       */
 +      qp->rq.max_wr = (u16) max_t(u32, attrs->cap.max_recv_wr, 1);
  
 -      rc = qedr_init_qp_kernel_sq(dev, qp, attrs);
 -      if (rc) {
 -              DP_ERR(dev, "failed to init kernel QP %p SQ\n", qp);
 -              return rc;
 +      /* Allocate driver internal RQ array */
 +      qp->rqe_wr_id = kzalloc(qp->rq.max_wr * sizeof(*qp->rqe_wr_id),
 +                              GFP_KERNEL);
 +      if (!qp->rqe_wr_id) {
 +              DP_ERR(dev,
 +                     "create qp: failed RQ shadow memory allocation\n");
 +              kfree(qp->wqe_wr_id);
 +              return -ENOMEM;
        }
  
 -      rc = qedr_init_qp_kernel_params_sq(dev, qp, attrs, params);
 -      if (rc) {
 -              dev->ops->common->chain_free(dev->cdev, &qp->sq.pbl);
 -              DP_ERR(dev, "failed to init kernel QP %p SQ params\n", qp);
 -              return rc;
 -      }
 +      qedr_init_common_qp_in_params(dev, pd, qp, attrs, true, &in_params);
  
 -      rc = qedr_init_qp_kernel_rq(dev, qp, attrs);
 -      if (rc) {
 -              qedr_cleanup_kernel_sq(dev, qp);
 -              DP_ERR(dev, "failed to init kernel QP %p RQ\n", qp);
 -              return rc;
 -      }
 +      n_sq_entries = attrs->cap.max_send_wr;
 +      n_sq_entries = min_t(u32, n_sq_entries, dev->attr.max_sqe);
 +      n_sq_entries = max_t(u32, n_sq_entries, 1);
 +      n_sq_elems = n_sq_entries * QEDR_MAX_SQE_ELEMENTS_PER_SQE;
  
 -      rc = qedr_init_qp_kernel_params_rq(dev, qp, attrs);
 -      if (rc) {
 -              DP_ERR(dev, "failed to init kernel QP %p RQ params\n", qp);
 -              qedr_cleanup_kernel_sq(dev, qp);
 -              dev->ops->common->chain_free(dev->cdev, &qp->rq.pbl);
 -              return rc;
 -      }
 +      n_rq_elems = qp->rq.max_wr * QEDR_MAX_RQE_ELEMENTS_PER_RQE;
 +
 +      rc = qedr_roce_create_kernel_qp(dev, qp, &in_params,
 +                                      n_sq_elems, n_rq_elems);
 +      if (rc)
 +              qedr_cleanup_kernel(dev, qp);
  
        return rc;
  }
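The kernel-QP path sizes its rings directly from the verbs caps: the SQ work-request count is max_send_wr scaled by wq_multiplier and clamped to the device maximum, and each WR can occupy several ring elements, so the chains are allocated in elements rather than WRs. In outline, consolidated from the hunk above:

    /* SQ: scale, clamp, and convert work requests into ring elements */
    n_sq_entries = min_t(u32, attrs->cap.max_send_wr * dev->wq_multiplier,
                         dev->attr.max_sqe);
    n_sq_entries = max_t(u32, n_sq_entries, 1);     /* never a zero-sized ring */
    n_sq_elems   = n_sq_entries * QEDR_MAX_SQE_ELEMENTS_PER_SQE;

    /* RQ: at least one WR, each expanded to its element count */
    n_rq_elems = max_t(u32, attrs->cap.max_recv_wr, 1) *
                 QEDR_MAX_RQE_ELEMENTS_PER_RQE;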
@@@ -1472,7 -1480,12 +1472,7 @@@ struct ib_qp *qedr_create_qp(struct ib_
                             struct ib_udata *udata)
  {
        struct qedr_dev *dev = get_qedr_dev(ibpd->device);
 -      struct qed_rdma_create_qp_out_params out_params;
 -      struct qed_rdma_create_qp_in_params in_params;
        struct qedr_pd *pd = get_qedr_pd(ibpd);
 -      struct ib_ucontext *ib_ctx = NULL;
 -      struct qedr_ucontext *ctx = NULL;
 -      struct qedr_create_qp_ureq ureq;
        struct qedr_qp *qp;
        struct ib_qp *ibqp;
        int rc = 0;
        if (attrs->srq)
                return ERR_PTR(-EINVAL);
  
 -      qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 -      if (!qp)
 -              return ERR_PTR(-ENOMEM);
 -
        DP_DEBUG(dev, QEDR_MSG_QP,
 -               "create qp: sq_cq=%p, sq_icid=%d, rq_cq=%p, rq_icid=%d\n",
 +               "create qp: called from %s, event_handler=%p, eepd=%p sq_cq=%p, sq_icid=%d, rq_cq=%p, rq_icid=%d\n",
 +               udata ? "user library" : "kernel", attrs->event_handler, pd,
                 get_qedr_cq(attrs->send_cq),
                 get_qedr_cq(attrs->send_cq)->icid,
                 get_qedr_cq(attrs->recv_cq),
                 get_qedr_cq(attrs->recv_cq)->icid);
  
 -      qedr_set_qp_init_params(dev, qp, pd, attrs);
 +      qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 +      if (!qp) {
 +              DP_ERR(dev, "create qp: failed allocating memory\n");
 +              return ERR_PTR(-ENOMEM);
 +      }
 +
 +      qedr_set_common_qp_params(dev, qp, pd, attrs);
  
        if (attrs->qp_type == IB_QPT_GSI) {
 -              if (udata) {
 -                      DP_ERR(dev,
 -                             "create qp: unexpected udata when creating GSI QP\n");
 -                      goto err0;
 -              }
                ibqp = qedr_create_gsi_qp(dev, attrs, qp);
                if (IS_ERR(ibqp))
                        kfree(qp);
                return ibqp;
        }
  
 -      memset(&in_params, 0, sizeof(in_params));
 -
 -      if (udata) {
 -              if (!(udata && ibpd->uobject && ibpd->uobject->context))
 -                      goto err0;
 -
 -              ib_ctx = ibpd->uobject->context;
 -              ctx = get_qedr_ucontext(ib_ctx);
 -
 -              memset(&ureq, 0, sizeof(ureq));
 -              if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
 -                      DP_ERR(dev,
 -                             "create qp: problem copying data from user space\n");
 -                      goto err0;
 -              }
 -
 -              rc = qedr_init_user_qp(ib_ctx, dev, qp, &ureq);
 -              if (rc)
 -                      goto err0;
 -
 -              qedr_init_qp_user_params(&in_params, &ureq);
 -      } else {
 -              rc = qedr_init_kernel_qp(dev, qp, attrs, &in_params);
 -              if (rc)
 -                      goto err0;
 -      }
 -
 -      qedr_init_qp_in_params_sq(dev, pd, qp, attrs, udata, &in_params);
 -      qedr_init_qp_in_params_rq(qp, attrs, udata, &in_params);
 -
 -      qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx,
 -                                            &in_params, &out_params);
 +      if (udata)
 +              rc = qedr_create_user_qp(dev, qp, ibpd, udata, attrs);
 +      else
 +              rc = qedr_create_kernel_qp(dev, qp, ibpd, attrs);
  
 -      if (!qp->qed_qp)
 -              goto err1;
 +      if (rc)
 +              goto err;
  
 -      qp->qp_id = out_params.qp_id;
 -      qp->icid = out_params.icid;
        qp->ibqp.qp_num = qp->qp_id;
  
 -      if (udata) {
 -              rc = qedr_copy_qp_uresp(dev, qp, udata);
 -              if (rc)
 -                      goto err2;
 -
 -              qedr_qp_user_print(dev, qp);
 -      } else {
 -              qedr_init_qp_kernel_doorbell_sq(dev, qp);
 -              qedr_init_qp_kernel_doorbell_rq(dev, qp);
 -      }
 -
 -      DP_DEBUG(dev, QEDR_MSG_QP, "created %s space QP %p\n",
 -               udata ? "user" : "kernel", qp);
 -
        return &qp->ibqp;
  
 -err2:
 -      rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
 -      if (rc)
 -              DP_ERR(dev, "create qp: fatal fault. rc=%d", rc);
 -err1:
 -      if (udata) {
 -              qedr_cleanup_user_sq(dev, qp);
 -              qedr_cleanup_user_rq(dev, qp);
 -      } else {
 -              qedr_cleanup_kernel_sq(dev, qp);
 -              qedr_cleanup_kernel_rq(dev, qp);
 -      }
 -
 -err0:
 +err:
        kfree(qp);
  
        return ERR_PTR(-EFAULT);
  }
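After the refactor qedr_create_qp() is essentially a dispatcher: common parameters are set once, then the presence of udata selects the user path (pinning the SQ/RQ buffers supplied by the library) or the kernel path (allocating PBL-backed chains). Stripped to its skeleton:

    qedr_set_common_qp_params(dev, qp, pd, attrs);

    if (udata)
            rc = qedr_create_user_qp(dev, qp, ibpd, udata, attrs);
    else
            rc = qedr_create_kernel_qp(dev, qp, ibpd, attrs);
    if (rc)
            goto err;               /* both helpers clean up after themselves */

    qp->ibqp.qp_num = qp->qp_id;
    return &qp->ibqp;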
  
 -enum ib_qp_state qedr_get_ibqp_state(enum qed_roce_qp_state qp_state)
 +static enum ib_qp_state qedr_get_ibqp_state(enum qed_roce_qp_state qp_state)
  {
        switch (qp_state) {
        case QED_ROCE_QP_STATE_RESET:
        return IB_QPS_ERR;
  }
  
 -enum qed_roce_qp_state qedr_get_state_from_ibqp(enum ib_qp_state qp_state)
 +static enum qed_roce_qp_state qedr_get_state_from_ibqp(
 +                                      enum ib_qp_state qp_state)
  {
        switch (qp_state) {
        case IB_QPS_RESET:
@@@ -1586,7 -1657,7 +1586,7 @@@ static int qedr_update_qp_state(struct 
        int status = 0;
  
        if (new_state == qp->state)
 -              return 1;
 +              return 0;
  
        switch (qp->state) {
        case QED_ROCE_QP_STATE_RESET:
                /* ERR->XXX */
                switch (new_state) {
                case QED_ROCE_QP_STATE_RESET:
 +                      if ((qp->rq.prod != qp->rq.cons) ||
 +                          (qp->sq.prod != qp->sq.cons)) {
 +                              DP_NOTICE(dev,
 +                                        "Error->Reset with rq/sq not empty rq.prod=%x rq.cons=%x sq.prod=%x sq.cons=%x\n",
 +                                        qp->rq.prod, qp->rq.cons, qp->sq.prod,
 +                                        qp->sq.cons);
 +                              status = -EINVAL;
 +                      }
                        break;
                default:
                        status = -EINVAL;
@@@ -1802,6 -1865,7 +1802,6 @@@ int qedr_modify_qp(struct ib_qp *ibqp, 
                         qp_params.sgid.dwords[2], qp_params.sgid.dwords[3]);
                DP_DEBUG(dev, QEDR_MSG_QP, "remote_mac=[%pM]\n",
                         qp_params.remote_mac_addr);
 -;
  
                qp_params.mtu = qp->mtu;
                qp_params.lb_indication = false;
@@@ -1952,7 -2016,7 +1952,7 @@@ int qedr_query_qp(struct ib_qp *ibqp
  
        qp_attr->qp_state = qedr_get_ibqp_state(params.state);
        qp_attr->cur_qp_state = qedr_get_ibqp_state(params.state);
 -      qp_attr->path_mtu = iboe_get_mtu(params.mtu);
 +      qp_attr->path_mtu = ib_mtu_int_to_enum(params.mtu);
        qp_attr->path_mig_state = IB_MIG_MIGRATED;
        qp_attr->rq_psn = params.rq_psn;
        qp_attr->sq_psn = params.sq_psn;
        qp_attr->cap.max_recv_wr = qp->rq.max_wr;
        qp_attr->cap.max_send_sge = qp->sq.max_sges;
        qp_attr->cap.max_recv_sge = qp->rq.max_sges;
 -      qp_attr->cap.max_inline_data = qp->max_inline_data;
 +      qp_attr->cap.max_inline_data = ROCE_REQ_MAX_INLINE_DATA_SIZE;
        qp_init_attr->cap = qp_attr->cap;
  
        memcpy(&qp_attr->ah_attr.grh.dgid.raw[0], &params.dgid.bytes[0],
        return rc;
  }
  
 +int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp)
 +{
 +      int rc = 0;
 +
 +      if (qp->qp_type != IB_QPT_GSI) {
 +              rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
 +              if (rc)
 +                      return rc;
 +      }
 +
 +      if (qp->ibqp.uobject && qp->ibqp.uobject->context)
 +              qedr_cleanup_user(dev, qp);
 +      else
 +              qedr_cleanup_kernel(dev, qp);
 +
 +      return 0;
 +}
 +
  int qedr_destroy_qp(struct ib_qp *ibqp)
  {
        struct qedr_qp *qp = get_qedr_qp(ibqp);
                qedr_modify_qp(ibqp, &attr, attr_mask, NULL);
        }
  
 -      if (qp->qp_type != IB_QPT_GSI) {
 -              rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
 -              if (rc)
 -                      return rc;
 -      } else {
 +      if (qp->qp_type == IB_QPT_GSI)
                qedr_destroy_gsi_qp(dev);
 -      }
  
 -      if (ibqp->uobject && ibqp->uobject->context) {
 -              qedr_cleanup_user_sq(dev, qp);
 -              qedr_cleanup_user_rq(dev, qp);
 -      } else {
 -              qedr_cleanup_kernel_sq(dev, qp);
 -              qedr_cleanup_kernel_rq(dev, qp);
 -      }
 +      qedr_free_qp_resources(dev, qp);
  
        kfree(qp);
  
@@@ -2107,8 -2164,8 +2107,8 @@@ static int init_mr_info(struct qedr_de
                goto done;
  
        info->pbl_table = qedr_alloc_pbl_tbl(dev, &info->pbl_info, GFP_KERNEL);
 -      if (!info->pbl_table) {
 -              rc = -ENOMEM;
 +      if (IS_ERR(info->pbl_table)) {
 +              rc = PTR_ERR(info->pbl_table);
                goto done;
        }
  
         * list and allocating another one
         */
        tmp = qedr_alloc_pbl_tbl(dev, &info->pbl_info, GFP_KERNEL);
 -      if (!tmp) {
 +      if (IS_ERR(tmp)) {
                DP_DEBUG(dev, QEDR_MSG_MR, "Extra PBL is not allocated\n");
                goto done;
        }
@@@ -2245,8 -2302,7 +2245,8 @@@ int qedr_dereg_mr(struct ib_mr *ib_mr
        return rc;
  }
  
 -struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, int max_page_list_len)
 +static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd,
 +                                     int max_page_list_len)
  {
        struct qedr_pd *pd = get_qedr_pd(ibpd);
        struct qedr_dev *dev = get_qedr_dev(ibpd->device);
@@@ -2648,7 -2704,7 +2648,7 @@@ static int qedr_prepare_reg(struct qedr
        return 0;
  }
  
 -enum ib_wc_opcode qedr_ib_to_wc_opcode(enum ib_wr_opcode opcode)
 +static enum ib_wc_opcode qedr_ib_to_wc_opcode(enum ib_wr_opcode opcode)
  {
        switch (opcode) {
        case IB_WR_RDMA_WRITE:
        }
  }
  
 -inline bool qedr_can_post_send(struct qedr_qp *qp, struct ib_send_wr *wr)
 +static inline bool qedr_can_post_send(struct qedr_qp *qp, struct ib_send_wr *wr)
  {
        int wq_is_full, err_wr, pbl_is_full;
        struct qedr_dev *dev = qp->dev;
        return true;
  }
  
 -int __qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 +static int __qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                     struct ib_send_wr **bad_wr)
  {
        struct qedr_dev *dev = get_qedr_dev(ibqp->device);
@@@ -3178,10 -3234,9 +3178,10 @@@ static int qedr_poll_cq_req(struct qedr
                                  IB_WC_SUCCESS, 0);
                break;
        case RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR:
 -              DP_ERR(dev,
 -                     "Error: POLL CQ with RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. CQ icid=0x%x, QP icid=0x%x\n",
 -                     cq->icid, qp->icid);
 +              if (qp->state != QED_ROCE_QP_STATE_ERR)
 +                      DP_ERR(dev,
 +                             "Error: POLL CQ with RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. CQ icid=0x%x, QP icid=0x%x\n",
 +                             cq->icid, qp->icid);
                cnt = process_req(dev, qp, cq, num_entries, wc, req->sq_cons,
                                  IB_WC_WR_FLUSH_ERR, 1);
                break;
@@@ -3494,14 -3549,15 +3494,15 @@@ int qedr_port_immutable(struct ib_devic
        struct ib_port_attr attr;
        int err;
  
-       err = qedr_query_port(ibdev, port_num, &attr);
+       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE |
+                                   RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+       err = ib_query_port(ibdev, port_num, &attr);
        if (err)
                return err;
  
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE |
-                                   RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
        immutable->max_mad_size = IB_MGMT_MAD_SIZE;
  
        return 0;
index b0b78e1cec9282dcf95bcccdd88e13fb1d4dfb69,9161574601f69496e4cbc87fa05cb2431235d899..6b56f1c01a0789f9335691f57535947df4afa2a9
@@@ -129,16 -129,78 +129,16 @@@ void qib_copy_sge(struct rvt_sge_state 
        struct rvt_sge *sge = &ss->sge;
  
        while (length) {
 -              u32 len = sge->length;
 +              u32 len = rvt_get_sge_length(sge, length);
  
 -              if (len > length)
 -                      len = length;
 -              if (len > sge->sge_length)
 -                      len = sge->sge_length;
 -              BUG_ON(len == 0);
 +              WARN_ON_ONCE(len == 0);
                memcpy(sge->vaddr, data, len);
 -              sge->vaddr += len;
 -              sge->length -= len;
 -              sge->sge_length -= len;
 -              if (sge->sge_length == 0) {
 -                      if (release)
 -                              rvt_put_mr(sge->mr);
 -                      if (--ss->num_sge)
 -                              *sge = *ss->sg_list++;
 -              } else if (sge->length == 0 && sge->mr->lkey) {
 -                      if (++sge->n >= RVT_SEGSZ) {
 -                              if (++sge->m >= sge->mr->mapsz)
 -                                      break;
 -                              sge->n = 0;
 -                      }
 -                      sge->vaddr =
 -                              sge->mr->map[sge->m]->segs[sge->n].vaddr;
 -                      sge->length =
 -                              sge->mr->map[sge->m]->segs[sge->n].length;
 -              }
 +              rvt_update_sge(ss, len, release);
                data += len;
                length -= len;
        }
  }
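qib_copy_sge() now delegates the SGE bookkeeping to rdmavt: rvt_get_sge_length() clamps each chunk to both the remaining length and the current SGE, and rvt_update_sge() advances (and optionally releases) the SGE state, which is what the deleted open-coded block, qib_skip_sge() and the local update_sge() below all duplicated. The resulting loop:

    while (length) {
            u32 len = rvt_get_sge_length(&ss->sge, length); /* clamp to this SGE */

            WARN_ON_ONCE(len == 0);
            memcpy(ss->sge.vaddr, data, len);
            rvt_update_sge(ss, len, release);   /* advance, maybe drop the MR ref */
            data += len;
            length -= len;
    }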
  
 -/**
 - * qib_skip_sge - skip over SGE memory - XXX almost dup of prev func
 - * @ss: the SGE state
 - * @length: the number of bytes to skip
 - */
 -void qib_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
 -{
 -      struct rvt_sge *sge = &ss->sge;
 -
 -      while (length) {
 -              u32 len = sge->length;
 -
 -              if (len > length)
 -                      len = length;
 -              if (len > sge->sge_length)
 -                      len = sge->sge_length;
 -              BUG_ON(len == 0);
 -              sge->vaddr += len;
 -              sge->length -= len;
 -              sge->sge_length -= len;
 -              if (sge->sge_length == 0) {
 -                      if (release)
 -                              rvt_put_mr(sge->mr);
 -                      if (--ss->num_sge)
 -                              *sge = *ss->sg_list++;
 -              } else if (sge->length == 0 && sge->mr->lkey) {
 -                      if (++sge->n >= RVT_SEGSZ) {
 -                              if (++sge->m >= sge->mr->mapsz)
 -                                      break;
 -                              sge->n = 0;
 -                      }
 -                      sge->vaddr =
 -                              sge->mr->map[sge->m]->segs[sge->n].vaddr;
 -                      sge->length =
 -                              sge->mr->map[sge->m]->segs[sge->n].length;
 -              }
 -              length -= len;
 -      }
 -}
 -
  /*
   * Count the number of DMA descriptors needed to send length bytes of data.
   * Don't modify the qib_sge_state to get the count.
@@@ -406,6 -468,27 +406,6 @@@ static void mem_timer(unsigned long dat
        }
  }
  
 -static void update_sge(struct rvt_sge_state *ss, u32 length)
 -{
 -      struct rvt_sge *sge = &ss->sge;
 -
 -      sge->vaddr += length;
 -      sge->length -= length;
 -      sge->sge_length -= length;
 -      if (sge->sge_length == 0) {
 -              if (--ss->num_sge)
 -                      *sge = *ss->sg_list++;
 -      } else if (sge->length == 0 && sge->mr->lkey) {
 -              if (++sge->n >= RVT_SEGSZ) {
 -                      if (++sge->m >= sge->mr->mapsz)
 -                              return;
 -                      sge->n = 0;
 -              }
 -              sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
 -              sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
 -      }
 -}
 -
  #ifdef __LITTLE_ENDIAN
  static inline u32 get_upper_bits(u32 data, u32 shift)
  {
@@@ -563,11 -646,11 +563,11 @@@ static void copy_io(u32 __iomem *piobuf
                                data = clear_upper_bytes(v, extra, 0);
                        }
                }
 -              update_sge(ss, len);
 +              rvt_update_sge(ss, len, false);
                length -= len;
        }
        /* Update address before sending packet. */
 -      update_sge(ss, length);
 +      rvt_update_sge(ss, length, false);
        if (flush_wc) {
                /* must flush early everything before trigger word */
                qib_flush_wc();
@@@ -986,7 -1069,7 +986,7 @@@ static int qib_verbs_send_pio(struct rv
                u32 *addr = (u32 *) ss->sge.vaddr;
  
                /* Update address before sending packet. */
 -              update_sge(ss, len);
 +              rvt_update_sge(ss, len, false);
                if (flush_wc) {
                        qib_pio_copy(piobuf, addr, dwords - 1);
                        /* must flush early everything before trigger word */
@@@ -1220,6 -1303,7 +1220,7 @@@ static int qib_query_port(struct rvt_de
        enum ib_mtu mtu;
        u16 lid = ppd->lid;
  
+       /* props being zeroed by the caller, avoid zeroing it here */
        props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
        props->lmc = ppd->lmc;
        props->state = dd->f_iblink_state(ppd->lastibcstat);
@@@ -1576,7 -1660,6 +1577,7 @@@ int qib_register_ib_device(struct qib_d
        dd->verbs_dev.rdi.driver_f.stop_send_queue = qib_stop_send_queue;
        dd->verbs_dev.rdi.driver_f.flush_qp_waiters = qib_flush_qp_waiters;
        dd->verbs_dev.rdi.driver_f.notify_error_qp = qib_notify_error_qp;
 +      dd->verbs_dev.rdi.driver_f.notify_restart_rc = qib_restart_rc;
        dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = qib_mtu_to_path_mtu;
        dd->verbs_dev.rdi.driver_f.mtu_from_qp = qib_mtu_from_qp;
        dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = qib_get_pmtu_from_attr;
index 69df8e353123c893aa1f34277bc86dd961bffdea,0ba274ff7be671affa1a63d6ef418d4e8e0e10c8..3284730d3c0923d33f3c2ea916c0f38f71c515ca
@@@ -291,11 -291,11 +291,11 @@@ int usnic_ib_query_device(struct ib_dev
        qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
                        us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
        props->max_qp = qp_per_vf *
 -              atomic_read(&us_ibdev->vf_cnt.refcount);
 +              kref_read(&us_ibdev->vf_cnt);
        props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT |
                IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
        props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] *
 -              atomic_read(&us_ibdev->vf_cnt.refcount);
 +              kref_read(&us_ibdev->vf_cnt);
        props->max_pd = USNIC_UIOM_MAX_PD_CNT;
        props->max_mr = USNIC_UIOM_MAX_MR_CNT;
        props->local_ca_ack_delay = 0;
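usnic stops peeking at the kref's internal atomic and uses the kref_read() accessor added for this cycle; the value is the same, but callers no longer depend on how the reference count is represented. Side by side:

    /* before: reaches into the kref internals */
    props->max_qp = qp_per_vf * atomic_read(&us_ibdev->vf_cnt.refcount);

    /* after: use the accessor */
    props->max_qp = qp_per_vf * kref_read(&us_ibdev->vf_cnt);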
@@@ -330,7 -330,7 +330,7 @@@ int usnic_ib_query_port(struct ib_devic
  
        mutex_lock(&us_ibdev->usdev_lock);
        __ethtool_get_link_ksettings(us_ibdev->netdev, &cmd);
-       memset(props, 0, sizeof(*props));
+       /* props being zeroed by the caller, avoid zeroing it here */
  
        props->lid = 0;
        props->lmc = 1;
index 60cdb77195650c25f1b3776897ef239ca2e730a7,b8b0081de4789c8430b34b77876573121d9953f4..e03d2f6c1f90ed4f7f9782027d73ba9c44ab4f3e
@@@ -132,13 -132,14 +132,14 @@@ static int pvrdma_port_immutable(struc
        struct ib_port_attr attr;
        int err;
  
-       err = pvrdma_query_port(ibdev, port_num, &attr);
+       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
+       err = ib_query_port(ibdev, port_num, &attr);
        if (err)
                return err;
  
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE;
        immutable->max_mad_size = IB_MGMT_MAD_SIZE;
        return 0;
  }
@@@ -282,7 -283,7 +283,7 @@@ static irqreturn_t pvrdma_intr0_handler
  
        dev_dbg(&dev->pdev->dev, "interrupt 0 (response) handler\n");
  
 -      if (dev->intr.type != PVRDMA_INTR_TYPE_MSIX) {
 +      if (!dev->pdev->msix_enabled) {
                /* Legacy intr */
                icr = pvrdma_read_reg(dev, PVRDMA_REG_ICR);
                if (icr == 0)
@@@ -489,13 -490,31 +490,13 @@@ static irqreturn_t pvrdma_intrx_handler
        return IRQ_HANDLED;
  }
  
 -static void pvrdma_disable_msi_all(struct pvrdma_dev *dev)
 -{
 -      if (dev->intr.type == PVRDMA_INTR_TYPE_MSIX)
 -              pci_disable_msix(dev->pdev);
 -      else if (dev->intr.type == PVRDMA_INTR_TYPE_MSI)
 -              pci_disable_msi(dev->pdev);
 -}
 -
  static void pvrdma_free_irq(struct pvrdma_dev *dev)
  {
        int i;
  
        dev_dbg(&dev->pdev->dev, "freeing interrupts\n");
 -
 -      if (dev->intr.type == PVRDMA_INTR_TYPE_MSIX) {
 -              for (i = 0; i < dev->intr.size; i++) {
 -                      if (dev->intr.enabled[i]) {
 -                              free_irq(dev->intr.msix_entry[i].vector, dev);
 -                              dev->intr.enabled[i] = 0;
 -                      }
 -              }
 -      } else if (dev->intr.type == PVRDMA_INTR_TYPE_INTX ||
 -                 dev->intr.type == PVRDMA_INTR_TYPE_MSI) {
 -              free_irq(dev->pdev->irq, dev);
 -      }
 +      for (i = 0; i < dev->nr_vectors; i++)
 +              free_irq(pci_irq_vector(dev->pdev, i), dev);
  }
  
  static void pvrdma_enable_intrs(struct pvrdma_dev *dev)
@@@ -510,48 -529,126 +511,48 @@@ static void pvrdma_disable_intrs(struc
        pvrdma_write_reg(dev, PVRDMA_REG_IMR, ~0);
  }
  
 -static int pvrdma_enable_msix(struct pci_dev *pdev, struct pvrdma_dev *dev)
 -{
 -      int i;
 -      int ret;
 -
 -      for (i = 0; i < PVRDMA_MAX_INTERRUPTS; i++) {
 -              dev->intr.msix_entry[i].entry = i;
 -              dev->intr.msix_entry[i].vector = i;
 -
 -              switch (i) {
 -              case 0:
 -                      /* CMD ring handler */
 -                      dev->intr.handler[i] = pvrdma_intr0_handler;
 -                      break;
 -              case 1:
 -                      /* Async event ring handler */
 -                      dev->intr.handler[i] = pvrdma_intr1_handler;
 -                      break;
 -              default:
 -                      /* Completion queue handler */
 -                      dev->intr.handler[i] = pvrdma_intrx_handler;
 -                      break;
 -              }
 -      }
 -
 -      ret = pci_enable_msix(pdev, dev->intr.msix_entry,
 -                            PVRDMA_MAX_INTERRUPTS);
 -      if (!ret) {
 -              dev->intr.type = PVRDMA_INTR_TYPE_MSIX;
 -              dev->intr.size = PVRDMA_MAX_INTERRUPTS;
 -      } else if (ret > 0) {
 -              ret = pci_enable_msix(pdev, dev->intr.msix_entry, ret);
 -              if (!ret) {
 -                      dev->intr.type = PVRDMA_INTR_TYPE_MSIX;
 -                      dev->intr.size = ret;
 -              } else {
 -                      dev->intr.size = 0;
 -              }
 -      }
 -
 -      dev_dbg(&pdev->dev, "using interrupt type %d, size %d\n",
 -              dev->intr.type, dev->intr.size);
 -
 -      return ret;
 -}
 -
  static int pvrdma_alloc_intrs(struct pvrdma_dev *dev)
  {
 -      int ret = 0;
 -      int i;
 +      struct pci_dev *pdev = dev->pdev;
 +      int ret = 0, i;
  
 -      if (pci_find_capability(dev->pdev, PCI_CAP_ID_MSIX) &&
 -          pvrdma_enable_msix(dev->pdev, dev)) {
 -              /* Try MSI */
 -              ret = pci_enable_msi(dev->pdev);
 -              if (!ret) {
 -                      dev->intr.type = PVRDMA_INTR_TYPE_MSI;
 -              } else {
 -                      /* Legacy INTR */
 -                      dev->intr.type = PVRDMA_INTR_TYPE_INTX;
 -              }
 +      ret = pci_alloc_irq_vectors(pdev, 1, PVRDMA_MAX_INTERRUPTS,
 +                      PCI_IRQ_MSIX);
 +      if (ret < 0) {
 +              ret = pci_alloc_irq_vectors(pdev, 1, 1,
 +                              PCI_IRQ_MSI | PCI_IRQ_LEGACY);
 +              if (ret < 0)
 +                      return ret;
        }
 +      dev->nr_vectors = ret;
  
 -      /* Request First IRQ */
 -      switch (dev->intr.type) {
 -      case PVRDMA_INTR_TYPE_INTX:
 -      case PVRDMA_INTR_TYPE_MSI:
 -              ret = request_irq(dev->pdev->irq, pvrdma_intr0_handler,
 -                                IRQF_SHARED, DRV_NAME, dev);
 -              if (ret) {
 -                      dev_err(&dev->pdev->dev,
 -                              "failed to request interrupt\n");
 -                      goto disable_msi;
 -              }
 -              break;
 -      case PVRDMA_INTR_TYPE_MSIX:
 -              ret = request_irq(dev->intr.msix_entry[0].vector,
 -                                pvrdma_intr0_handler, 0, DRV_NAME, dev);
 -              if (ret) {
 -                      dev_err(&dev->pdev->dev,
 -                              "failed to request interrupt 0\n");
 -                      goto disable_msi;
 -              }
 -              dev->intr.enabled[0] = 1;
 -              break;
 -      default:
 -              /* Not reached */
 -              break;
 +      ret = request_irq(pci_irq_vector(dev->pdev, 0), pvrdma_intr0_handler,
 +                      pdev->msix_enabled ? 0 : IRQF_SHARED, DRV_NAME, dev);
 +      if (ret) {
 +              dev_err(&dev->pdev->dev,
 +                      "failed to request interrupt 0\n");
 +              goto out_free_vectors;
        }
  
 -      /* For MSIX: request intr for each vector */
 -      if (dev->intr.size > 1) {
 -              ret = request_irq(dev->intr.msix_entry[1].vector,
 -                                pvrdma_intr1_handler, 0, DRV_NAME, dev);
 +      for (i = 1; i < dev->nr_vectors; i++) {
 +              ret = request_irq(pci_irq_vector(dev->pdev, i),
 +                              i == 1 ? pvrdma_intr1_handler :
 +                                       pvrdma_intrx_handler,
 +                              0, DRV_NAME, dev);
                if (ret) {
                        dev_err(&dev->pdev->dev,
 -                              "failed to request interrupt 1\n");
 -                      goto free_irq;
 -              }
 -              dev->intr.enabled[1] = 1;
 -
 -              for (i = 2; i < dev->intr.size; i++) {
 -                      ret = request_irq(dev->intr.msix_entry[i].vector,
 -                                        pvrdma_intrx_handler, 0,
 -                                        DRV_NAME, dev);
 -                      if (ret) {
 -                              dev_err(&dev->pdev->dev,
 -                                      "failed to request interrupt %d\n", i);
 -                              goto free_irq;
 -                      }
 -                      dev->intr.enabled[i] = 1;
 +                              "failed to request interrupt %d\n", i);
 +                      goto free_irqs;
                }
        }
  
        return 0;
  
 -free_irq:
 -      pvrdma_free_irq(dev);
 -disable_msi:
 -      pvrdma_disable_msi_all(dev);
 +free_irqs:
 +      while (--i >= 0)
 +              free_irq(pci_irq_vector(dev->pdev, i), dev);
 +out_free_vectors:
 +      pci_free_irq_vectors(pdev);
        return ret;
  }
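
For reference, the hunk above replaces the driver's hand-rolled pci_enable_msix()/pci_enable_msi() fallback with the generic pci_alloc_irq_vectors() API. A minimal sketch of the same allocate/request/teardown pattern in isolation might look as follows; struct my_dev, my_handler and MY_MAX_VECTORS are hypothetical names for illustration only, not pvrdma code, and <linux/pci.h> plus <linux/interrupt.h> are assumed.

	/* Sketch only, not pvrdma code. */
	#define MY_MAX_VECTORS 4	/* hypothetical upper bound */

	struct my_dev {
		int nr_vectors;
	};

	static irqreturn_t my_handler(int irq, void *data)
	{
		return IRQ_HANDLED;
	}

	static int my_alloc_irqs(struct my_dev *dev, struct pci_dev *pdev)
	{
		int nvec, i, ret;

		/* Prefer MSI-X, fall back to a single MSI or legacy INTx vector. */
		nvec = pci_alloc_irq_vectors(pdev, 1, MY_MAX_VECTORS, PCI_IRQ_MSIX);
		if (nvec < 0)
			nvec = pci_alloc_irq_vectors(pdev, 1, 1,
						     PCI_IRQ_MSI | PCI_IRQ_LEGACY);
		if (nvec < 0)
			return nvec;

		for (i = 0; i < nvec; i++) {
			/* pci_irq_vector() maps a vector index to its Linux IRQ number. */
			ret = request_irq(pci_irq_vector(pdev, i), my_handler,
					  pdev->msix_enabled ? 0 : IRQF_SHARED,
					  "my_dev", dev);
			if (ret)
				goto err_free_irqs;
		}
		dev->nr_vectors = nvec;
		return 0;

	err_free_irqs:
		while (--i >= 0)
			free_irq(pci_irq_vector(pdev, i), dev);
		pci_free_irq_vectors(pdev);
		return ret;
	}

The teardown order mirrors the error labels in the hunk above: free each requested IRQ first, then release the vectors with pci_free_irq_vectors().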
  
@@@ -933,7 -1030,7 +934,7 @@@ static int pvrdma_pci_probe(struct pci_
        if (ret) {
                dev_err(&pdev->dev, "failed to allocate interrupts\n");
                ret = -ENOMEM;
 -              goto err_netdevice;
 +              goto err_free_cq_ring;
        }
  
        /* Allocate UAR table. */
@@@ -995,7 -1092,9 +996,7 @@@ err_free_uar_table
        pvrdma_uar_table_cleanup(dev);
  err_free_intrs:
        pvrdma_free_irq(dev);
 -      pvrdma_disable_msi_all(dev);
 -err_netdevice:
 -      unregister_netdevice_notifier(&dev->nb_netdev);
 +      pci_free_irq_vectors(pdev);
  err_free_cq_ring:
        pvrdma_page_dir_cleanup(dev, &dev->cq_pdir);
  err_free_async_ring:
@@@ -1045,7 -1144,7 +1046,7 @@@ static void pvrdma_pci_remove(struct pc
  
        pvrdma_disable_intrs(dev);
        pvrdma_free_irq(dev);
 -      pvrdma_disable_msi_all(dev);
 +      pci_free_irq_vectors(pdev);
  
        /* Deactivate pvrdma device */
        pvrdma_write_reg(dev, PVRDMA_REG_CTL, PVRDMA_DEVICE_CTL_RESET);
index c2aa52638dcb81ea4539b61c43d61f55edefb2b3,b3a982be8006f6fc63343e4a07e750dc2e13ab09..fec17c49103b9a89c992e23a93e335ffefea573a
@@@ -135,7 -135,7 +135,7 @@@ int pvrdma_query_port(struct ib_device 
                return err;
        }
  
-       memset(props, 0, sizeof(*props));
+       /* props is zeroed by the caller; avoid zeroing it here */
  
        props->state = pvrdma_port_state_to_ib(resp->attrs.state);
        props->max_mtu = pvrdma_mtu_to_ib(resp->attrs.max_mtu);
@@@ -275,7 -275,7 +275,7 @@@ int pvrdma_modify_port(struct ib_devic
        }
  
        mutex_lock(&vdev->port_mutex);
-       ret = pvrdma_query_port(ibdev, port, &attr);
+       ret = ib_query_port(ibdev, port, &attr);
        if (ret)
                goto out;
  
@@@ -306,7 -306,7 +306,7 @@@ struct ib_ucontext *pvrdma_alloc_uconte
        union pvrdma_cmd_resp rsp;
        struct pvrdma_cmd_create_uc *cmd = &req.create_uc;
        struct pvrdma_cmd_create_uc_resp *resp = &rsp.create_uc_resp;
 -      struct pvrdma_alloc_ucontext_resp uresp;
 +      struct pvrdma_alloc_ucontext_resp uresp = {0};
        int ret;
        void *ptr;
  
index e4de37fb9aabc0d634f0c95a2045a248e00a8f04,371ef3bac8d4c2b26b03d31d0072e76a3ba81ac5..d2e2eff7a515dd31ac5bd12cde06a80c5f806fcb
@@@ -86,6 -86,7 +86,7 @@@ static int rxe_query_port(struct ib_dev
  
        port = &rxe->port;
  
+       /* *attr is zeroed by the caller; avoid zeroing it here */
        *attr = port->attr;
  
        mutex_lock(&rxe->usdev_lock);
@@@ -234,7 -235,7 +235,7 @@@ static enum rdma_link_layer rxe_get_lin
  {
        struct rxe_dev *rxe = to_rdev(dev);
  
 -      return rxe->ifc_ops->link_layer(rxe, port_num);
 +      return rxe_link_layer(rxe, port_num);
  }
  
  static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev,
@@@ -261,13 -262,14 +262,14 @@@ static int rxe_port_immutable(struct ib
        int err;
        struct ib_port_attr attr;
  
-       err = rxe_query_port(dev, port_num, &attr);
+       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+       err = ib_query_port(dev, port_num, &attr);
        if (err)
                return err;
  
        immutable->pkey_tbl_len = attr.pkey_tbl_len;
        immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
        immutable->max_mad_size = IB_MGMT_MAD_SIZE;
  
        return 0;
@@@ -1209,8 -1211,10 +1211,8 @@@ static ssize_t rxe_show_parent(struct d
  {
        struct rxe_dev *rxe = container_of(device, struct rxe_dev,
                                           ib_dev.dev);
 -      char *name;
  
 -      name = rxe->ifc_ops->parent_name(rxe, 1);
 -      return snprintf(buf, 16, "%s\n", name);
 +      return snprintf(buf, 16, "%s\n", rxe_parent_name(rxe, 1));
  }
  
  static DEVICE_ATTR(parent, S_IRUGO, rxe_show_parent, NULL);
@@@ -1232,9 -1236,9 +1234,9 @@@ int rxe_register_device(struct rxe_dev 
        dev->node_type = RDMA_NODE_IB_CA;
        dev->phys_port_cnt = 1;
        dev->num_comp_vectors = RXE_NUM_COMP_VECTORS;
 -      dev->dma_device = rxe->ifc_ops->dma_device(rxe);
 +      dev->dma_device = rxe_dma_device(rxe);
        dev->local_dma_lkey = 0;
 -      dev->node_guid = rxe->ifc_ops->node_guid(rxe);
 +      dev->node_guid = rxe_node_guid(rxe);
        dev->dma_ops = &rxe_dma_mapping_ops;
  
        dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION;
index ce3d92106386b31d5aa44dae7a347c9d0830c49e,2f4eb99a50fa0001bfaf27236203569f8f445929..2478516a61e2ea547f5ae8af0c3aae7228e64db9
@@@ -1232,10 -1232,18 +1232,18 @@@ static struct mlx5_flow_handle *add_rul
        fs_for_each_fte(fte, fg) {
                nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
                if (compare_match_value(&fg->mask, match_value, &fte->val) &&
-                   (flow_act->action & fte->action) &&
-                   flow_act->flow_tag == fte->flow_tag) {
+                   (flow_act->action & fte->action)) {
                        int old_action = fte->action;
  
+                       if (fte->flow_tag != flow_act->flow_tag) {
+                               mlx5_core_warn(get_dev(&fte->node),
+                                              "FTE flow tag %u already exists with different flow tag %u\n",
+                                              fte->flow_tag,
+                                              flow_act->flow_tag);
+                               handle = ERR_PTR(-EEXIST);
+                               goto unlock_fte;
+                       }
                        fte->action |= flow_act->action;
                        handle = add_rule_fte(fte, fg, dest, dest_num,
                                              old_action != flow_act->action);
@@@ -1665,7 -1673,7 +1673,7 @@@ static int create_leaf_prios(struct mlx
  
  #define FLOW_TABLE_BIT_SZ 1
  #define GET_FLOW_TABLE_CAP(dev, offset) \
 -      ((be32_to_cpu(*((__be32 *)(dev->hca_caps_cur[MLX5_CAP_FLOW_TABLE]) +    \
 +      ((be32_to_cpu(*((__be32 *)(dev->caps.hca_cur[MLX5_CAP_FLOW_TABLE]) +    \
                        offset / 32)) >>                                        \
          (32 - FLOW_TABLE_BIT_SZ - (offset & 0x1f))) & FLOW_TABLE_BIT_SZ)
  static bool has_required_caps(struct mlx5_core_dev *dev, struct node_caps *caps)
@@@ -1822,7 -1830,7 +1830,7 @@@ static int create_anchor_flow_table(str
        struct mlx5_flow_table *ft;
  
        ns = mlx5_get_flow_namespace(steering->dev, MLX5_FLOW_NAMESPACE_ANCHOR);
 -      if (!ns)
 +      if (WARN_ON(!ns))
                return -EINVAL;
        ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE, ANCHOR_LEVEL, 0);
        if (IS_ERR(ft)) {
index 1bc4641734da943e80af43af5c82e4eda0a62319,886ff2b00500b2037279aa472c45410c4b55089e..2fcff6b4503f6a4824bea50c189b072ef6c486cb
@@@ -121,15 -121,10 +121,15 @@@ enum 
        MLX5_REG_PVLC            = 0x500f,
        MLX5_REG_PCMR            = 0x5041,
        MLX5_REG_PMLP            = 0x5002,
 +      MLX5_REG_PCAM            = 0x507f,
        MLX5_REG_NODE_DESC       = 0x6001,
        MLX5_REG_HOST_ENDIANNESS = 0x7004,
        MLX5_REG_MCIA            = 0x9014,
        MLX5_REG_MLCR            = 0x902b,
 +      MLX5_REG_MPCNT           = 0x9051,
 +      MLX5_REG_MTPPS           = 0x9053,
 +      MLX5_REG_MTPPSE          = 0x9054,
 +      MLX5_REG_MCAM            = 0x907f,
  };
  
  enum mlx5_dcbx_oper_mode {
@@@ -177,7 -172,6 +177,7 @@@ enum mlx5_dev_event 
        MLX5_DEV_EVENT_PKEY_CHANGE,
        MLX5_DEV_EVENT_GUID_CHANGE,
        MLX5_DEV_EVENT_CLIENT_REREG,
 +      MLX5_DEV_EVENT_PPS,
  };
  
  enum mlx5_port_status {
@@@ -295,6 -289,7 +295,7 @@@ struct mlx5_port_caps 
        int     gid_table_len;
        int     pkey_table_len;
        u8      ext_port_cap;
+       bool    has_smi;
  };
  
  struct mlx5_cmd_mailbox {
@@@ -738,12 -733,8 +739,12 @@@ struct mlx5_core_dev 
        char                    board_id[MLX5_BOARD_ID_LEN];
        struct mlx5_cmd         cmd;
        struct mlx5_port_caps   port_caps[MLX5_MAX_PORTS];
 -      u32 hca_caps_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 -      u32 hca_caps_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 +      struct {
 +              u32 hca_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 +              u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 +              u32 pcam[MLX5_ST_SZ_DW(pcam_reg)];
 +              u32 mcam[MLX5_ST_SZ_DW(mcam_reg)];
 +      } caps;
        phys_addr_t             iseg_base;
        struct mlx5_init_seg __iomem *iseg;
        enum mlx5_device_state  state;
@@@ -1061,7 -1052,10 +1062,10 @@@ enum 
  };
  
  enum {
-       MAX_MR_CACHE_ENTRIES    = 21,
+       MAX_UMR_CACHE_ENTRY = 20,
+       MLX5_IMR_MTT_CACHE_ENTRY,
+       MLX5_IMR_KSM_CACHE_ENTRY,
+       MAX_MR_CACHE_ENTRIES
  };
  
  enum {
index afcd4736d8df7b57450e6d94c1b67bea7f55e561,2d197d8a7025ba345c8420aaaa6b5e67f078e49f..838242697541a28fdda4d90bf7b604e25f3bfba2
@@@ -365,8 -365,8 +365,8 @@@ struct mlx5_ifc_fte_match_set_lyr_2_4_b
        u8         ip_protocol[0x8];
        u8         ip_dscp[0x6];
        u8         ip_ecn[0x2];
 -      u8         vlan_tag[0x1];
 -      u8         reserved_at_91[0x1];
 +      u8         cvlan_tag[0x1];
 +      u8         svlan_tag[0x1];
        u8         frag[0x1];
        u8         reserved_at_93[0x4];
        u8         tcp_flags[0x9];
@@@ -398,11 -398,9 +398,11 @@@ struct mlx5_ifc_fte_match_set_misc_bit
        u8         inner_second_cfi[0x1];
        u8         inner_second_vid[0xc];
  
 -      u8         outer_second_vlan_tag[0x1];
 -      u8         inner_second_vlan_tag[0x1];
 -      u8         reserved_at_62[0xe];
 +      u8         outer_second_cvlan_tag[0x1];
 +      u8         inner_second_cvlan_tag[0x1];
 +      u8         outer_second_svlan_tag[0x1];
 +      u8         inner_second_svlan_tag[0x1];
 +      u8         reserved_at_64[0xc];
        u8         gre_protocol[0x10];
  
        u8         gre_key_h[0x18];
@@@ -547,9 -545,7 +547,9 @@@ struct mlx5_ifc_e_switch_cap_bits 
  struct mlx5_ifc_qos_cap_bits {
        u8         packet_pacing[0x1];
        u8         esw_scheduling[0x1];
 -      u8         reserved_at_2[0x1e];
 +      u8         esw_bw_share[0x1];
 +      u8         esw_rate_limit[0x1];
 +      u8         reserved_at_4[0x1c];
  
        u8         reserved_at_20[0x20];
  
@@@ -577,8 -573,7 +577,8 @@@ struct mlx5_ifc_per_protocol_networking
        u8         lro_cap[0x1];
        u8         lro_psh_flag[0x1];
        u8         lro_time_stamp[0x1];
 -      u8         reserved_at_5[0x3];
 +      u8         reserved_at_5[0x2];
 +      u8         wqe_vlan_insert[0x1];
        u8         self_lb_en_modifiable[0x1];
        u8         reserved_at_9[0x2];
        u8         max_lso_cap[0x5];
@@@ -805,12 -800,10 +805,12 @@@ struct mlx5_ifc_cmd_hca_cap_bits 
        u8         reserved_at_150[0xa];
        u8         log_max_ra_res_qp[0x6];
  
 -      u8         pad_cap[0x1];
 +      u8         end_pad[0x1];
        u8         cc_query_allowed[0x1];
        u8         cc_modify_allowed[0x1];
 -      u8         reserved_at_163[0xd];
 +      u8         start_pad[0x1];
 +      u8         cache_line_128byte[0x1];
 +      u8         reserved_at_163[0xb];
        u8         gid_table_size[0x10];
  
        u8         out_of_seq_cnt[0x1];
        u8         nic_flow_table[0x1];
        u8         eswitch_flow_table[0x1];
        u8         early_vf_enable[0x1];
 -      u8         reserved_at_1a9[0x2];
 +      u8         mcam_reg[0x1];
 +      u8         pcam_reg[0x1];
        u8         local_ca_ack_delay[0x5];
        u8         port_module_event[0x1];
        u8         reserved_at_1b1[0x1];
        u8         port_type[0x2];
        u8         num_ports[0x8];
  
 -      u8         reserved_at_1c0[0x3];
 +      u8         reserved_at_1c0[0x1];
 +      u8         pps[0x1];
 +      u8         pps_modify[0x1];
        u8         log_max_msg[0x5];
        u8         reserved_at_1c8[0x4];
        u8         max_tc[0x4];
@@@ -1389,42 -1379,6 +1389,42 @@@ struct mlx5_ifc_phys_layer_cntrs_bits 
        u8         reserved_at_640[0x180];
  };
  
 +struct mlx5_ifc_phys_layer_statistical_cntrs_bits {
 +      u8         time_since_last_clear_high[0x20];
 +
 +      u8         time_since_last_clear_low[0x20];
 +
 +      u8         phy_received_bits_high[0x20];
 +
 +      u8         phy_received_bits_low[0x20];
 +
 +      u8         phy_symbol_errors_high[0x20];
 +
 +      u8         phy_symbol_errors_low[0x20];
 +
 +      u8         phy_corrected_bits_high[0x20];
 +
 +      u8         phy_corrected_bits_low[0x20];
 +
 +      u8         phy_corrected_bits_lane0_high[0x20];
 +
 +      u8         phy_corrected_bits_lane0_low[0x20];
 +
 +      u8         phy_corrected_bits_lane1_high[0x20];
 +
 +      u8         phy_corrected_bits_lane1_low[0x20];
 +
 +      u8         phy_corrected_bits_lane2_high[0x20];
 +
 +      u8         phy_corrected_bits_lane2_low[0x20];
 +
 +      u8         phy_corrected_bits_lane3_high[0x20];
 +
 +      u8         phy_corrected_bits_lane3_low[0x20];
 +
 +      u8         reserved_at_200[0x5c0];
 +};
 +
  struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits {
        u8         symbol_error_counter[0x10];
  
@@@ -1807,30 -1761,6 +1807,30 @@@ struct mlx5_ifc_eth_802_3_cntrs_grp_dat
        u8         reserved_at_4c0[0x300];
  };
  
 +struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits {
 +      u8         life_time_counter_high[0x20];
 +
 +      u8         life_time_counter_low[0x20];
 +
 +      u8         rx_errors[0x20];
 +
 +      u8         tx_errors[0x20];
 +
 +      u8         l0_to_recovery_eieos[0x20];
 +
 +      u8         l0_to_recovery_ts[0x20];
 +
 +      u8         l0_to_recovery_framing[0x20];
 +
 +      u8         l0_to_recovery_retrain[0x20];
 +
 +      u8         crc_error_dllp[0x20];
 +
 +      u8         crc_error_tlp[0x20];
 +
 +      u8         reserved_at_140[0x680];
 +};
 +
  struct mlx5_ifc_cmd_inter_comp_event_bits {
        u8         command_completion_vector[0x20];
  
@@@ -2993,12 -2923,6 +2993,12 @@@ union mlx5_ifc_eth_cntrs_grp_data_layou
        struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout;
        struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits ib_port_cntrs_grp_data_layout;
        struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs;
 +      struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs;
 +      u8         reserved_at_0[0x7c0];
 +};
 +
 +union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits {
 +      struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits pcie_perf_cntrs_grp_data_layout;
        u8         reserved_at_0[0x7c0];
  };
  
@@@ -5013,7 -4937,7 +5013,7 @@@ struct mlx5_ifc_modify_rq_out_bits 
  
  enum {
        MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD = 1ULL << 1,
-       MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID = 1ULL << 3,
+       MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID = 1ULL << 3,
  };
  
  struct mlx5_ifc_modify_rq_in_bits {
@@@ -7324,18 -7248,6 +7324,18 @@@ struct mlx5_ifc_ppcnt_reg_bits 
        union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
  };
  
 +struct mlx5_ifc_mpcnt_reg_bits {
 +      u8         reserved_at_0[0x8];
 +      u8         pcie_index[0x8];
 +      u8         reserved_at_10[0xa];
 +      u8         grp[0x6];
 +
 +      u8         clr[0x1];
 +      u8         reserved_at_21[0x1f];
 +
 +      union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits counter_set;
 +};
 +
  struct mlx5_ifc_ppad_reg_bits {
        u8         reserved_at_0[0x3];
        u8         single_mac[0x1];
@@@ -7565,63 -7477,6 +7565,63 @@@ struct mlx5_ifc_peir_reg_bits 
        u8         error_type[0x8];
  };
  
 +struct mlx5_ifc_pcam_enhanced_features_bits {
 +      u8         reserved_at_0[0x7e];
 +
 +      u8         ppcnt_discard_group[0x1];
 +      u8         ppcnt_statistical_group[0x1];
 +};
 +
 +struct mlx5_ifc_pcam_reg_bits {
 +      u8         reserved_at_0[0x8];
 +      u8         feature_group[0x8];
 +      u8         reserved_at_10[0x8];
 +      u8         access_reg_group[0x8];
 +
 +      u8         reserved_at_20[0x20];
 +
 +      union {
 +              u8         reserved_at_0[0x80];
 +      } port_access_reg_cap_mask;
 +
 +      u8         reserved_at_c0[0x80];
 +
 +      union {
 +              struct mlx5_ifc_pcam_enhanced_features_bits enhanced_features;
 +              u8         reserved_at_0[0x80];
 +      } feature_cap_mask;
 +
 +      u8         reserved_at_1c0[0xc0];
 +};
 +
 +struct mlx5_ifc_mcam_enhanced_features_bits {
 +      u8         reserved_at_0[0x7f];
 +
 +      u8         pcie_performance_group[0x1];
 +};
 +
 +struct mlx5_ifc_mcam_reg_bits {
 +      u8         reserved_at_0[0x8];
 +      u8         feature_group[0x8];
 +      u8         reserved_at_10[0x8];
 +      u8         access_reg_group[0x8];
 +
 +      u8         reserved_at_20[0x20];
 +
 +      union {
 +              u8         reserved_at_0[0x80];
 +      } mng_access_reg_cap_mask;
 +
 +      u8         reserved_at_c0[0x80];
 +
 +      union {
 +              struct mlx5_ifc_mcam_enhanced_features_bits enhanced_features;
 +              u8         reserved_at_0[0x80];
 +      } mng_feature_cap_mask;
 +
 +      u8         reserved_at_1c0[0x80];
 +};
 +
  struct mlx5_ifc_pcap_reg_bits {
        u8         reserved_at_0[0x8];
        u8         local_port[0x8];
@@@ -7966,60 -7821,6 +7966,60 @@@ struct mlx5_ifc_initial_seg_bits 
        u8         reserved_at_80a0[0x17fc0];
  };
  
 +struct mlx5_ifc_mtpps_reg_bits {
 +      u8         reserved_at_0[0xc];
 +      u8         cap_number_of_pps_pins[0x4];
 +      u8         reserved_at_10[0x4];
 +      u8         cap_max_num_of_pps_in_pins[0x4];
 +      u8         reserved_at_18[0x4];
 +      u8         cap_max_num_of_pps_out_pins[0x4];
 +
 +      u8         reserved_at_20[0x24];
 +      u8         cap_pin_3_mode[0x4];
 +      u8         reserved_at_48[0x4];
 +      u8         cap_pin_2_mode[0x4];
 +      u8         reserved_at_50[0x4];
 +      u8         cap_pin_1_mode[0x4];
 +      u8         reserved_at_58[0x4];
 +      u8         cap_pin_0_mode[0x4];
 +
 +      u8         reserved_at_60[0x4];
 +      u8         cap_pin_7_mode[0x4];
 +      u8         reserved_at_68[0x4];
 +      u8         cap_pin_6_mode[0x4];
 +      u8         reserved_at_70[0x4];
 +      u8         cap_pin_5_mode[0x4];
 +      u8         reserved_at_78[0x4];
 +      u8         cap_pin_4_mode[0x4];
 +
 +      u8         reserved_at_80[0x80];
 +
 +      u8         enable[0x1];
 +      u8         reserved_at_101[0xb];
 +      u8         pattern[0x4];
 +      u8         reserved_at_110[0x4];
 +      u8         pin_mode[0x4];
 +      u8         pin[0x8];
 +
 +      u8         reserved_at_120[0x20];
 +
 +      u8         time_stamp[0x40];
 +
 +      u8         out_pulse_duration[0x10];
 +      u8         out_periodic_adjustment[0x10];
 +
 +      u8         reserved_at_1a0[0x60];
 +};
 +
 +struct mlx5_ifc_mtppse_reg_bits {
 +      u8         reserved_at_0[0x18];
 +      u8         pin[0x8];
 +      u8         event_arm[0x1];
 +      u8         reserved_at_21[0x1b];
 +      u8         event_generation_mode[0x4];
 +      u8         reserved_at_40[0x40];
 +};
 +
  union mlx5_ifc_ports_control_registers_document_bits {
        struct mlx5_ifc_bufferx_reg_bits bufferx_reg;
        struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
        struct mlx5_ifc_pmtu_reg_bits pmtu_reg;
        struct mlx5_ifc_ppad_reg_bits ppad_reg;
        struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg;
 +      struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg;
        struct mlx5_ifc_pplm_reg_bits pplm_reg;
        struct mlx5_ifc_pplr_reg_bits pplr_reg;
        struct mlx5_ifc_ppsc_reg_bits ppsc_reg;
        struct mlx5_ifc_pvlc_reg_bits pvlc_reg;
        struct mlx5_ifc_slrg_reg_bits slrg_reg;
        struct mlx5_ifc_sltp_reg_bits sltp_reg;
 +      struct mlx5_ifc_mtpps_reg_bits mtpps_reg;
 +      struct mlx5_ifc_mtppse_reg_bits mtppse_reg;
        u8         reserved_at_0[0x60e0];
  };
  
diff --combined include/rdma/ib_verbs.h
index 8c61532cf5218e214f75f5dd50be74f30dcac5d7,07399023352b02533082ce17e871a1f2debe645d..89f5bd4e1d5201c847ff77823b6b4159a741ef2f
@@@ -207,6 -207,7 +207,7 @@@ enum ib_device_cap_flags 
        IB_DEVICE_MEM_WINDOW_TYPE_2A            = (1 << 23),
        IB_DEVICE_MEM_WINDOW_TYPE_2B            = (1 << 24),
        IB_DEVICE_RC_IP_CSUM                    = (1 << 25),
+       /* Deprecated. Please use IB_RAW_PACKET_CAP_IP_CSUM. */
        IB_DEVICE_RAW_IP_CSUM                   = (1 << 26),
        /*
         * Devices should set IB_DEVICE_CROSS_CHANNEL if they
        IB_DEVICE_ON_DEMAND_PAGING              = (1ULL << 31),
        IB_DEVICE_SG_GAPS_REG                   = (1ULL << 32),
        IB_DEVICE_VIRTUAL_FUNCTION              = (1ULL << 33),
+       /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */
        IB_DEVICE_RAW_SCATTER_FCS               = (1ULL << 34),
  };
  
@@@ -241,7 -243,8 +243,8 @@@ enum ib_atomic_cap 
  };
  
  enum ib_odp_general_cap_bits {
-       IB_ODP_SUPPORT = 1 << 0,
+       IB_ODP_SUPPORT          = 1 << 0,
+       IB_ODP_SUPPORT_IMPLICIT = 1 << 1,
  };
  
  enum ib_odp_transport_cap_bits {
@@@ -330,6 -333,7 +333,7 @@@ struct ib_device_attr 
        uint64_t                hca_core_clock; /* in KHZ */
        struct ib_rss_caps      rss_caps;
        u32                     max_wq_type_rq;
+       u32                     raw_packet_caps; /* Use ib_raw_packet_caps enum */
  };
  
  enum ib_mtu {
@@@ -352,20 -356,6 +356,20 @@@ static inline int ib_mtu_enum_to_int(en
        }
  }
  
 +static inline enum ib_mtu ib_mtu_int_to_enum(int mtu)
 +{
 +      if (mtu >= 4096)
 +              return IB_MTU_4096;
 +      else if (mtu >= 2048)
 +              return IB_MTU_2048;
 +      else if (mtu >= 1024)
 +              return IB_MTU_1024;
 +      else if (mtu >= 512)
 +              return IB_MTU_512;
 +      else
 +              return IB_MTU_256;
 +}
 +
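
The new ib_mtu_int_to_enum() above rounds a byte MTU down to the closest enum value and is the inverse-direction companion of ib_mtu_enum_to_int(). A tiny illustrative use, with an arbitrarily chosen value:

	/* Illustration only: 1500 rounds down to IB_MTU_1024, and converting
	 * that enum back yields 1024 bytes.
	 */
	enum ib_mtu mtu = ib_mtu_int_to_enum(1500);	/* IB_MTU_1024 */
	int bytes = ib_mtu_enum_to_int(mtu);		/* 1024 */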
  enum ib_port_state {
        IB_PORT_NOP             = 0,
        IB_PORT_DOWN            = 1,
@@@ -499,6 -489,8 +503,8 @@@ static inline struct rdma_hw_stats *rdm
  #define RDMA_CORE_CAP_PROT_ROCE         0x00200000
  #define RDMA_CORE_CAP_PROT_IWARP        0x00400000
  #define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000
+ #define RDMA_CORE_CAP_PROT_RAW_PACKET   0x01000000
+ #define RDMA_CORE_CAP_PROT_USNIC        0x02000000
  
  #define RDMA_CORE_PORT_IBA_IB          (RDMA_CORE_CAP_PROT_IB  \
                                        | RDMA_CORE_CAP_IB_MAD \
  #define RDMA_CORE_PORT_INTEL_OPA       (RDMA_CORE_PORT_IBA_IB  \
                                        | RDMA_CORE_CAP_OPA_MAD)
  
+ #define RDMA_CORE_PORT_RAW_PACKET     (RDMA_CORE_CAP_PROT_RAW_PACKET)
+ #define RDMA_CORE_PORT_USNIC          (RDMA_CORE_CAP_PROT_USNIC)
  struct ib_port_attr {
        u64                     subnet_prefix;
        enum ib_port_state      state;
@@@ -1019,6 -1015,7 +1029,7 @@@ enum ib_qp_create_flags 
        IB_QP_CREATE_SIGNATURE_EN               = 1 << 6,
        IB_QP_CREATE_USE_GFP_NOIO               = 1 << 7,
        IB_QP_CREATE_SCATTER_FCS                = 1 << 8,
+       IB_QP_CREATE_CVLAN_STRIPPING            = 1 << 9,
        /* reserve bits 26-31 for low level drivers' internal use */
        IB_QP_CREATE_RESERVED_START             = 1 << 26,
        IB_QP_CREATE_RESERVED_END               = 1 << 31,
@@@ -1470,6 -1467,18 +1481,18 @@@ struct ib_srq 
        } ext;
  };
  
+ enum ib_raw_packet_caps {
+       /* Stripping the cvlan from an incoming packet and reporting it in the
+        * matching work completion is supported.
+        */
+       IB_RAW_PACKET_CAP_CVLAN_STRIPPING       = (1 << 0),
+       /* Scattering the FCS field of an incoming packet to host memory is
+        * supported.
+        */
+       IB_RAW_PACKET_CAP_SCATTER_FCS           = (1 << 1),
+       /* Checksum offloads are supported (for both send and receive). */
+       IB_RAW_PACKET_CAP_IP_CSUM               = (1 << 2),
+ };
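
A hedged sketch of how a consumer could combine these bits with the raw_packet_caps field added to struct ib_device_attr above; 'device' and 'init_attr' are hypothetical locals, and the sketch assumes the device attributes are reachable via the usual attrs member of struct ib_device:

	/* Illustration only: ask for cvlan stripping on a raw packet QP only
	 * when the device advertises support for it.
	 */
	if (device->attrs.raw_packet_caps & IB_RAW_PACKET_CAP_CVLAN_STRIPPING)
		init_attr.create_flags |= IB_QP_CREATE_CVLAN_STRIPPING;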
  enum ib_wq_type {
        IB_WQT_RQ
  };
@@@ -1493,6 -1502,11 +1516,11 @@@ struct ib_wq 
        atomic_t                usecnt;
  };
  
+ enum ib_wq_flags {
+       IB_WQ_FLAGS_CVLAN_STRIPPING     = 1 << 0,
+       IB_WQ_FLAGS_SCATTER_FCS         = 1 << 1,
+ };
  struct ib_wq_init_attr {
        void                   *wq_context;
        enum ib_wq_type wq_type;
        u32             max_sge;
        struct  ib_cq          *cq;
        void                (*event_handler)(struct ib_event *, void *);
+       u32             create_flags; /* Use enum ib_wq_flags */
  };
  
  enum ib_wq_attr_mask {
-       IB_WQ_STATE     = 1 << 0,
-       IB_WQ_CUR_STATE = 1 << 1,
+       IB_WQ_STATE             = 1 << 0,
+       IB_WQ_CUR_STATE         = 1 << 1,
+       IB_WQ_FLAGS             = 1 << 2,
  };
  
  struct ib_wq_attr {
        enum    ib_wq_state     wq_state;
        enum    ib_wq_state     curr_wq_state;
+       u32                     flags; /* Use enum ib_wq_flags */
+       u32                     flags_mask; /* Use enum ib_wq_flags */
  };
  
  struct ib_rwq_ind_table {
@@@ -1618,6 -1636,8 +1650,8 @@@ enum ib_flow_spec_type 
        IB_FLOW_SPEC_UDP                = 0x41,
        IB_FLOW_SPEC_VXLAN_TUNNEL       = 0x50,
        IB_FLOW_SPEC_INNER              = 0x100,
+       /* Actions */
+       IB_FLOW_SPEC_ACTION_TAG         = 0x1000,
  };
  #define IB_FLOW_SPEC_LAYER_MASK       0xF0
  #define IB_FLOW_SPEC_SUPPORT_LAYERS 8
@@@ -1740,6 -1760,12 +1774,12 @@@ struct ib_flow_spec_tunnel 
        struct ib_flow_tunnel_filter  mask;
  };
  
+ struct ib_flow_spec_action_tag {
+       enum ib_flow_spec_type        type;
+       u16                           size;
+       u32                           tag_id;
+ };
  union ib_flow_spec {
        struct {
                u32                     type;
        struct ib_flow_spec_tcp_udp     tcp_udp;
        struct ib_flow_spec_ipv6        ipv6;
        struct ib_flow_spec_tunnel      tunnel;
+       struct ib_flow_spec_action_tag  flow_tag;
  };
  
  struct ib_flow_attr {
@@@ -1789,17 -1816,12 +1830,17 @@@ enum ib_mad_result 
  
  #define IB_DEVICE_NAME_MAX 64
  
 +struct ib_port_cache {
 +      struct ib_pkey_cache  *pkey;
 +      struct ib_gid_table   *gid;
 +      u8                     lmc;
 +      enum ib_port_state     port_state;
 +};
 +
  struct ib_cache {
        rwlock_t                lock;
        struct ib_event_handler event_handler;
 -      struct ib_pkey_cache  **pkey_cache;
 -      struct ib_gid_table   **gid_cache;
 -      u8                     *lmc_cache;
 +      struct ib_port_cache   *ports;
  };
  
  struct ib_dma_mapping_ops {
@@@ -2294,13 -2316,6 +2335,13 @@@ static inline u8 rdma_end_port(const st
        return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt;
  }
  
 +static inline int rdma_is_port_valid(const struct ib_device *device,
 +                                   unsigned int port)
 +{
 +      return (port >= rdma_start_port(device) &&
 +              port <= rdma_end_port(device));
 +}
 +
  static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num)
  {
        return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB;
@@@ -2333,6 -2348,16 +2374,16 @@@ static inline bool rdma_ib_or_roce(cons
                rdma_protocol_roce(device, port_num);
  }
  
+ static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u8 port_num)
+ {
+       return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET;
+ }
+ static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_num)
+ {
+       return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_USNIC;
+ }
  /**
   * rdma_cap_ib_mad - Check if the port of a device supports Infiniband
   * Management Datagrams.
index f4f87cff6dc6c5fc9f1a91e32c9e8d3a601f10c8,f8723580ffed0258dc3c9c58626c131785bec15f..997f904c76923057a376e0f4f0772ac82dcacef7
@@@ -37,6 -37,7 +37,6 @@@
  #define IB_USER_VERBS_H
  
  #include <linux/types.h>
 -#include <rdma/ib_verbs.h>
  
  /*
   * Increment this value if any changes that break userspace ABI
@@@ -246,7 -247,7 +246,7 @@@ struct ib_uverbs_ex_query_device_resp 
        __u64 device_cap_flags_ex;
        struct ib_uverbs_rss_caps rss_caps;
        __u32  max_wq_type_rq;
-       __u32 reserved;
+       __u32 raw_packet_caps;
  };
  
  struct ib_uverbs_query_port {
@@@ -547,17 -548,11 +547,17 @@@ enum 
  };
  
  enum {
 -      IB_USER_LEGACY_LAST_QP_ATTR_MASK = IB_QP_DEST_QPN
 +      /*
 +       * This value is equal to IB_QP_DEST_QPN.
 +       */
 +      IB_USER_LEGACY_LAST_QP_ATTR_MASK = 1ULL << 20,
  };
  
  enum {
 -      IB_USER_LAST_QP_ATTR_MASK = IB_QP_RATE_LIMIT
 +      /*
 +       * This value is equal to IB_QP_RATE_LIMIT.
 +       */
 +      IB_USER_LAST_QP_ATTR_MASK = 1ULL << 25,
  };
  
  struct ib_uverbs_ex_create_qp {
@@@ -934,6 -929,19 +934,19 @@@ struct ib_uverbs_flow_spec_ipv6 
        struct ib_uverbs_flow_ipv6_filter mask;
  };
  
+ struct ib_uverbs_flow_spec_action_tag {
+       union {
+               struct ib_uverbs_flow_spec_hdr hdr;
+               struct {
+                       __u32 type;
+                       __u16 size;
+                       __u16 reserved;
+               };
+       };
+       __u32                         tag_id;
+       __u32                         reserved1;
+ };
  struct ib_uverbs_flow_tunnel_filter {
        __be32 tunnel_id;
  };
@@@ -1053,6 -1061,8 +1066,8 @@@ struct ib_uverbs_ex_create_wq  
        __u32 cq_handle;
        __u32 max_wr;
        __u32 max_sge;
+       __u32 create_flags; /* Use enum ib_wq_flags */
+       __u32 reserved;
  };
  
  struct ib_uverbs_ex_create_wq_resp {
@@@ -1081,6 -1091,8 +1096,8 @@@ struct ib_uverbs_ex_modify_wq  
        __u32 wq_handle;
        __u32 wq_state;
        __u32 curr_wq_state;
+       __u32 flags; /* Use enum ib_wq_flags */
+       __u32 flags_mask; /* Use enum ib_wq_flags */
  };
  
  /* Prevent memory allocation rather than max expected size */