]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/commitdiff
RDMA/rxe: Fix soft lockup problem due to using tasklets in softirq
authorZhu Yanjun <yanjunz@mellanox.com>
Wed, 12 Feb 2020 07:26:33 +0000 (09:26 +0200)
committerKhalid Elmously <khalid.elmously@canonical.com>
Fri, 13 Mar 2020 04:31:00 +0000 (00:31 -0400)
BugLink: https://bugs.launchpad.net/bugs/1867194
commit 8ac0e6641c7ca14833a2a8c6f13d8e0a435e535c upstream.

When run stress tests with RXE, the following Call Traces often occur

  watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [swapper/2:0]
  ...
  Call Trace:
  <IRQ>
  create_object+0x3f/0x3b0
  kmem_cache_alloc_node_trace+0x129/0x2d0
  __kmalloc_reserve.isra.52+0x2e/0x80
  __alloc_skb+0x83/0x270
  rxe_init_packet+0x99/0x150 [rdma_rxe]
  rxe_requester+0x34e/0x11a0 [rdma_rxe]
  rxe_do_task+0x85/0xf0 [rdma_rxe]
  tasklet_action_common.isra.21+0xeb/0x100
  __do_softirq+0xd0/0x298
  irq_exit+0xc5/0xd0
  smp_apic_timer_interrupt+0x68/0x120
  apic_timer_interrupt+0xf/0x20
  </IRQ>
  ...

The root cause is that tasklet is actually a softirq. In a tasklet
handler, another softirq handler is triggered. Usually these softirq
handlers run on the same cpu core. So this will cause "soft lockup Bug".

Fixes: 8700e3e7c485 ("Soft RoCE driver")
Link: https://lore.kernel.org/r/20200212072635.682689-8-leon@kernel.org
Signed-off-by: Zhu Yanjun <yanjunz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Kamal Mostafa <kamal@canonical.com>
Signed-off-by: Khalid Elmously <khalid.elmously@canonical.com>
drivers/infiniband/sw/rxe/rxe_comp.c

index 83962c2e493a7ade22287d31bce0dcd2c5cc26a0..f0b040644fc214de548908ecb0f917261bac44cf 100644 (file)
@@ -329,7 +329,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp,
                                        qp->comp.psn = pkt->psn;
                                        if (qp->req.wait_psn) {
                                                qp->req.wait_psn = 0;
-                                               rxe_run_task(&qp->req.task, 1);
+                                               rxe_run_task(&qp->req.task, 0);
                                        }
                                }
                                return COMPST_ERROR_RETRY;
@@ -459,7 +459,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
         */
        if (qp->req.wait_fence) {
                qp->req.wait_fence = 0;
-               rxe_run_task(&qp->req.task, 1);
+               rxe_run_task(&qp->req.task, 0);
        }
 }
 
@@ -475,7 +475,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp,
                if (qp->req.need_rd_atomic) {
                        qp->comp.timeout_retry = 0;
                        qp->req.need_rd_atomic = 0;
-                       rxe_run_task(&qp->req.task, 1);
+                       rxe_run_task(&qp->req.task, 0);
                }
        }
 
@@ -723,7 +723,7 @@ int rxe_completer(void *arg)
                                                        RXE_CNT_COMP_RETRY);
                                        qp->req.need_retry = 1;
                                        qp->comp.started_retry = 1;
-                                       rxe_run_task(&qp->req.task, 1);
+                                       rxe_run_task(&qp->req.task, 0);
                                }
 
                                if (pkt) {