1// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
2/*
3 * Copyright(c) 2018 Intel Corporation.
4 *
5 */
6
7#include "hfi.h"
8#include "qp.h"
9#include "rc.h"
10#include "verbs.h"
11#include "tid_rdma.h"
12#include "exp_rcv.h"
13#include "trace.h"
14
15/**
16 * DOC: TID RDMA READ protocol
17 *
18 * This is an end-to-end protocol at the hfi1 level between two nodes that
19 * improves performance by avoiding data copy on the requester side. It
20 * converts a qualified RDMA READ request into a TID RDMA READ request on
21 * the requester side and thereafter handles the request and response
22 * differently. To be qualified, the RDMA READ request should meet the
23 * following:
24 * -- The total data length should be greater than 256K;
25 * -- The total data length should be a multiple of 4K page size;
26 * -- Each local scatter-gather entry should be 4K page aligned;
27 * -- Each local scatter-gather entry should be a multiple of 4K page size;
28 */
29
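/*
 * Illustrative sketch only (not part of the driver): one way the
 * qualification rules above could be expressed.  The sge layout below is
 * a hypothetical stand-in for the driver's real rvt_sge checks.
 */
struct example_sge {
	u64 addr;
	u32 len;
};

static bool __maybe_unused example_tid_read_qualifies(const struct example_sge *sgl,
						      int num_sge, u64 total_len)
{
	int i;

	/* total length must exceed 256K and be a multiple of 4K */
	if (total_len <= SZ_256K || !IS_ALIGNED(total_len, SZ_4K))
		return false;
	/* each local scatter-gather entry must be 4K aligned and 4K sized */
	for (i = 0; i < num_sge; i++)
		if (!IS_ALIGNED(sgl[i].addr, SZ_4K) ||
		    !IS_ALIGNED(sgl[i].len, SZ_4K))
			return false;
	return true;
}
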
30#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
31#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
32#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
33#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
34#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
35#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
36
37/* Maximum number of packets within a flow generation. */
38#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
39
40#define GENERATION_MASK 0xFFFFF
41
42static u32 mask_generation(u32 a)
43{
44 return a & GENERATION_MASK;
45}
46
47/* Reserved generation value to set to unused flows for kernel contexts */
48#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
49
50/*
51 * J_KEY for kernel contexts when TID RDMA is used.
52 * See generate_jkey() in hfi.h for more information.
53 */
54#define TID_RDMA_JKEY 32
55#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
56#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
57
58/* Maximum number of segments in flight per QP request. */
59#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
60#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
61#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
62 TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
63#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
64
65#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE)
66
67#define TID_RDMA_DESTQP_FLOW_SHIFT 11
68#define TID_RDMA_DESTQP_FLOW_MASK 0x1f
69
70#define TID_FLOW_SW_PSN BIT(0)
71
72#define TID_OPFN_QP_CTXT_MASK 0xff
73#define TID_OPFN_QP_CTXT_SHIFT 56
74#define TID_OPFN_QP_KDETH_MASK 0xff
75#define TID_OPFN_QP_KDETH_SHIFT 48
76#define TID_OPFN_MAX_LEN_MASK 0x7ff
77#define TID_OPFN_MAX_LEN_SHIFT 37
78#define TID_OPFN_TIMEOUT_MASK 0x1f
79#define TID_OPFN_TIMEOUT_SHIFT 32
80#define TID_OPFN_RESERVED_MASK 0x3f
81#define TID_OPFN_RESERVED_SHIFT 26
82#define TID_OPFN_URG_MASK 0x1
83#define TID_OPFN_URG_SHIFT 25
84#define TID_OPFN_VER_MASK 0x7
85#define TID_OPFN_VER_SHIFT 22
86#define TID_OPFN_JKEY_MASK 0x3f
87#define TID_OPFN_JKEY_SHIFT 16
88#define TID_OPFN_MAX_READ_MASK 0x3f
89#define TID_OPFN_MAX_READ_SHIFT 10
90#define TID_OPFN_MAX_WRITE_MASK 0x3f
91#define TID_OPFN_MAX_WRITE_SHIFT 4
92
93/*
94 * OPFN TID layout
95 *
96 * 63 47 31 15
97 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
98 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
99 * N - the context Number
100 * K - the Kdeth_qp
101 * M - Max_len
102 * T - Timeout
103 * D - reserveD
104 * V - version
105 * U - Urg capable
106 * J - Jkey
107 * R - max_Read
108 * W - max_Write
109 * C - Capcode
110 */
111
112static u32 tid_rdma_flow_wt;
113
114static void tid_rdma_trigger_resume(struct work_struct *work);
115static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
116static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
117 gfp_t gfp);
118static void hfi1_init_trdma_req(struct rvt_qp *qp,
119 struct tid_rdma_request *req);
120static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
121static void hfi1_tid_timeout(struct timer_list *t);
122static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
123static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
124static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
125static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
126static void hfi1_tid_retry_timeout(struct timer_list *t);
127static int make_tid_rdma_ack(struct rvt_qp *qp,
128 struct ib_other_headers *ohdr,
129 struct hfi1_pkt_state *ps);
130static void hfi1_do_tid_send(struct rvt_qp *qp);
131
132static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
133{
134 return
135 (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
136 TID_OPFN_QP_CTXT_SHIFT) |
137 ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
138 TID_OPFN_QP_KDETH_SHIFT) |
139 (((u64)((p->max_len >> PAGE_SHIFT) - 1) &
140 TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
141 (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
142 TID_OPFN_TIMEOUT_SHIFT) |
143 (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
144 (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
145 (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
146 TID_OPFN_MAX_READ_SHIFT) |
147 (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
148 TID_OPFN_MAX_WRITE_SHIFT);
149}
150
151static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
152{
153 p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
154 TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
155 p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
156 p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
157 TID_OPFN_MAX_WRITE_MASK;
158 p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
159 TID_OPFN_MAX_READ_MASK;
160 p->qp =
161 ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
162 << 16) |
163 ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
164 p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
165 p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
166}
167
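/*
 * Illustrative sketch only: a round trip through the encode/decode helpers
 * above for one hypothetical parameter set.  The field values are examples,
 * not the defaults the driver negotiates.
 */
static void __maybe_unused tid_rdma_opfn_example(void)
{
	struct tid_rdma_params in = {
		.qp = (0x80 << 16) | 0x12,	/* KDETH QP base | recv context */
		.max_len = 64 * PAGE_SIZE,	/* 256KB with 4K pages; page multiple */
		.jkey = TID_RDMA_JKEY,
		.max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ,
		.max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ,
		.timeout = 17,
		.urg = 0,
	};
	struct tid_rdma_params out = {};

	tid_rdma_opfn_decode(&out, tid_rdma_opfn_encode(&in));
	WARN_ON(out.qp != in.qp || out.max_len != in.max_len ||
		out.jkey != in.jkey || out.timeout != in.timeout);
}
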
168void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
169{
170 struct hfi1_qp_priv *priv = qp->priv;
171
172 p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
173 p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
174 p->jkey = priv->rcd->jkey;
175 p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
176 p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
177 p->timeout = qp->timeout;
178 p->urg = is_urg_masked(priv->rcd);
179}
180
181bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
182{
183 struct hfi1_qp_priv *priv = qp->priv;
184
185 *data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
186 return true;
187}
188
189bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
190{
191 struct hfi1_qp_priv *priv = qp->priv;
192 struct tid_rdma_params *remote, *old;
193 bool ret = true;
194
195 old = rcu_dereference_protected(priv->tid_rdma.remote,
196 lockdep_is_held(&priv->opfn.lock));
197 data &= ~0xfULL;
198 /*
199 * If data passed in is zero, return true so as not to continue the
200 * negotiation process
201 */
202 if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
203 goto null;
204 /*
205 * If kzalloc fails, return false. This will result in:
206 * * at the requester a new OPFN request being generated to retry
207 * the negotiation
208 * * at the responder, 0 being returned to the requester so as to
209 * disable TID RDMA at both the requester and the responder
210 */
211 remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
212 if (!remote) {
213 ret = false;
214 goto null;
215 }
216
217 tid_rdma_opfn_decode(remote, data);
218 priv->tid_timer_timeout_jiffies =
219 usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
220 1000UL) << 3) * 7);
221 trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
222 trace_hfi1_opfn_param(qp, 1, remote);
223 rcu_assign_pointer(priv->tid_rdma.remote, remote);
224 /*
225 * A TID RDMA READ request's segment size is not equal to
226 * remote->max_len only when the request's data length is smaller
227 * than remote->max_len. In that case, there will be only one segment.
228 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
229 * during retry, it will lead to req->cur_seg = 0, which is exactly
230 * what is expected.
231 */
232 priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
233 priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
234 goto free;
235null:
236 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
237 priv->timeout_shift = 0;
238free:
239 if (old)
240 kfree_rcu(old, rcu_head);
241 return ret;
242}
243
244bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
245{
246 bool ret;
247
248 ret = tid_rdma_conn_reply(qp, *data);
249 *data = 0;
250 /*
251 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
252 * TID RDMA could not be enabled. This will result in TID RDMA being
253 * disabled at the requester too.
254 */
255 if (ret)
256 (void)tid_rdma_conn_req(qp, data);
257 return ret;
258}
259
260void tid_rdma_conn_error(struct rvt_qp *qp)
261{
262 struct hfi1_qp_priv *priv = qp->priv;
263 struct tid_rdma_params *old;
264
265 old = rcu_dereference_protected(priv->tid_rdma.remote,
266 lockdep_is_held(&priv->opfn.lock));
267 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
268 if (old)
269 kfree_rcu(old, rcu_head);
270}
271
272/* This is called at context initialization time */
273int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
274{
275 if (reinit)
276 return 0;
277
278 BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
279 BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
280 rcd->jkey = TID_RDMA_JKEY;
281 hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
282 return hfi1_alloc_ctxt_rcv_groups(rcd);
283}
284
285/**
286 * qp_to_rcd - determine the receive context used by a qp
287 * @qp - the qp
288 *
289 * This routine returns the receive context associated
290 * with a qp's qpn.
291 *
292 * Returns the context.
293 */
294static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
295 struct rvt_qp *qp)
296{
297 struct hfi1_ibdev *verbs_dev = container_of(rdi,
298 struct hfi1_ibdev,
299 rdi);
300 struct hfi1_devdata *dd = container_of(verbs_dev,
301 struct hfi1_devdata,
302 verbs_dev);
303 unsigned int ctxt;
304
305 if (qp->ibqp.qp_num == 0)
306 ctxt = 0;
307 else
308 ctxt = ((qp->ibqp.qp_num >> dd->qos_shift) %
309 (dd->n_krcv_queues - 1)) + 1;
310
311 return dd->rcd[ctxt];
312}
313
314int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
315 struct ib_qp_init_attr *init_attr)
316{
317 struct hfi1_qp_priv *qpriv = qp->priv;
318 int i, ret;
319
320 qpriv->rcd = qp_to_rcd(rdi, qp);
321
322 spin_lock_init(&qpriv->opfn.lock);
323 INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
324 INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
325 qpriv->flow_state.psn = 0;
326 qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
327 qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
328 qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
329 qpriv->s_state = TID_OP(WRITE_RESP);
330 qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
331 qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
332 qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
333 qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
334 qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
335 qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
336 qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
337 qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
338 atomic_set(&qpriv->n_requests, 0);
339 atomic_set(&qpriv->n_tid_requests, 0);
340 timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
341 timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
342 INIT_LIST_HEAD(&qpriv->tid_wait);
343
344 if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
345 struct hfi1_devdata *dd = qpriv->rcd->dd;
346
347 qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
348 sizeof(*qpriv->pages),
349 GFP_KERNEL, dd->node);
350 if (!qpriv->pages)
351 return -ENOMEM;
352 for (i = 0; i < qp->s_size; i++) {
353 struct hfi1_swqe_priv *priv;
354 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
355
356 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
357 dd->node);
358 if (!priv)
359 return -ENOMEM;
360
361 hfi1_init_trdma_req(qp, &priv->tid_req);
362 priv->tid_req.e.swqe = wqe;
363 wqe->priv = priv;
364 }
365 for (i = 0; i < rvt_max_atomic(rdi); i++) {
366 struct hfi1_ack_priv *priv;
367
368 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
369 dd->node);
370 if (!priv)
371 return -ENOMEM;
372
373 hfi1_init_trdma_req(qp, &priv->tid_req);
374 priv->tid_req.e.ack = &qp->s_ack_queue[i];
375
376 ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
377 GFP_KERNEL);
378 if (ret) {
379 kfree(priv);
380 return ret;
381 }
382 qp->s_ack_queue[i].priv = priv;
383 }
384 }
385
386 return 0;
387}
388
389void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
390{
391 struct hfi1_qp_priv *qpriv = qp->priv;
392 struct rvt_swqe *wqe;
393 u32 i;
394
395 if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
396 for (i = 0; i < qp->s_size; i++) {
397 wqe = rvt_get_swqe_ptr(qp, i);
398 kfree(wqe->priv);
399 wqe->priv = NULL;
400 }
401 for (i = 0; i < rvt_max_atomic(rdi); i++) {
402 struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
403
404 if (priv)
405 hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
406 kfree(priv);
407 qp->s_ack_queue[i].priv = NULL;
408 }
409 cancel_work_sync(&qpriv->opfn.opfn_work);
410 kfree(qpriv->pages);
411 qpriv->pages = NULL;
412 }
413}
414
415/* Flow and tid waiter functions */
416/**
417 * DOC: lock ordering
418 *
419 * There are two locks involved with the queuing
420 * routines: the qp s_lock and the exp_lock.
421 *
422 * Since the tid space allocation is called from
423 * the send engine, the qp s_lock is already held.
424 *
425 * The allocation routines will get the exp_lock.
426 *
427 * The first_qp() call is provided to allow the head of
428 * the rcd wait queue to be fetched under the exp_lock and
429 * followed by a drop of the exp_lock.
430 *
431 * Any qp in the wait list will have the qp reference count held
432 * to hold the qp in memory.
433 */
434
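/*
 * Illustrative sketch only: the lock nesting the allocation paths below
 * follow.  "allocate resources" stands in for kern_reserve_flow() /
 * kern_alloc_tids(); the qp s_lock is already held by the send engine.
 */
static int __maybe_unused tid_alloc_pattern_sketch(struct hfi1_ctxtdata *rcd,
						   struct rvt_qp *qp)
	__must_hold(&qp->s_lock)
{
	unsigned long flags;

	spin_lock_irqsave(&rcd->exp_lock, flags);
	/* 1. bail to the wait queue if another qp already heads it */
	/* 2. allocate resources; on failure queue the qp and return -EAGAIN */
	/* 3. dequeue this qp and fetch the new head with first_qp() */
	spin_unlock_irqrestore(&rcd->exp_lock, flags);
	/* 4. wake the fetched head only after dropping the exp_lock */
	return 0;
}
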
435/*
436 * return head of rcd wait list
437 *
438 * Must hold the exp_lock.
439 *
440 * Get a reference to the QP to hold the QP in memory.
441 *
442 * The caller must release the reference when the returned
443 * qp is no longer being used.
444 */
445static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
446 struct tid_queue *queue)
447 __must_hold(&rcd->exp_lock)
448{
449 struct hfi1_qp_priv *priv;
450
451 lockdep_assert_held(&rcd->exp_lock);
452 priv = list_first_entry_or_null(&queue->queue_head,
453 struct hfi1_qp_priv,
454 tid_wait);
455 if (!priv)
456 return NULL;
457 rvt_get_qp(priv->owner);
458 return priv->owner;
459}
460
461/**
462 * kernel_tid_waiters - determine rcd wait
463 * @rcd: the receive context
464 * @qp: the head of the qp being processed
465 *
466 * This routine will return false IFF
467 * the list is empty or the head of the
468 * list is the indicated qp.
469 *
470 * Must hold the qp s_lock and the exp_lock.
471 *
472 * Return:
473 * false if either of the conditions below is satisfied:
474 * 1. The list is empty or
475 * 2. The indicated qp is at the head of the list and the
476 * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
477 * true is returned otherwise.
478 */
479static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
480 struct tid_queue *queue, struct rvt_qp *qp)
481 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
482{
483 struct rvt_qp *fqp;
484 bool ret = true;
485
486 lockdep_assert_held(&qp->s_lock);
487 lockdep_assert_held(&rcd->exp_lock);
488 fqp = first_qp(rcd, queue);
489 if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
490 ret = false;
491 rvt_put_qp(fqp);
492 return ret;
493}
494
495/**
496 * dequeue_tid_waiter - dequeue the qp from the list
497 * @qp - the qp to remove from the wait list
498 *
499 * This routine removes the indicated qp from the
500 * wait list if it is there.
501 *
502 * This should be done after the hardware flow and
503 * tid array resources have been allocated.
504 *
505 * Must hold the qp s_lock and the rcd exp_lock.
506 *
507 * It assumes the s_lock to protect the s_flags
508 * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
509 */
510static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
511 struct tid_queue *queue, struct rvt_qp *qp)
512 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
513{
514 struct hfi1_qp_priv *priv = qp->priv;
515
516 lockdep_assert_held(&qp->s_lock);
517 lockdep_assert_held(&rcd->exp_lock);
518 if (list_empty(&priv->tid_wait))
519 return;
520 list_del_init(&priv->tid_wait);
521 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
522 queue->dequeue++;
523 rvt_put_qp(qp);
524}
525
526/**
527 * queue_qp_for_tid_wait - suspend QP on tid space
528 * @rcd: the receive context
529 * @qp: the qp
530 *
531 * The qp is inserted at the tail of the rcd
532 * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
533 *
534 * Must hold the qp s_lock and the exp_lock.
535 */
536static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
537 struct tid_queue *queue, struct rvt_qp *qp)
538 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
539{
540 struct hfi1_qp_priv *priv = qp->priv;
541
542 lockdep_assert_held(&qp->s_lock);
543 lockdep_assert_held(&rcd->exp_lock);
544 if (list_empty(&priv->tid_wait)) {
545 qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
546 list_add_tail(&priv->tid_wait, &queue->queue_head);
547 priv->tid_enqueue = ++queue->enqueue;
548 rcd->dd->verbs_dev.n_tidwait++;
549 trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
550 rvt_get_qp(qp);
551 }
552}
553
554/**
555 * __trigger_tid_waiter - trigger tid waiter
556 * @qp: the qp
557 *
558 * This is a private entry point to schedule the qp
559 * assuming the caller is holding the qp->s_lock.
560 */
561static void __trigger_tid_waiter(struct rvt_qp *qp)
562 __must_hold(&qp->s_lock)
563{
564 lockdep_assert_held(&qp->s_lock);
565 if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
566 return;
567 trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
568 hfi1_schedule_send(qp);
569}
570
571/**
572 * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
573 * @qp - the qp
574 *
575 * trigger a schedule for a waiting qp in a deadlock
576 * safe manner. The qp reference is held prior
577 * to this call via first_qp().
578 *
579 * If the qp trigger was already scheduled (!rval)
580 * the reference is dropped, otherwise the resume
581 * or the destroy cancel will dispatch the reference.
582 */
583static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
584{
585 struct hfi1_qp_priv *priv;
586 struct hfi1_ibport *ibp;
587 struct hfi1_pportdata *ppd;
588 struct hfi1_devdata *dd;
589 bool rval;
590
591 if (!qp)
592 return;
593
594 priv = qp->priv;
595 ibp = to_iport(qp->ibqp.device, qp->port_num);
596 ppd = ppd_from_ibp(ibp);
597 dd = dd_from_ibdev(qp->ibqp.device);
598
599 rval = queue_work_on(priv->s_sde ?
600 priv->s_sde->cpu :
601 cpumask_first(cpumask_of_node(dd->node)),
602 ppd->hfi1_wq,
603 &priv->tid_rdma.trigger_work);
604 if (!rval)
605 rvt_put_qp(qp);
606}
607
608/**
609 * tid_rdma_trigger_resume - field a trigger work request
610 * @work - the work item
611 *
612 * Complete the off qp trigger processing by directly
613 * calling the progress routine.
614 */
615static void tid_rdma_trigger_resume(struct work_struct *work)
616{
617 struct tid_rdma_qp_params *tr;
618 struct hfi1_qp_priv *priv;
619 struct rvt_qp *qp;
620
621 tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
622 priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
623 qp = priv->owner;
624 spin_lock_irq(&qp->s_lock);
625 if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
626 spin_unlock_irq(&qp->s_lock);
627 hfi1_do_send(priv->owner, true);
628 } else {
629 spin_unlock_irq(&qp->s_lock);
630 }
631 rvt_put_qp(qp);
632}
633
634/**
635 * tid_rdma_flush_wait - unwind any tid space wait
636 *
637 * This is called when resetting a qp to
638 * allow a destroy or reset to get rid
639 * of any tid space linkage and reference counts.
640 */
641static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
642 __must_hold(&qp->s_lock)
643{
644 struct hfi1_qp_priv *priv;
645
646 if (!qp)
647 return;
648 lockdep_assert_held(&qp->s_lock);
649 priv = qp->priv;
650 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
651 spin_lock(&priv->rcd->exp_lock);
652 if (!list_empty(&priv->tid_wait)) {
653 list_del_init(&priv->tid_wait);
654 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
655 queue->dequeue++;
656 rvt_put_qp(qp);
657 }
658 spin_unlock(&priv->rcd->exp_lock);
659}
660
661void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
662 __must_hold(&qp->s_lock)
663{
664 struct hfi1_qp_priv *priv = qp->priv;
665
666 _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
667 _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
668}
669
670/* Flow functions */
671/**
672 * kern_reserve_flow - allocate a hardware flow
673 * @rcd - the context to use for allocation
674 * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
675 * signify "don't care".
676 *
677 * Use a bit mask based allocation to reserve a hardware
678 * flow for use in receiving KDETH data packets. If a preferred flow is
679 * specified the function will attempt to reserve that flow again, if
680 * available.
681 *
682 * The exp_lock must be held.
683 *
684 * Return:
685 * On success: a value between 0 and RXE_NUM_TID_FLOWS - 1
686 * On failure: -EAGAIN
687 */
688static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
689 __must_hold(&rcd->exp_lock)
690{
691 int nr;
692
693 /* Attempt to reserve the preferred flow index */
694 if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
695 !test_and_set_bit(last, &rcd->flow_mask))
696 return last;
697
698 nr = ffz(rcd->flow_mask);
699 BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
700 (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
701 if (nr > (RXE_NUM_TID_FLOWS - 1))
702 return -EAGAIN;
703 set_bit(nr, &rcd->flow_mask);
704 return nr;
705}
706
707static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
708 u32 flow_idx)
709{
710 u64 reg;
711
712 reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
713 RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
714 RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
715 RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
716 RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
717 RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
718
719 if (generation != KERN_GENERATION_RESERVED)
720 reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
721
722 write_uctxt_csr(rcd->dd, rcd->ctxt,
723 RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
724}
725
726static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
727 __must_hold(&rcd->exp_lock)
728{
729 u32 generation = rcd->flows[flow_idx].generation;
730
731 kern_set_hw_flow(rcd, generation, flow_idx);
732 return generation;
733}
734
735static u32 kern_flow_generation_next(u32 gen)
736{
737 u32 generation = mask_generation(gen + 1);
738
739 if (generation == KERN_GENERATION_RESERVED)
740 generation = mask_generation(generation + 1);
741 return generation;
742}
743
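/*
 * Illustrative sketch only: the generation counter wraps within
 * GENERATION_MASK and skips KERN_GENERATION_RESERVED, so the value after
 * GENERATION_MASK - 1 is 0 rather than the reserved 0xFFFFF.
 */
static void __maybe_unused kern_flow_generation_example(void)
{
	WARN_ON(kern_flow_generation_next(GENERATION_MASK - 1) != 0);
}
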
744static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
745 __must_hold(&rcd->exp_lock)
746{
747 rcd->flows[flow_idx].generation =
748 kern_flow_generation_next(rcd->flows[flow_idx].generation);
749 kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
750}
751
752int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
753{
754 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
755 struct tid_flow_state *fs = &qpriv->flow_state;
756 struct rvt_qp *fqp;
757 unsigned long flags;
758 int ret = 0;
759
760 /* The QP already has an allocated flow */
761 if (fs->index != RXE_NUM_TID_FLOWS)
762 return ret;
763
764 spin_lock_irqsave(&rcd->exp_lock, flags);
765 if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
766 goto queue;
767
768 ret = kern_reserve_flow(rcd, fs->last_index);
769 if (ret < 0)
770 goto queue;
771 fs->index = ret;
772 fs->last_index = fs->index;
773
774 /* Generation received in a RESYNC overrides default flow generation */
775 if (fs->generation != KERN_GENERATION_RESERVED)
776 rcd->flows[fs->index].generation = fs->generation;
777 fs->generation = kern_setup_hw_flow(rcd, fs->index);
778 fs->psn = 0;
779 fs->flags = 0;
780 dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
781 /* get head before dropping lock */
782 fqp = first_qp(rcd, &rcd->flow_queue);
783 spin_unlock_irqrestore(&rcd->exp_lock, flags);
784
785 tid_rdma_schedule_tid_wakeup(fqp);
786 return 0;
787queue:
788 queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
789 spin_unlock_irqrestore(&rcd->exp_lock, flags);
790 return -EAGAIN;
791}
792
793void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
794{
795 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
796 struct tid_flow_state *fs = &qpriv->flow_state;
797 struct rvt_qp *fqp;
798 unsigned long flags;
799
800 if (fs->index >= RXE_NUM_TID_FLOWS)
801 return;
802 spin_lock_irqsave(&rcd->exp_lock, flags);
803 kern_clear_hw_flow(rcd, fs->index);
804 clear_bit(fs->index, &rcd->flow_mask);
805 fs->index = RXE_NUM_TID_FLOWS;
806 fs->psn = 0;
807 fs->generation = KERN_GENERATION_RESERVED;
808
809 /* get head before dropping lock */
810 fqp = first_qp(rcd, &rcd->flow_queue);
811 spin_unlock_irqrestore(&rcd->exp_lock, flags);
812
813 if (fqp == qp) {
814 __trigger_tid_waiter(fqp);
815 rvt_put_qp(fqp);
816 } else {
817 tid_rdma_schedule_tid_wakeup(fqp);
818 }
819}
820
821void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
822{
823 int i;
824
825 for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
826 rcd->flows[i].generation = mask_generation(prandom_u32());
827 kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
828 }
829}
830
831/* TID allocation functions */
832static u8 trdma_pset_order(struct tid_rdma_pageset *s)
833{
834 u8 count = s->count;
835
836 return ilog2(count) + 1;
837}
838
839/**
840 * tid_rdma_find_phys_blocks_4k - get groups based on mr info
841 * @npages - number of pages
842 * @pages - pointer to an array of page structs
843 * @list - page set array to return
844 *
845 * This routine returns the number of groups associated with
846 * the current sge information. This implementation is based
847 * on the expected receive find_phys_blocks() adjusted to
848 * use the MR information vs. the pfn.
849 *
850 * Return:
851 * the number of RcvArray entries
852 */
853static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
854 struct page **pages,
855 u32 npages,
856 struct tid_rdma_pageset *list)
857{
858 u32 pagecount, pageidx, setcount = 0, i;
859 void *vaddr, *this_vaddr;
860
861 if (!npages)
862 return 0;
863
864 /*
865 * Look for sets of physically contiguous pages in the user buffer.
866 * This will allow us to optimize Expected RcvArray entry usage by
867 * using the bigger supported sizes.
868 */
869 vaddr = page_address(pages[0]);
870 trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
871 for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
872 this_vaddr = i < npages ? page_address(pages[i]) : NULL;
873 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
874 this_vaddr);
875 /*
876 * If the vaddr's are not sequential, pages are not physically
877 * contiguous.
878 */
879 if (this_vaddr != (vaddr + PAGE_SIZE)) {
880 /*
881 * At this point we have to loop over the set of
882 * physically contiguous pages and break them down into
883 * sizes supported by the HW.
884 * There are two main constraints:
885 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
886 * If the total set size is bigger than that
887 * program only a MAX_EXPECTED_BUFFER chunk.
888 * 2. The buffer size has to be a power of two. If
889 * it is not, round down to the closest power of
890 * 2 and program that size.
891 */
892 while (pagecount) {
893 int maxpages = pagecount;
894 u32 bufsize = pagecount * PAGE_SIZE;
895
896 if (bufsize > MAX_EXPECTED_BUFFER)
897 maxpages =
898 MAX_EXPECTED_BUFFER >>
899 PAGE_SHIFT;
900 else if (!is_power_of_2(bufsize))
901 maxpages =
902 rounddown_pow_of_two(bufsize) >>
903 PAGE_SHIFT;
904
905 list[setcount].idx = pageidx;
906 list[setcount].count = maxpages;
907 trace_hfi1_tid_pageset(flow->req->qp, setcount,
908 list[setcount].idx,
909 list[setcount].count);
910 pagecount -= maxpages;
911 pageidx += maxpages;
912 setcount++;
913 }
914 pageidx = i;
915 pagecount = 1;
916 vaddr = this_vaddr;
917 } else {
918 vaddr += PAGE_SIZE;
919 pagecount++;
920 }
921 }
922 /* ensure we always return an even number of sets */
923 if (setcount & 1)
924 list[setcount++].count = 0;
925 return setcount;
926}
927
928/**
929 * tid_flush_pages - dump out pages into pagesets
930 * @list - list of pagesets
931 * @idx - pointer to current page index
932 * @pages - number of pages to dump
933 * @sets - current number of pagesets
934 *
935 * This routine flushes out accumulated pages.
936 *
937 * To ensure an even number of sets the
938 * code may add a filler.
939 *
940 * This can happen when pages is not
941 * a power of 2, or when pages is a power of 2
942 * less than the maximum number of pages.
943 *
944 * Return:
945 * The new number of sets
946 */
947
948static u32 tid_flush_pages(struct tid_rdma_pageset *list,
949 u32 *idx, u32 pages, u32 sets)
950{
951 while (pages) {
952 u32 maxpages = pages;
953
954 if (maxpages > MAX_EXPECTED_PAGES)
955 maxpages = MAX_EXPECTED_PAGES;
956 else if (!is_power_of_2(maxpages))
957 maxpages = rounddown_pow_of_two(maxpages);
958 list[sets].idx = *idx;
959 list[sets++].count = maxpages;
960 *idx += maxpages;
961 pages -= maxpages;
962 }
963 /* might need a filler */
964 if (sets & 1)
965 list[sets++].count = 0;
966 return sets;
967}
968
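/*
 * Illustrative sketch only: flushing 13 accumulated pages starting at index 0
 * produces power-of-two chunks of 8, 4 and 1 pages plus one zero-count filler
 * so the set count stays even (assuming MAX_EXPECTED_PAGES is at least 8).
 * The caller would supply a hypothetical @list with room for four entries.
 */
static void __maybe_unused tid_flush_pages_example(struct tid_rdma_pageset *list)
{
	u32 idx = 0, sets;

	sets = tid_flush_pages(list, &idx, 13, 0);
	WARN_ON(sets != 4 || idx != 13);	/* {0,8} {8,4} {12,1} {.,0} */
}
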
969/**
970 * tid_rdma_find_phys_blocks_8k - get groups based on mr info
971 * @pages - pointer to an array of page structs
972 * @npages - number of pages
973 * @list - page set array to return
974 *
975 * This routine parses an array of pages to compute pagesets
976 * in an 8k compatible way.
977 *
978 * Pages are tested two at a time: i and i + 1 for contiguity,
979 * and then i - 1 and i.
980 *
981 * If either check fails, any accumulated pages are flushed and
982 * v0,v1 are emitted as separate PAGE_SIZE pagesets
983 *
984 * Otherwise, the current 8k is totaled for a future flush.
985 *
986 * Return:
987 * The number of pagesets
988 * list set with the returned number of pagesets
989 *
990 */
991static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
992 struct page **pages,
993 u32 npages,
994 struct tid_rdma_pageset *list)
995{
996 u32 idx, sets = 0, i;
997 u32 pagecnt = 0;
998 void *v0, *v1, *vm1;
999
1000 if (!npages)
1001 return 0;
1002 for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
1003 /* get a new v0 */
1004 v0 = page_address(pages[i]);
1005 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
1006 v1 = i + 1 < npages ?
1007 page_address(pages[i + 1]) : NULL;
1008 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
1009 /* compare i, i + 1 vaddr */
1010 if (v1 != (v0 + PAGE_SIZE)) {
1011 /* flush out pages */
1012 sets = tid_flush_pages(list, &idx, pagecnt, sets);
1013 /* output v0,v1 as two pagesets */
1014 list[sets].idx = idx++;
1015 list[sets++].count = 1;
1016 if (v1) {
1017 list[sets].count = 1;
1018 list[sets++].idx = idx++;
1019 } else {
1020 list[sets++].count = 0;
1021 }
1022 vm1 = NULL;
1023 pagecnt = 0;
1024 continue;
1025 }
1026 /* i,i+1 consecutive, look at i-1,i */
1027 if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
1028 /* flush out pages */
1029 sets = tid_flush_pages(list, &idx, pagecnt, sets);
1030 pagecnt = 0;
1031 }
1032 /* pages will always be a multiple of 8k */
1033 pagecnt += 2;
1034 /* save i-1 */
1035 vm1 = v1;
1036 /* move to next pair */
1037 }
1038 /* dump residual pages at end */
1039 sets = tid_flush_pages(list, &idx, npages - idx, sets);
1040 /* by design cannot be odd sets */
1041 WARN_ON(sets & 1);
1042 return sets;
1043}
1044
1045/**
1046 * Find pages for one segment of a sge array represented by @ss. The function
1047 * does not check the sge, the sge must have been checked for alignment with a
1048 * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
1049 * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
1050 * copy maintained in @ss->sge, the original sge is not modified.
1051 *
1052 * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
1053 * releasing the MR reference count at the same time. Otherwise, we'll "leak"
1054 * references to the MR. This difference requires that we keep track of progress
1055 * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
1056 * structure.
1057 */
1058static u32 kern_find_pages(struct tid_rdma_flow *flow,
1059 struct page **pages,
1060 struct rvt_sge_state *ss, bool *last)
1061{
1062 struct tid_rdma_request *req = flow->req;
1063 struct rvt_sge *sge = &ss->sge;
1064 u32 length = flow->req->seg_len;
1065 u32 len = PAGE_SIZE;
1066 u32 i = 0;
1067
1068 while (length && req->isge < ss->num_sge) {
1069 pages[i++] = virt_to_page(sge->vaddr);
1070
1071 sge->vaddr += len;
1072 sge->length -= len;
1073 sge->sge_length -= len;
1074 if (!sge->sge_length) {
1075 if (++req->isge < ss->num_sge)
1076 *sge = ss->sg_list[req->isge - 1];
1077 } else if (sge->length == 0 && sge->mr->lkey) {
1078 if (++sge->n >= RVT_SEGSZ) {
1079 ++sge->m;
1080 sge->n = 0;
1081 }
1082 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
1083 sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
1084 }
1085 length -= len;
1086 }
1087
1088 flow->length = flow->req->seg_len - length;
1089 *last = req->isge == ss->num_sge ? false : true;
1090 return i;
1091}
1092
1093static void dma_unmap_flow(struct tid_rdma_flow *flow)
1094{
1095 struct hfi1_devdata *dd;
1096 int i;
1097 struct tid_rdma_pageset *pset;
1098
1099 dd = flow->req->rcd->dd;
1100 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1101 i++, pset++) {
1102 if (pset->count && pset->addr) {
1103 dma_unmap_page(&dd->pcidev->dev,
1104 pset->addr,
1105 PAGE_SIZE * pset->count,
1106 DMA_FROM_DEVICE);
1107 pset->mapped = 0;
1108 }
1109 }
1110}
1111
1112static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
1113{
1114 int i;
1115 struct hfi1_devdata *dd = flow->req->rcd->dd;
1116 struct tid_rdma_pageset *pset;
1117
1118 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1119 i++, pset++) {
1120 if (pset->count) {
1121 pset->addr = dma_map_page(&dd->pcidev->dev,
1122 pages[pset->idx],
1123 0,
1124 PAGE_SIZE * pset->count,
1125 DMA_FROM_DEVICE);
1126
1127 if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
1128 dma_unmap_flow(flow);
1129 return -ENOMEM;
1130 }
1131 pset->mapped = 1;
1132 }
1133 }
1134 return 0;
1135}
1136
1137static inline bool dma_mapped(struct tid_rdma_flow *flow)
1138{
1139 return !!flow->pagesets[0].mapped;
1140}
1141
1142/*
1143 * Get page pointers and identify contiguous physical memory chunks for a
1144 * segment. All segments are of length flow->req->seg_len.
1145 */
1146static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
1147 struct page **pages,
1148 struct rvt_sge_state *ss, bool *last)
1149{
1150 u8 npages;
1151
1152 /* Reuse previously computed pagesets, if any */
1153 if (flow->npagesets) {
1154 trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
1155 flow);
1156 if (!dma_mapped(flow))
1157 return dma_map_flow(flow, pages);
1158 return 0;
1159 }
1160
1161 npages = kern_find_pages(flow, pages, ss, last);
1162
1163 if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
1164 flow->npagesets =
1165 tid_rdma_find_phys_blocks_4k(flow, pages, npages,
1166 flow->pagesets);
1167 else
1168 flow->npagesets =
1169 tid_rdma_find_phys_blocks_8k(flow, pages, npages,
1170 flow->pagesets);
1171
1172 return dma_map_flow(flow, pages);
1173}
1174
1175static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
1176 struct hfi1_ctxtdata *rcd, char *s,
1177 struct tid_group *grp, u8 cnt)
1178{
1179 struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
1180
1181 WARN_ON_ONCE(flow->tnode_cnt >=
1182 (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
1183 if (WARN_ON_ONCE(cnt & 1))
1184 dd_dev_err(rcd->dd,
1185 "unexpected odd allocation cnt %u map 0x%x used %u",
1186 cnt, grp->map, grp->used);
1187
1188 node->grp = grp;
1189 node->map = grp->map;
1190 node->cnt = cnt;
1191 trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
1192 grp->base, grp->map, grp->used, cnt);
1193}
1194
1195/*
1196 * Try to allocate pageset_count TID's from TID groups for a context
1197 *
1198 * This function allocates TID's without moving groups between lists or
1199 * modifying grp->map. This is done as follows, being cognizant of the lists
1200 * between which the TID groups will move:
1201 * 1. First allocate complete groups of 8 TID's since this is more efficient,
1202 * these groups will move from group->full without affecting used
1203 * 2. If more TID's are needed allocate from used (will move from used->full or
1204 * stay in used)
1205 * 3. If we still don't have the required number of TID's go back and look again
1206 * at a complete group (will move from group->used)
1207 */
1208static int kern_alloc_tids(struct tid_rdma_flow *flow)
1209{
1210 struct hfi1_ctxtdata *rcd = flow->req->rcd;
1211 struct hfi1_devdata *dd = rcd->dd;
1212 u32 ngroups, pageidx = 0;
1213 struct tid_group *group = NULL, *used;
1214 u8 use;
1215
1216 flow->tnode_cnt = 0;
1217 ngroups = flow->npagesets / dd->rcv_entries.group_size;
1218 if (!ngroups)
1219 goto used_list;
1220
1221 /* First look at complete groups */
1222 list_for_each_entry(group, &rcd->tid_group_list.list, list) {
1223 kern_add_tid_node(flow, rcd, "complete groups", group,
1224 group->size);
1225
1226 pageidx += group->size;
1227 if (!--ngroups)
1228 break;
1229 }
1230
1231 if (pageidx >= flow->npagesets)
1232 goto ok;
1233
1234used_list:
1235 /* Now look at partially used groups */
1236 list_for_each_entry(used, &rcd->tid_used_list.list, list) {
1237 use = min_t(u32, flow->npagesets - pageidx,
1238 used->size - used->used);
1239 kern_add_tid_node(flow, rcd, "used groups", used, use);
1240
1241 pageidx += use;
1242 if (pageidx >= flow->npagesets)
1243 goto ok;
1244 }
1245
1246 /*
1247 * Look again at a complete group, continuing from where we left.
1248 * However, if we are at the head, we have reached the end of the
1249 * complete groups list from the first loop above
1250 */
1251 if (group && &group->list == &rcd->tid_group_list.list)
1252 goto bail_eagain;
1253 group = list_prepare_entry(group, &rcd->tid_group_list.list,
1254 list);
1255 if (list_is_last(&group->list, &rcd->tid_group_list.list))
1256 goto bail_eagain;
1257 group = list_next_entry(group, list);
1258 use = min_t(u32, flow->npagesets - pageidx, group->size);
1259 kern_add_tid_node(flow, rcd, "complete continue", group, use);
1260 pageidx += use;
1261 if (pageidx >= flow->npagesets)
1262 goto ok;
1263bail_eagain:
1264 trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
1265 (u64)flow->npagesets);
1266 return -EAGAIN;
1267ok:
1268 return 0;
1269}
1270
1271static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
1272 u32 *pset_idx)
1273{
1274 struct hfi1_ctxtdata *rcd = flow->req->rcd;
1275 struct hfi1_devdata *dd = rcd->dd;
1276 struct kern_tid_node *node = &flow->tnode[grp_num];
1277 struct tid_group *grp = node->grp;
1278 struct tid_rdma_pageset *pset;
1279 u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
1280 u32 rcventry, npages = 0, pair = 0, tidctrl;
1281 u8 i, cnt = 0;
1282
1283 for (i = 0; i < grp->size; i++) {
1284 rcventry = grp->base + i;
1285
1286 if (node->map & BIT(i) || cnt >= node->cnt) {
1287 rcv_array_wc_fill(dd, rcventry);
1288 continue;
1289 }
1290 pset = &flow->pagesets[(*pset_idx)++];
1291 if (pset->count) {
1292 hfi1_put_tid(dd, rcventry, PT_EXPECTED,
1293 pset->addr, trdma_pset_order(pset));
1294 } else {
1295 hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
1296 }
1297 npages += pset->count;
1298
1299 rcventry -= rcd->expected_base;
1300 tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
1301 /*
1302 * A single TID entry will be used to cover a rcvarray pair (with
1303 * tidctrl 0x3) if ALL of these are true: (a) the bit pos is even,
1304 * (b) the group map shows the current and the next bits as free,
1305 * indicating two consecutive rcvarray entries are available, and (c)
1306 * we actually need 2 more entries
1307 */
1308 pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
1309 node->cnt >= cnt + 2;
1310 if (!pair) {
1311 if (!pset->count)
1312 tidctrl = 0x1;
1313 flow->tid_entry[flow->tidcnt++] =
1314 EXP_TID_SET(IDX, rcventry >> 1) |
1315 EXP_TID_SET(CTRL, tidctrl) |
1316 EXP_TID_SET(LEN, npages);
1317 trace_hfi1_tid_entry_alloc(/* entry */
1318 flow->req->qp, flow->tidcnt - 1,
1319 flow->tid_entry[flow->tidcnt - 1]);
1320
1321 /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
1322 flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
1323 npages = 0;
1324 }
1325
1326 if (grp->used == grp->size - 1)
1327 tid_group_move(grp, &rcd->tid_used_list,
1328 &rcd->tid_full_list);
1329 else if (!grp->used)
1330 tid_group_move(grp, &rcd->tid_group_list,
1331 &rcd->tid_used_list);
1332
1333 grp->used++;
1334 grp->map |= BIT(i);
1335 cnt++;
1336 }
1337}
1338
1339static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
1340{
1341 struct hfi1_ctxtdata *rcd = flow->req->rcd;
1342 struct hfi1_devdata *dd = rcd->dd;
1343 struct kern_tid_node *node = &flow->tnode[grp_num];
1344 struct tid_group *grp = node->grp;
1345 u32 rcventry;
1346 u8 i, cnt = 0;
1347
1348 for (i = 0; i < grp->size; i++) {
1349 rcventry = grp->base + i;
1350
1351 if (node->map & BIT(i) || cnt >= node->cnt) {
1352 rcv_array_wc_fill(dd, rcventry);
1353 continue;
1354 }
1355
1356 hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
1357
1358 grp->used--;
1359 grp->map &= ~BIT(i);
1360 cnt++;
1361
1362 if (grp->used == grp->size - 1)
1363 tid_group_move(grp, &rcd->tid_full_list,
1364 &rcd->tid_used_list);
1365 else if (!grp->used)
1366 tid_group_move(grp, &rcd->tid_used_list,
1367 &rcd->tid_group_list);
1368 }
1369 if (WARN_ON_ONCE(cnt & 1)) {
1370 struct hfi1_ctxtdata *rcd = flow->req->rcd;
1371 struct hfi1_devdata *dd = rcd->dd;
1372
1373 dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
1374 cnt, grp->map, grp->used);
1375 }
1376}
1377
1378static void kern_program_rcvarray(struct tid_rdma_flow *flow)
1379{
1380 u32 pset_idx = 0;
1381 int i;
1382
1383 flow->npkts = 0;
1384 flow->tidcnt = 0;
1385 for (i = 0; i < flow->tnode_cnt; i++)
1386 kern_program_rcv_group(flow, i, &pset_idx);
1387 trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
1388}
1389
1390/**
1391 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
1392 * TID RDMA request
1393 *
1394 * @req: TID RDMA request for which the segment/flow is being set up
1395 * @ss: sge state, maintains state across successive segments of a sge
1396 * @last: set to true after the last sge segment has been processed
1397 *
1398 * This function
1399 * (1) finds a free flow entry in the flow circular buffer
1400 * (2) finds pages and contiguous physical chunks constituting one segment
1401 * of an sge
1402 * (3) allocates TID group entries for those chunks
1403 * (4) programs rcvarray entries in the hardware corresponding to those
1404 * TID's
1405 * (5) computes a tidarray with formatted TID entries which can be sent
1406 * to the sender
1407 * (6) Reserves and programs HW flows.
1408 * (7) It also manages queuing the QP when TID/flow resources are not
1409 * available.
1410 *
1411 * @req points to struct tid_rdma_request of which the segments are a part. The
1412 * function uses qp, rcd and seg_len members of @req. In the absence of errors,
1413 * req->flow_idx is the index of the flow which has been prepared in this
1414 * invocation of function call. With flow = &req->flows[req->flow_idx],
1415 * flow->tid_entry contains the TID array which the sender can use for TID RDMA
1416 * sends and flow->npkts contains number of packets required to send the
1417 * segment.
1418 *
1419 * hfi1_check_sge_align should be called prior to calling this function and if
1420 * it signals error TID RDMA cannot be used for this sge and this function
1421 * should not be called.
1422 *
1423 * For the queuing, caller must hold the flow->req->qp s_lock from the send
1424 * engine and the function will procure the exp_lock.
1425 *
1426 * Return:
1427 * The function returns -EAGAIN if sufficient number of TID/flow resources to
1428 * map the segment could not be allocated. In this case the function should be
1429 * called again with previous arguments to retry the TID allocation. There are
1430 * no other error returns. The function returns 0 on success.
1431 */
1432int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
1433 struct rvt_sge_state *ss, bool *last)
1434 __must_hold(&req->qp->s_lock)
1435{
1436 struct tid_rdma_flow *flow = &req->flows[req->setup_head];
1437 struct hfi1_ctxtdata *rcd = req->rcd;
1438 struct hfi1_qp_priv *qpriv = req->qp->priv;
1439 unsigned long flags;
1440 struct rvt_qp *fqp;
1441 u16 clear_tail = req->clear_tail;
1442
1443 lockdep_assert_held(&req->qp->s_lock);
1444 /*
1445 * We return error if either (a) we don't have space in the flow
1446 * circular buffer, or (b) we already have max entries in the buffer.
1447 * Max entries depend on the type of request we are processing and the
1448 * negotiated TID RDMA parameters.
1449 */
1450 if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
1451 CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
1452 req->n_flows)
1453 return -EINVAL;
1454
1455 /*
1456 * Get pages, identify contiguous physical memory chunks for the segment.
1457 * If we cannot determine a DMA address mapping we will treat it just
1458 * as if we ran out of space above.
1459 */
1460 if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
1461 hfi1_wait_kmem(flow->req->qp);
1462 return -ENOMEM;
1463 }
1464
1465 spin_lock_irqsave(&rcd->exp_lock, flags);
1466 if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
1467 goto queue;
1468
1469 /*
1470 * At this point we know the number of pagesets and hence the number of
1471 * TID's to map the segment. Allocate the TID's from the TID groups. If
1472 * we cannot allocate the required number we exit and try again later
1473 */
1474 if (kern_alloc_tids(flow))
1475 goto queue;
1476 /*
1477 * Finally program the TID entries with the pagesets, compute the
1478 * tidarray and enable the HW flow
1479 */
1480 kern_program_rcvarray(flow);
1481
1482 /*
1483 * Setup the flow state with relevant information.
1484 * This information is used for tracking the sequence of data packets
1485 * for the segment.
1486 * The flow is setup here as this is the most accurate time and place
1487 * to do so. Doing it at a later time runs the risk of the flow data in
1488 * qpriv getting out of sync.
1489 */
1490 memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
1491 flow->idx = qpriv->flow_state.index;
1492 flow->flow_state.generation = qpriv->flow_state.generation;
1493 flow->flow_state.spsn = qpriv->flow_state.psn;
1494 flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
1495 flow->flow_state.r_next_psn =
1496 full_flow_psn(flow, flow->flow_state.spsn);
1497 qpriv->flow_state.psn += flow->npkts;
1498
1499 dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
1500 /* get head before dropping lock */
1501 fqp = first_qp(rcd, &rcd->rarr_queue);
1502 spin_unlock_irqrestore(&rcd->exp_lock, flags);
1503 tid_rdma_schedule_tid_wakeup(fqp);
1504
1505 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
1506 return 0;
1507queue:
1508 queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
1509 spin_unlock_irqrestore(&rcd->exp_lock, flags);
1510 return -EAGAIN;
1511}
1512
1513static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
1514{
1515 flow->npagesets = 0;
1516}
1517
1518/*
1519 * This function is called after one segment has been successfully sent to
1520 * release the flow and TID HW/SW resources for that segment. The segments for a
1521 * TID RDMA request are setup and cleared in FIFO order which is managed using a
1522 * circular buffer.
1523 */
1524int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
1525 __must_hold(&req->qp->s_lock)
1526{
1527 struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
1528 struct hfi1_ctxtdata *rcd = req->rcd;
1529 unsigned long flags;
1530 int i;
1531 struct rvt_qp *fqp;
1532
1533 lockdep_assert_held(&req->qp->s_lock);
1534 /* Exit if we have nothing in the flow circular buffer */
1535 if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
1536 return -EINVAL;
1537
1538 spin_lock_irqsave(&rcd->exp_lock, flags);
1539
1540 for (i = 0; i < flow->tnode_cnt; i++)
1541 kern_unprogram_rcv_group(flow, i);
1542 /* To prevent double unprogramming */
1543 flow->tnode_cnt = 0;
1544 /* get head before dropping lock */
1545 fqp = first_qp(rcd, &rcd->rarr_queue);
1546 spin_unlock_irqrestore(&rcd->exp_lock, flags);
1547
1548 dma_unmap_flow(flow);
1549
1550 hfi1_tid_rdma_reset_flow(flow);
1551 req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
1552
1553 if (fqp == req->qp) {
1554 __trigger_tid_waiter(fqp);
1555 rvt_put_qp(fqp);
1556 } else {
1557 tid_rdma_schedule_tid_wakeup(fqp);
1558 }
1559
1560 return 0;
1561}
1562
1563/*
1564 * This function is called to release all the tid entries for
1565 * a request.
1566 */
1567void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
1568 __must_hold(&req->qp->s_lock)
1569{
1570 /* Use memory barrier for proper ordering */
1571 while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
1572 if (hfi1_kern_exp_rcv_clear(req))
1573 break;
1574 }
1575}
1576
1577/**
1578 * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
1579 * @req - the tid rdma request to be cleaned
1580 */
1581static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
1582{
1583 kfree(req->flows);
1584 req->flows = NULL;
1585}
1586
1587/**
1588 * __trdma_clean_swqe - clean up for large sized QPs
1589 * @qp: the queue pair
1590 * @wqe: the send wqe
1591 */
1592void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
1593{
1594 struct hfi1_swqe_priv *p = wqe->priv;
1595
1596 hfi1_kern_exp_rcv_free_flows(&p->tid_req);
1597}
1598
1599/*
1600 * This can be called at QP create time or in the data path.
1601 */
1602static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
1603 gfp_t gfp)
1604{
1605 struct tid_rdma_flow *flows;
1606 int i;
1607
1608 if (likely(req->flows))
1609 return 0;
1610 flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
1611 req->rcd->numa_id);
1612 if (!flows)
1613 return -ENOMEM;
1614 /* mini init */
1615 for (i = 0; i < MAX_FLOWS; i++) {
1616 flows[i].req = req;
1617 flows[i].npagesets = 0;
1618 flows[i].pagesets[0].mapped = 0;
1619 }
1620 req->flows = flows;
1621 return 0;
1622}
1623
1624static void hfi1_init_trdma_req(struct rvt_qp *qp,
1625 struct tid_rdma_request *req)
1626{
1627 struct hfi1_qp_priv *qpriv = qp->priv;
1628
1629 /*
1630 * Initialize various TID RDMA request variables.
1631 * These variables are "static", which is why they
1632 * can be pre-initialized here before the WRs have
1633 * even been submitted.
1634 * However, non-NULL values for these variables do not
1635 * imply that this WQE has been enabled for TID RDMA.
1636 * Drivers should check the WQE's opcode to determine
1637 * if a request is a TID RDMA one or not.
1638 */
1639 req->qp = qp;
1640 req->rcd = qpriv->rcd;
1641}
1642
1643u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
1644 void *context, int vl, int mode, u64 data)
1645{
1646 struct hfi1_devdata *dd = context;
1647
1648 return dd->verbs_dev.n_tidwait;
1649}
1650
1651static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
1652 u32 psn, u16 *fidx)
1653{
1654 u16 head, tail;
1655 struct tid_rdma_flow *flow;
1656
1657 head = req->setup_head;
1658 tail = req->clear_tail;
1659 for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1660 tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1661 flow = &req->flows[tail];
1662 if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
1663 cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
1664 if (fidx)
1665 *fidx = tail;
1666 return flow;
1667 }
1668 }
1669 return NULL;
1670}
1671
1672static struct tid_rdma_flow *
1673__find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail,
1674 u32 psn, u16 *fidx)
1675{
1676 for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1677 tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1678 struct tid_rdma_flow *flow = &req->flows[tail];
1679 u32 spsn, lpsn;
1680
1681 spsn = full_flow_psn(flow, flow->flow_state.spsn);
1682 lpsn = full_flow_psn(flow, flow->flow_state.lpsn);
1683
1684 if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) {
1685 if (fidx)
1686 *fidx = tail;
1687 return flow;
1688 }
1689 }
1690 return NULL;
1691}
1692
1693static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req,
1694 u32 psn, u16 *fidx)
1695{
1696 return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn,
1697 fidx);
1698}
1699
1700/* TID RDMA READ functions */
1701u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
1702 struct ib_other_headers *ohdr, u32 *bth1,
1703 u32 *bth2, u32 *len)
1704{
1705 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1706 struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
1707 struct rvt_qp *qp = req->qp;
1708 struct hfi1_qp_priv *qpriv = qp->priv;
1709 struct hfi1_swqe_priv *wpriv = wqe->priv;
1710 struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
1711 struct tid_rdma_params *remote;
1712 u32 req_len = 0;
1713 void *req_addr = NULL;
1714
1715 /* This is the IB psn used to send the request */
1716 *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
1717 trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
1718
1719 /* TID Entries for TID RDMA READ payload */
1720 req_addr = &flow->tid_entry[flow->tid_idx];
1721 req_len = sizeof(*flow->tid_entry) *
1722 (flow->tidcnt - flow->tid_idx);
1723
1724 memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
1725 wpriv->ss.sge.vaddr = req_addr;
1726 wpriv->ss.sge.sge_length = req_len;
1727 wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
1728 /*
1729 * We can safely zero these out. Since the first SGE covers the
1730 * entire packet, nothing else should even look at the MR.
1731 */
1732 wpriv->ss.sge.mr = NULL;
1733 wpriv->ss.sge.m = 0;
1734 wpriv->ss.sge.n = 0;
1735
1736 wpriv->ss.sg_list = NULL;
1737 wpriv->ss.total_len = wpriv->ss.sge.sge_length;
1738 wpriv->ss.num_sge = 1;
1739
1740 /* Construct the TID RDMA READ REQ packet header */
1741 rcu_read_lock();
1742 remote = rcu_dereference(qpriv->tid_rdma.remote);
1743
1744 KDETH_RESET(rreq->kdeth0, KVER, 0x1);
1745 KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
1746 rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
1747 req->cur_seg * req->seg_len + flow->sent);
1748 rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
1749 rreq->reth.length = cpu_to_be32(*len);
1750 rreq->tid_flow_psn =
1751 cpu_to_be32((flow->flow_state.generation <<
1752 HFI1_KDETH_BTH_SEQ_SHIFT) |
1753 ((flow->flow_state.spsn + flow->pkt) &
1754 HFI1_KDETH_BTH_SEQ_MASK));
1755 rreq->tid_flow_qp =
1756 cpu_to_be32(qpriv->tid_rdma.local.qp |
1757 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
1758 TID_RDMA_DESTQP_FLOW_SHIFT) |
1759 qpriv->rcd->ctxt);
1760 rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
1761 *bth1 &= ~RVT_QPN_MASK;
1762 *bth1 |= remote->qp;
1763 *bth2 |= IB_BTH_REQ_ACK;
1764 rcu_read_unlock();
1765
1766 /* We are done with this segment */
1767 flow->sent += *len;
1768 req->cur_seg++;
1769 qp->s_state = TID_OP(READ_REQ);
1770 req->ack_pending++;
1771 req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
1772 qpriv->pending_tid_r_segs++;
1773 qp->s_num_rd_atomic++;
1774
1775 /* Set the TID RDMA READ request payload size */
1776 *len = req_len;
1777
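	/* Header size is returned in units of 32-bit words (hdwords). */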
1778 return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
1779}
1780
1781/*
1782 * @len: contains the data length to read upon entry and the read request
1783 * payload length upon exit.
1784 */
1785u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
1786 struct ib_other_headers *ohdr, u32 *bth1,
1787 u32 *bth2, u32 *len)
1788 __must_hold(&qp->s_lock)
1789{
1790 struct hfi1_qp_priv *qpriv = qp->priv;
1791 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1792 struct tid_rdma_flow *flow = NULL;
1793 u32 hdwords = 0;
1794 bool last;
1795 bool retry = true;
1796 u32 npkts = rvt_div_round_up_mtu(qp, *len);
1797
3ce5daa2
KW
1798 trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
1799 wqe->lpsn, req);
742a3826
KW
1800 /*
1801 * Check sync conditions. Make sure that there are no pending
1802 * segments before freeing the flow.
1803 */
1804sync_check:
1805 if (req->state == TID_REQUEST_SYNC) {
1806 if (qpriv->pending_tid_r_segs)
1807 goto done;
1808
1809 hfi1_kern_clear_hw_flow(req->rcd, qp);
1810 req->state = TID_REQUEST_ACTIVE;
1811 }
1812
1813 /*
1814 * If the request for this segment is resent, the tid resources should
1815 * have been allocated before. In this case, req->flow_idx should
1816 * fall behind req->setup_head.
1817 */
1818 if (req->flow_idx == req->setup_head) {
1819 retry = false;
1820 if (req->state == TID_REQUEST_RESEND) {
1821 /*
1822 * This is the first new segment for a request whose
1823 * earlier segments have been re-sent. We need to
1824 * set up the sge pointer correctly.
1825 */
1826 restart_sge(&qp->s_sge, wqe, req->s_next_psn,
1827 qp->pmtu);
1828 req->isge = 0;
1829 req->state = TID_REQUEST_ACTIVE;
1830 }
1831
1832 /*
1833 * Check sync. The last PSN of each generation is reserved for
1834 * RESYNC.
1835 */
1836 if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
1837 req->state = TID_REQUEST_SYNC;
1838 goto sync_check;
1839 }
1840
1841 /* Allocate the flow if not yet */
1842 if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
1843 goto done;
1844
1845 /*
1846 * The following call will advance req->setup_head after
1847 * allocating the tid entries.
1848 */
1849 if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
1850 req->state = TID_REQUEST_QUEUED;
1851
1852 /*
1853 * We don't have resources for this segment. The QP has
1854 * already been queued.
1855 */
1856 goto done;
1857 }
1858 }
1859
1860 /* req->flow_idx should only be one slot behind req->setup_head */
1861 flow = &req->flows[req->flow_idx];
1862 flow->pkt = 0;
1863 flow->tid_idx = 0;
1864 flow->sent = 0;
1865 if (!retry) {
1866 /* Set the first and last IB PSN for the flow in use.*/
1867 flow->flow_state.ib_spsn = req->s_next_psn;
1868 flow->flow_state.ib_lpsn =
1869 flow->flow_state.ib_spsn + flow->npkts - 1;
1870 }
1871
1872 /* Calculate the next segment start psn.*/
1873 req->s_next_psn += flow->npkts;
1874
1875 /* Build the packet header */
1876 hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
1877done:
1878 return hdwords;
1879}
d0d564a1
KW
1880
1881/*
1882 * Validate and accept the TID RDMA READ request parameters.
1883 * Return 0 if the request is accepted successfully;
1884 * Return 1 otherwise.
1885 */
1886static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
1887 struct rvt_ack_entry *e,
1888 struct hfi1_packet *packet,
1889 struct ib_other_headers *ohdr,
1890 u32 bth0, u32 psn, u64 vaddr, u32 len)
1891{
1892 struct hfi1_qp_priv *qpriv = qp->priv;
1893 struct tid_rdma_request *req;
1894 struct tid_rdma_flow *flow;
1895 u32 flow_psn, i, tidlen = 0, pktlen, tlen;
1896
1897 req = ack_to_tid_req(e);
1898
1899 /* Validate the payload first */
1900 flow = &req->flows[req->setup_head];
1901
1902 /* payload length = packet length - (header length + ICRC length) */
1903 pktlen = packet->tlen - (packet->hlen + 4);
1904 if (pktlen > sizeof(flow->tid_entry))
1905 return 1;
1906 memcpy(flow->tid_entry, packet->ebuf, pktlen);
1907 flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
1908
1909 /*
1910 * Walk the TID_ENTRY list to make sure we have enough space for a
1911 * complete segment. Also calculate the number of required packets.
1912 */
1913 flow->npkts = rvt_div_round_up_mtu(qp, len);
1914 for (i = 0; i < flow->tidcnt; i++) {
3ce5daa2
KW
1915 trace_hfi1_tid_entry_rcv_read_req(qp, i,
1916 flow->tid_entry[i]);
d0d564a1
KW
1917 tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
1918 if (!tlen)
1919 return 1;
1920
1921 /*
1922 * For tid pair (tidctr == 3), the buffer size of the pair
1923 * should be the sum of the buffer size described by each
1924 * tid entry. However, only the first entry needs to be
1925 * specified in the request (see WFR HAS Section 8.5.7.1).
1926 */
1927 tidlen += tlen;
1928 }
1929 if (tidlen * PAGE_SIZE < len)
1930 return 1;
1931
1932 /* Empty the flow array */
1933 req->clear_tail = req->setup_head;
1934 flow->pkt = 0;
1935 flow->tid_idx = 0;
1936 flow->tid_offset = 0;
1937 flow->sent = 0;
1938 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
1939 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
1940 TID_RDMA_DESTQP_FLOW_MASK;
1941 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
1942 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
1943 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
1944 flow->length = len;
1945
1946 flow->flow_state.lpsn = flow->flow_state.spsn +
1947 flow->npkts - 1;
1948 flow->flow_state.ib_spsn = psn;
1949 flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
1950
3ce5daa2 1951 trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
d0d564a1
KW
1952 /* Set the initial flow index to the current flow. */
1953 req->flow_idx = req->setup_head;
1954
1955 /* advance circular buffer head */
1956 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
1957
1958 /*
1959 * Compute last PSN for request.
1960 */
1961 e->opcode = (bth0 >> 24) & 0xff;
1962 e->psn = psn;
1963 e->lpsn = psn + flow->npkts - 1;
1964 e->sent = 0;
1965
1966 req->n_flows = qpriv->tid_rdma.local.max_read;
1967 req->state = TID_REQUEST_ACTIVE;
1968 req->cur_seg = 0;
1969 req->comp_seg = 0;
1970 req->ack_seg = 0;
1971 req->isge = 0;
1972 req->seg_len = qpriv->tid_rdma.local.max_len;
1973 req->total_len = len;
1974 req->total_segs = 1;
1975 req->r_flow_psn = e->psn;
1976
3ce5daa2
KW
1977 trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
1978 req);
d0d564a1
KW
1979 return 0;
1980}
1981
1982static int tid_rdma_rcv_error(struct hfi1_packet *packet,
1983 struct ib_other_headers *ohdr,
1984 struct rvt_qp *qp, u32 psn, int diff)
1985{
1986 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1987 struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
07b92370
KW
1988 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
1989 struct hfi1_qp_priv *qpriv = qp->priv;
d0d564a1
KW
1990 struct rvt_ack_entry *e;
1991 struct tid_rdma_request *req;
1992 unsigned long flags;
1993 u8 prev;
1994 bool old_req;
1995
3ce5daa2
KW
1996 trace_hfi1_rsp_tid_rcv_error(qp, psn);
1997 trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
d0d564a1
KW
1998 if (diff > 0) {
1999 /* sequence error */
2000 if (!qp->r_nak_state) {
2001 ibp->rvp.n_rc_seqnak++;
2002 qp->r_nak_state = IB_NAK_PSN_ERROR;
2003 qp->r_ack_psn = qp->r_psn;
2004 rc_defered_ack(rcd, qp);
2005 }
2006 goto done;
2007 }
2008
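	/* diff <= 0: this is a duplicate of a request already received. */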
2009 ibp->rvp.n_rc_dupreq++;
2010
2011 spin_lock_irqsave(&qp->s_lock, flags);
2012 e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
07b92370
KW
2013 if (!e || (e->opcode != TID_OP(READ_REQ) &&
2014 e->opcode != TID_OP(WRITE_REQ)))
d0d564a1
KW
2015 goto unlock;
2016
2017 req = ack_to_tid_req(e);
2018 req->r_flow_psn = psn;
3ce5daa2 2019 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
d0d564a1
KW
2020 if (e->opcode == TID_OP(READ_REQ)) {
2021 struct ib_reth *reth;
2022 u32 offset;
2023 u32 len;
2024 u32 rkey;
2025 u64 vaddr;
2026 int ok;
2027 u32 bth0;
2028
2029 reth = &ohdr->u.tid_rdma.r_req.reth;
2030 /*
2031 * The requester always restarts from the start of the original
2032 * request.
2033 */
2034 offset = delta_psn(psn, e->psn) * qp->pmtu;
2035 len = be32_to_cpu(reth->length);
2036 if (psn != e->psn || len != req->total_len)
2037 goto unlock;
2038
2039 if (e->rdma_sge.mr) {
2040 rvt_put_mr(e->rdma_sge.mr);
2041 e->rdma_sge.mr = NULL;
2042 }
2043
2044 rkey = be32_to_cpu(reth->rkey);
2045 vaddr = get_ib_reth_vaddr(reth);
2046
2047 qp->r_len = len;
2048 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
2049 IB_ACCESS_REMOTE_READ);
2050 if (unlikely(!ok))
2051 goto unlock;
2052
2053 /*
2054 * All the response packets for the current request may have
2055 * been sent out and the request may be complete (old_request
2056 * == false), in which case the TID flow may be unusable (the
2057 * req->clear_tail has been advanced). However, when an earlier
2058 * request is received, this request will no longer be complete
2059 * (qp->s_tail_ack_queue is moved back, see below).
2060 * Consequently, we need to update the TID flow info every time
2061 * a duplicate request is received.
2062 */
2063 bth0 = be32_to_cpu(ohdr->bth[0]);
2064 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
2065 vaddr, len))
2066 goto unlock;
2067
2068 /*
2069 * True if the request is already scheduled (between
2070 * qp->s_tail_ack_queue and qp->r_head_ack_queue);
2071 */
2072 if (old_req)
2073 goto unlock;
07b92370
KW
2074 } else {
2075 struct flow_state *fstate;
2076 bool schedule = false;
2077 u8 i;
2078
2079 if (req->state == TID_REQUEST_RESEND) {
2080 req->state = TID_REQUEST_RESEND_ACTIVE;
2081 } else if (req->state == TID_REQUEST_INIT_RESEND) {
2082 req->state = TID_REQUEST_INIT;
2083 schedule = true;
2084 }
2085
2086 /*
2087 * True if the request is already scheduled (between
2088 * qp->s_tail_ack_queue and qp->r_head_ack_queue).
2089 * Also, don't change requests, which are at the SYNC
2090 * point and haven't generated any responses yet.
2091 * There is nothing to retransmit for them yet.
2092 */
2093 if (old_req || req->state == TID_REQUEST_INIT ||
2094 (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
2095 for (i = prev + 1; ; i++) {
2096 if (i > rvt_size_atomic(&dev->rdi))
2097 i = 0;
2098 if (i == qp->r_head_ack_queue)
2099 break;
2100 e = &qp->s_ack_queue[i];
2101 req = ack_to_tid_req(e);
2102 if (e->opcode == TID_OP(WRITE_REQ) &&
2103 req->state == TID_REQUEST_INIT)
2104 req->state = TID_REQUEST_INIT_RESEND;
2105 }
2106 /*
2107 * If the state of the request has been changed,
2108 * the first leg needs to get scheduled in order to
2109 * pick up the change. Otherwise, normal response
2110 * processing should take care of it.
2111 */
2112 if (!schedule)
2113 goto unlock;
2114 }
2115
2116 /*
2117 * If there is no more allocated segment, just schedule the qp
2118 * without changing any state.
2119 */
2120 if (req->clear_tail == req->setup_head)
2121 goto schedule;
2122 /*
2123 * If this request has sent responses for segments, which have
2124 * not received data yet (flow_idx != clear_tail), the flow_idx
2125 * pointer needs to be adjusted so the same responses can be
2126 * re-sent.
2127 */
2128 if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
2129 fstate = &req->flows[req->clear_tail].flow_state;
2130 qpriv->pending_tid_w_segs -=
2131 CIRC_CNT(req->flow_idx, req->clear_tail,
2132 MAX_FLOWS);
2133 req->flow_idx =
2134 CIRC_ADD(req->clear_tail,
2135 delta_psn(psn, fstate->resp_ib_psn),
2136 MAX_FLOWS);
2137 qpriv->pending_tid_w_segs +=
2138 delta_psn(psn, fstate->resp_ib_psn);
2139 /*
2140 * When flow_idx == setup_head, we've gotten a duplicate
2141 * request for a segment, which has not been allocated
2142 * yet. In that case, don't adjust this request.
2143 * However, we still want to go through the loop below
2144 * to adjust all subsequent requests.
2145 */
2146 if (CIRC_CNT(req->setup_head, req->flow_idx,
2147 MAX_FLOWS)) {
2148 req->cur_seg = delta_psn(psn, e->psn);
2149 req->state = TID_REQUEST_RESEND_ACTIVE;
2150 }
2151 }
2152
2153 for (i = prev + 1; ; i++) {
2154 /*
2155 * Look at everything up to and including
2156 * s_tail_ack_queue
2157 */
2158 if (i > rvt_size_atomic(&dev->rdi))
2159 i = 0;
2160 if (i == qp->r_head_ack_queue)
2161 break;
2162 e = &qp->s_ack_queue[i];
2163 req = ack_to_tid_req(e);
2164 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
2165 e->lpsn, req);
2166 if (e->opcode != TID_OP(WRITE_REQ) ||
2167 req->cur_seg == req->comp_seg ||
2168 req->state == TID_REQUEST_INIT ||
2169 req->state == TID_REQUEST_INIT_RESEND) {
2170 if (req->state == TID_REQUEST_INIT)
2171 req->state = TID_REQUEST_INIT_RESEND;
2172 continue;
2173 }
2174 qpriv->pending_tid_w_segs -=
2175 CIRC_CNT(req->flow_idx,
2176 req->clear_tail,
2177 MAX_FLOWS);
2178 req->flow_idx = req->clear_tail;
2179 req->state = TID_REQUEST_RESEND;
2180 req->cur_seg = req->comp_seg;
2181 }
d0d564a1
KW
2182 }
2183 /* Re-process old requests.*/
4f9264d1
KW
2184 if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
2185 qp->s_acked_ack_queue = prev;
d0d564a1
KW
2186 qp->s_tail_ack_queue = prev;
2187 /*
2188 * Since the qp->s_tail_ack_queue is modified, the
2189 * qp->s_ack_state must be changed to re-initialize
2190 * qp->s_ack_rdma_sge; Otherwise, we will end up in
2191 * wrong memory region.
2192 */
2193 qp->s_ack_state = OP(ACKNOWLEDGE);
07b92370
KW
2194schedule:
2195 /*
2196 * It's possible to receive a retry psn that is earlier than an RNR NAK
2197 * psn. In this case, the RNR NAK state should be cleared.
2198 */
2199 if (qpriv->rnr_nak_state) {
2200 qp->s_nak_state = 0;
2201 qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
2202 qp->r_psn = e->lpsn + 1;
2203 hfi1_tid_write_alloc_resources(qp, true);
2204 }
2205
d0d564a1
KW
2206 qp->r_state = e->opcode;
2207 qp->r_nak_state = 0;
2208 qp->s_flags |= RVT_S_RESP_PENDING;
2209 hfi1_schedule_send(qp);
2210unlock:
2211 spin_unlock_irqrestore(&qp->s_lock, flags);
2212done:
2213 return 1;
2214}
2215
2216void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
2217{
2218 /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
2219
2220 /*
2221 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
2222 * (see hfi1_rc_rcv())
2223 * 2. Put TID RDMA READ REQ into the response queue (s_ack_queue)
2224 * - Setup struct tid_rdma_req with request info
2225 * - Initialize struct tid_rdma_flow info;
2226 * - Copy TID entries;
2227 * 3. Set the qp->s_ack_state.
2228 * 4. Set RVT_S_RESP_PENDING in s_flags.
2229 * 5. Kick the send engine (hfi1_schedule_send())
2230 */
2231 struct hfi1_ctxtdata *rcd = packet->rcd;
2232 struct rvt_qp *qp = packet->qp;
2233 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
2234 struct ib_other_headers *ohdr = packet->ohdr;
2235 struct rvt_ack_entry *e;
2236 unsigned long flags;
2237 struct ib_reth *reth;
2238 struct hfi1_qp_priv *qpriv = qp->priv;
2239 u32 bth0, psn, len, rkey;
2240 bool is_fecn;
2241 u8 next;
2242 u64 vaddr;
2243 int diff;
2244 u8 nack_state = IB_NAK_INVALID_REQUEST;
2245
2246 bth0 = be32_to_cpu(ohdr->bth[0]);
2247 if (hfi1_ruc_check_hdr(ibp, packet))
2248 return;
2249
2250 is_fecn = process_ecn(qp, packet);
2251 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
3ce5daa2 2252 trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
d0d564a1
KW
2253
2254 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2255 rvt_comm_est(qp);
2256
2257 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2258 goto nack_inv;
2259
2260 reth = &ohdr->u.tid_rdma.r_req.reth;
2261 vaddr = be64_to_cpu(reth->vaddr);
2262 len = be32_to_cpu(reth->length);
2263 /* The length needs to be in multiples of PAGE_SIZE */
2264 if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
2265 goto nack_inv;
2266
2267 diff = delta_psn(psn, qp->r_psn);
2268 if (unlikely(diff)) {
2269 if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
2270 return;
2271 goto send_ack;
2272 }
2273
2274 /* We've verified the request, insert it into the ack queue. */
2275 next = qp->r_head_ack_queue + 1;
2276 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
2277 next = 0;
2278 spin_lock_irqsave(&qp->s_lock, flags);
2279 if (unlikely(next == qp->s_tail_ack_queue)) {
2280 if (!qp->s_ack_queue[next].sent) {
2281 nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2282 goto nack_inv_unlock;
2283 }
2284 update_ack_queue(qp, next);
2285 }
2286 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2287 if (e->rdma_sge.mr) {
2288 rvt_put_mr(e->rdma_sge.mr);
2289 e->rdma_sge.mr = NULL;
2290 }
2291
2292 rkey = be32_to_cpu(reth->rkey);
2293 qp->r_len = len;
2294
2295 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
2296 rkey, IB_ACCESS_REMOTE_READ)))
2297 goto nack_acc;
2298
2299 /* Accept the request parameters */
2300 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
2301 len))
2302 goto nack_inv_unlock;
2303
2304 qp->r_state = e->opcode;
2305 qp->r_nak_state = 0;
2306 /*
2307 * We need to increment the MSN here instead of when we
2308 * finish sending the result since a duplicate request would
2309 * increment it more than once.
2310 */
2311 qp->r_msn++;
2312 qp->r_psn += e->lpsn - e->psn + 1;
2313
2314 qp->r_head_ack_queue = next;
2315
07b92370
KW
2316 /*
2317 * For all requests other than TID WRITE which are added to the ack
2318 * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
2319 * do this because of interlocks between these and TID WRITE
2320 * requests. The same change has also been made in hfi1_rc_rcv().
2321 */
2322 qpriv->r_tid_alloc = qp->r_head_ack_queue;
2323
d0d564a1
KW
2324 /* Schedule the send tasklet. */
2325 qp->s_flags |= RVT_S_RESP_PENDING;
2326 hfi1_schedule_send(qp);
2327
2328 spin_unlock_irqrestore(&qp->s_lock, flags);
2329 if (is_fecn)
2330 goto send_ack;
2331 return;
2332
2333nack_inv_unlock:
2334 spin_unlock_irqrestore(&qp->s_lock, flags);
2335nack_inv:
2336 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2337 qp->r_nak_state = nack_state;
2338 qp->r_ack_psn = qp->r_psn;
2339 /* Queue NAK for later */
2340 rc_defered_ack(rcd, qp);
2341 return;
2342nack_acc:
2343 spin_unlock_irqrestore(&qp->s_lock, flags);
2344 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2345 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2346 qp->r_ack_psn = qp->r_psn;
2347send_ack:
2348 hfi1_send_rc_ack(packet, is_fecn);
2349}
1db21b50
KW
2350
2351u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
2352 struct ib_other_headers *ohdr, u32 *bth0,
2353 u32 *bth1, u32 *bth2, u32 *len, bool *last)
2354{
2355 struct hfi1_ack_priv *epriv = e->priv;
2356 struct tid_rdma_request *req = &epriv->tid_req;
2357 struct hfi1_qp_priv *qpriv = qp->priv;
2358 struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
2359 u32 tidentry = flow->tid_entry[flow->tid_idx];
2360 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
2361 struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
2362 u32 next_offset, om = KDETH_OM_LARGE;
2363 bool last_pkt;
2364 u32 hdwords = 0;
2365 struct tid_rdma_params *remote;
2366
2367 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
2368 flow->sent += *len;
2369 next_offset = flow->tid_offset + *len;
2370 last_pkt = (flow->sent >= flow->length);
2371
3ce5daa2
KW
2372 trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
2373 trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
2374
1db21b50
KW
2375 rcu_read_lock();
2376 remote = rcu_dereference(qpriv->tid_rdma.remote);
2377 if (!remote) {
2378 rcu_read_unlock();
2379 goto done;
2380 }
2381 KDETH_RESET(resp->kdeth0, KVER, 0x1);
2382 KDETH_SET(resp->kdeth0, SH, !last_pkt);
2383 KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
2384 KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
2385 KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
2386 KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
2387 KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
2388 KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
2389 resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
2390 rcu_read_unlock();
2391
2392 resp->aeth = rvt_compute_aeth(qp);
2393 resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
2394 flow->pkt));
2395
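	/*
	 * BTH1 echoes the tid_flow_qp from the READ REQ so the response is
	 * steered into the requester's KDETH receive context; BTH2 carries
	 * the KDETH flow PSN (generation in the upper bits, spsn + pkt in
	 * the lower sequence bits).
	 */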
2396 *bth0 = TID_OP(READ_RESP) << 24;
2397 *bth1 = flow->tid_qpn;
2398 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
2399 HFI1_KDETH_BTH_SEQ_MASK) |
2400 (flow->flow_state.generation <<
2401 HFI1_KDETH_BTH_SEQ_SHIFT));
2402 *last = last_pkt;
2403 if (last_pkt)
2404 /* Advance to next flow */
2405 req->clear_tail = (req->clear_tail + 1) &
2406 (MAX_FLOWS - 1);
2407
2408 if (next_offset >= tidlen) {
2409 flow->tid_offset = 0;
2410 flow->tid_idx++;
2411 } else {
2412 flow->tid_offset = next_offset;
2413 }
2414
2415 hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
2416
2417done:
2418 return hdwords;
2419}
9905bf06
KW
2420
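/*
 * Find the TID RDMA request containing the given IB PSN by walking the
 * send queue from s_acked up to and including s_cur.  A request is
 * returned only if the WQE's [psn, lpsn] range covers @psn and its
 * opcode matches @opcode; otherwise NULL is returned.
 */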
2421static inline struct tid_rdma_request *
2422find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
2423 __must_hold(&qp->s_lock)
2424{
2425 struct rvt_swqe *wqe;
2426 struct tid_rdma_request *req = NULL;
2427 u32 i, end;
2428
2429 end = qp->s_cur + 1;
2430 if (end == qp->s_size)
2431 end = 0;
2432 for (i = qp->s_acked; i != end;) {
2433 wqe = rvt_get_swqe_ptr(qp, i);
2434 if (cmp_psn(psn, wqe->psn) >= 0 &&
2435 cmp_psn(psn, wqe->lpsn) <= 0) {
2436 if (wqe->wr.opcode == opcode)
2437 req = wqe_to_tid_req(wqe);
2438 break;
2439 }
2440 if (++i == qp->s_size)
2441 i = 0;
2442 }
2443
2444 return req;
2445}
2446
2447void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
2448{
2449 /* HANDLER FOR TID RDMA READ RESPONSE packet (Requester side) */
2450
2451 /*
2452 * 1. Find matching SWQE
2453 * 2. Check that the entire segment has been read.
2454 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
2455 * 4. Free the TID flow resources.
2456 * 5. Kick the send engine (hfi1_schedule_send())
2457 */
2458 struct ib_other_headers *ohdr = packet->ohdr;
2459 struct rvt_qp *qp = packet->qp;
2460 struct hfi1_qp_priv *priv = qp->priv;
2461 struct hfi1_ctxtdata *rcd = packet->rcd;
2462 struct tid_rdma_request *req;
2463 struct tid_rdma_flow *flow;
2464 u32 opcode, aeth;
2465 bool is_fecn;
2466 unsigned long flags;
2467 u32 kpsn, ipsn;
2468
3ce5daa2 2469 trace_hfi1_sender_rcv_tid_read_resp(qp);
9905bf06
KW
2470 is_fecn = process_ecn(qp, packet);
2471 kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2472 aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
2473 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2474
2475 spin_lock_irqsave(&qp->s_lock, flags);
2476 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2477 req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
2478 if (unlikely(!req))
2479 goto ack_op_err;
2480
2481 flow = &req->flows[req->clear_tail];
2482 /* When header suppression is disabled */
2483 if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
2484 goto ack_done;
2485 req->ack_pending--;
2486 priv->pending_tid_r_segs--;
2487 qp->s_num_rd_atomic--;
2488 if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
2489 !qp->s_num_rd_atomic) {
2490 qp->s_flags &= ~(RVT_S_WAIT_FENCE |
2491 RVT_S_WAIT_ACK);
2492 hfi1_schedule_send(qp);
2493 }
2494 if (qp->s_flags & RVT_S_WAIT_RDMAR) {
2495 qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
2496 hfi1_schedule_send(qp);
2497 }
2498
3ce5daa2
KW
2499 trace_hfi1_ack(qp, ipsn);
2500 trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
2501 req->e.swqe->psn, req->e.swqe->lpsn,
2502 req);
2503 trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
2504
9905bf06
KW
2505 /* Release the tid resources */
2506 hfi1_kern_exp_rcv_clear(req);
2507
2508 if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
2509 goto ack_done;
2510
2511 /* If not done yet, build next read request */
2512 if (++req->comp_seg >= req->total_segs) {
2513 priv->tid_r_comp++;
2514 req->state = TID_REQUEST_COMPLETE;
2515 }
2516
2517 /*
2518 * Clear the hw flow under two conditions:
2519 * 1. This request is a sync point and it is complete;
2520 * 2. Current request is completed and there are no more requests.
2521 */
2522 if ((req->state == TID_REQUEST_SYNC &&
2523 req->comp_seg == req->cur_seg) ||
2524 priv->tid_r_comp == priv->tid_r_reqs) {
2525 hfi1_kern_clear_hw_flow(priv->rcd, qp);
2526 if (req->state == TID_REQUEST_SYNC)
2527 req->state = TID_REQUEST_ACTIVE;
2528 }
2529
2530 hfi1_schedule_send(qp);
2531 goto ack_done;
2532
2533ack_op_err:
2534 /*
2535 * The test indicates that the send engine has finished its cleanup
2536 * after sending the request and it's now safe to put the QP into error
2537 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
2538 * == qp->s_head), it would be unsafe to complete the wqe pointed by
2539 * qp->s_acked here. Putting the qp into error state will safely flush
2540 * all remaining requests.
2541 */
2542 if (qp->s_last == qp->s_acked)
2543 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
2544
2545ack_done:
2546 spin_unlock_irqrestore(&qp->s_lock, flags);
2547 if (is_fecn)
2548 hfi1_send_rc_ack(packet, is_fecn);
2549}
2550
2551void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
2552 __must_hold(&qp->s_lock)
2553{
2554 u32 n = qp->s_acked;
2555 struct rvt_swqe *wqe;
2556 struct tid_rdma_request *req;
2557 struct hfi1_qp_priv *priv = qp->priv;
2558
2559 lockdep_assert_held(&qp->s_lock);
2560 /* Free any TID entries */
2561 while (n != qp->s_tail) {
2562 wqe = rvt_get_swqe_ptr(qp, n);
2563 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2564 req = wqe_to_tid_req(wqe);
2565 hfi1_kern_exp_rcv_clear_all(req);
2566 }
2567
2568 if (++n == qp->s_size)
2569 n = 0;
2570 }
2571 /* Free flow */
2572 hfi1_kern_clear_hw_flow(priv->rcd, qp);
2573}
2574
2575static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
2576 struct hfi1_packet *packet, u8 rcv_type,
2577 u8 opcode)
2578{
2579 struct rvt_qp *qp = packet->qp;
d72fe7d5 2580 struct hfi1_qp_priv *qpriv = qp->priv;
9905bf06
KW
2581 u32 ipsn;
2582 struct ib_other_headers *ohdr = packet->ohdr;
d72fe7d5
KW
2583 struct rvt_ack_entry *e;
2584 struct tid_rdma_request *req;
2585 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2586 u32 i;
9905bf06
KW
2587
2588 if (rcv_type >= RHF_RCV_TYPE_IB)
2589 goto done;
2590
2591 spin_lock(&qp->s_lock);
d72fe7d5
KW
2592
2593 /*
2594 * We've run out of space in the eager buffer.
2595 * Eagerly received KDETH packets which require space in the
2596 * eager buffer (packets that have a payload) are TID RDMA WRITE
2597 * response packets. In this case, we have to re-transmit the
2598 * TID RDMA WRITE request.
2599 */
2600 if (rcv_type == RHF_RCV_TYPE_EAGER) {
2601 hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
2602 hfi1_schedule_send(qp);
2603 goto done_unlock;
2604 }
2605
9905bf06
KW
2606 /*
2607 * For TID READ response, error out QP after freeing the tid
2608 * resources.
2609 */
2610 if (opcode == TID_OP(READ_RESP)) {
2611 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2612 if (cmp_psn(ipsn, qp->s_last_psn) > 0 &&
2613 cmp_psn(ipsn, qp->s_psn) < 0) {
2614 hfi1_kern_read_tid_flow_free(qp);
2615 spin_unlock(&qp->s_lock);
2616 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2617 goto done;
2618 }
d72fe7d5
KW
2619 goto done_unlock;
2620 }
2621
2622 /*
2623 * Error out the qp for TID RDMA WRITE
2624 */
2625 hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
2626 for (i = 0; i < rvt_max_atomic(rdi); i++) {
2627 e = &qp->s_ack_queue[i];
2628 if (e->opcode == TID_OP(WRITE_REQ)) {
2629 req = ack_to_tid_req(e);
2630 hfi1_kern_exp_rcv_clear_all(req);
2631 }
9905bf06 2632 }
d72fe7d5
KW
2633 spin_unlock(&qp->s_lock);
2634 rvt_rc_error(qp, IB_WC_LOC_LEN_ERR);
2635 goto done;
9905bf06 2636
d72fe7d5 2637done_unlock:
9905bf06
KW
2638 spin_unlock(&qp->s_lock);
2639done:
2640 return true;
2641}
2642
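/*
 * Requester side: rewind a TID RDMA READ request to the first segment
 * that has not been completely received (req->clear_tail) and queue the
 * QP on the receive context's wait list so it is rescheduled once
 * receive processing completes.
 */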
2643static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
2644 struct rvt_qp *qp, struct rvt_swqe *wqe)
2645{
2646 struct tid_rdma_request *req;
2647 struct tid_rdma_flow *flow;
2648
2649 /* Start from the right segment */
2650 qp->r_flags |= RVT_R_RDMAR_SEQ;
2651 req = wqe_to_tid_req(wqe);
2652 flow = &req->flows[req->clear_tail];
2653 hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
2654 if (list_empty(&qp->rspwait)) {
2655 qp->r_flags |= RVT_R_RSP_SEND;
2656 rvt_get_qp(qp);
2657 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2658 }
2659}
2660
2661/*
2662 * Handle the KDETH eflags for TID RDMA READ response.
2663 *
2664 * Return false if the last packet for a segment has been received and it is
2665 * time to process the response normally; otherwise, return true.
2666 *
2667 * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
2668 */
2669static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2670 struct hfi1_packet *packet, u8 rcv_type,
2671 u8 rte, u32 psn, u32 ibpsn)
2672 __must_hold(&packet->qp->r_lock) __must_hold(RCU)
2673{
2674 struct hfi1_pportdata *ppd = rcd->ppd;
2675 struct hfi1_devdata *dd = ppd->dd;
2676 struct hfi1_ibport *ibp;
2677 struct rvt_swqe *wqe;
2678 struct tid_rdma_request *req;
2679 struct tid_rdma_flow *flow;
2680 u32 ack_psn;
2681 struct rvt_qp *qp = packet->qp;
2682 struct hfi1_qp_priv *priv = qp->priv;
2683 bool ret = true;
2684 int diff = 0;
2685 u32 fpsn;
2686
2687 lockdep_assert_held(&qp->r_lock);
2688 /* If the psn is out of valid range, drop the packet */
2689 if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
2690 cmp_psn(ibpsn, qp->s_psn) > 0)
2691 return ret;
2692
2693 spin_lock(&qp->s_lock);
2694 /*
2695 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
2696 * requests and implicitly NAK RDMA read and atomic requests issued
2697 * before the NAK'ed request.
2698 */
2699 ack_psn = ibpsn - 1;
2700 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2701 ibp = to_iport(qp->ibqp.device, qp->port_num);
2702
2703 /* Complete WQEs that the PSN finishes. */
2704 while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
2705 /*
2706 * If this request is a RDMA read or atomic, and the NACK is
2707 * for a later operation, this NACK NAKs the RDMA read or
2708 * atomic.
2709 */
2710 if (wqe->wr.opcode == IB_WR_RDMA_READ ||
2711 wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
2712 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2713 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
2714 /* Retry this request. */
2715 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
2716 qp->r_flags |= RVT_R_RDMAR_SEQ;
2717 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2718 restart_tid_rdma_read_req(rcd, qp,
2719 wqe);
2720 } else {
2721 hfi1_restart_rc(qp, qp->s_last_psn + 1,
2722 0);
2723 if (list_empty(&qp->rspwait)) {
2724 qp->r_flags |= RVT_R_RSP_SEND;
2725 rvt_get_qp(qp);
2726 list_add_tail(/* wait */
2727 &qp->rspwait,
2728 &rcd->qp_wait_list);
2729 }
2730 }
2731 }
2732 /*
2733 * No need to process the NAK since we are
2734 * restarting an earlier request.
2735 */
2736 break;
2737 }
2738
2739 wqe = do_rc_completion(qp, wqe, ibp);
2740 if (qp->s_acked == qp->s_tail)
2741 break;
2742 }
2743
2744 /* Handle the eflags for the request */
2745 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
2746 goto s_unlock;
2747
2748 req = wqe_to_tid_req(wqe);
2749 switch (rcv_type) {
2750 case RHF_RCV_TYPE_EXPECTED:
2751 switch (rte) {
2752 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2753 /*
2754 * On the first occurrence of a Flow Sequence error,
2755 * the flag TID_FLOW_SW_PSN is set.
2756 *
2757 * After that, the flow is *not* reprogrammed and the
2758 * protocol falls back to SW PSN checking. This is done
2759 * to prevent continuous Flow Sequence errors for any
2760 * packets that could be still in the fabric.
2761 */
2762 flow = find_flow(req, psn, NULL);
2763 if (!flow) {
2764 /*
2765 * We can't find the IB PSN matching the
2766 * received KDETH PSN. The only thing we can
2767 * do at this point is report the error to
2768 * the QP.
2769 */
2770 hfi1_kern_read_tid_flow_free(qp);
2771 spin_unlock(&qp->s_lock);
2772 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2773 return ret;
2774 }
2775 if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
2776 diff = cmp_psn(psn,
2777 priv->flow_state.r_next_psn);
2778 if (diff > 0) {
2779 if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2780 restart_tid_rdma_read_req(rcd,
2781 qp,
2782 wqe);
2783
2784 /* Drop the packet.*/
2785 goto s_unlock;
2786 } else if (diff < 0) {
2787 /*
2788 * If a response packet for a restarted
2789 * request has come back, reset the
2790 * restart flag.
2791 */
2792 if (qp->r_flags & RVT_R_RDMAR_SEQ)
2793 qp->r_flags &=
2794 ~RVT_R_RDMAR_SEQ;
2795
2796 /* Drop the packet.*/
2797 goto s_unlock;
2798 }
2799
2800 /*
2801 * If SW PSN verification is successful and
2802 * this is the last packet in the segment, tell
2803 * the caller to process it as a normal packet.
2804 */
2805 fpsn = full_flow_psn(flow,
2806 flow->flow_state.lpsn);
2807 if (cmp_psn(fpsn, psn) == 0) {
2808 ret = false;
2809 if (qp->r_flags & RVT_R_RDMAR_SEQ)
2810 qp->r_flags &=
2811 ~RVT_R_RDMAR_SEQ;
2812 }
2813 priv->flow_state.r_next_psn++;
2814 } else {
2815 u64 reg;
2816 u32 last_psn;
2817
2818 /*
2819 * The only sane way to get the amount of
2820 * progress is to read the HW flow state.
2821 */
2822 reg = read_uctxt_csr(dd, rcd->ctxt,
2823 RCV_TID_FLOW_TABLE +
2824 (8 * flow->idx));
2825 last_psn = mask_psn(reg);
2826
2827 priv->flow_state.r_next_psn = last_psn;
2828 priv->flow_state.flags |= TID_FLOW_SW_PSN;
2829 /*
2830 * If no request has been restarted yet,
2831 * restart the current one.
2832 */
2833 if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2834 restart_tid_rdma_read_req(rcd, qp,
2835 wqe);
2836 }
2837
2838 break;
2839
2840 case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
2841 /*
2842 * Since the TID flow is able to ride through
2843 * generation mismatch, drop this stale packet.
2844 */
2845 break;
2846
2847 default:
2848 break;
2849 }
2850 break;
2851
2852 case RHF_RCV_TYPE_ERROR:
2853 switch (rte) {
2854 case RHF_RTE_ERROR_OP_CODE_ERR:
2855 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
2856 case RHF_RTE_ERROR_KHDR_HCRC_ERR:
2857 case RHF_RTE_ERROR_KHDR_KVER_ERR:
2858 case RHF_RTE_ERROR_CONTEXT_ERR:
2859 case RHF_RTE_ERROR_KHDR_TID_ERR:
2860 default:
2861 break;
2862 }
2863 default:
2864 break;
2865 }
2866s_unlock:
2867 spin_unlock(&qp->s_lock);
2868 return ret;
2869}
2870
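/*
 * Handle KDETH-specific error flags (eflags) for TID RDMA packets.
 * Returns true when the packet has been consumed here (dropped, NAK'ed
 * or used to restart a request); returns false when the caller should
 * continue processing it as a normal packet.
 */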
2871bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2872 struct hfi1_pportdata *ppd,
2873 struct hfi1_packet *packet)
2874{
2875 struct hfi1_ibport *ibp = &ppd->ibport_data;
2876 struct hfi1_devdata *dd = ppd->dd;
2877 struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
2878 u8 rcv_type = rhf_rcv_type(packet->rhf);
2879 u8 rte = rhf_rcv_type_err(packet->rhf);
2880 struct ib_header *hdr = packet->hdr;
2881 struct ib_other_headers *ohdr = NULL;
2882 int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
2883 u16 lid = be16_to_cpu(hdr->lrh[1]);
2884 u8 opcode;
2885 u32 qp_num, psn, ibpsn;
2886 struct rvt_qp *qp;
d72fe7d5 2887 struct hfi1_qp_priv *qpriv;
9905bf06
KW
2888 unsigned long flags;
2889 bool ret = true;
d72fe7d5
KW
2890 struct rvt_ack_entry *e;
2891 struct tid_rdma_request *req;
2892 struct tid_rdma_flow *flow;
9905bf06 2893
3ce5daa2
KW
2894 trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
2895 packet->rhf);
9905bf06
KW
2896 if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
2897 return ret;
2898
2899 packet->ohdr = &hdr->u.oth;
2900 ohdr = packet->ohdr;
2901 trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
2902
2903 /* Get the destination QP number. */
2904 qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
2905 RVT_QPN_MASK;
2906 if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
2907 goto drop;
2908
2909 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2910 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2911
2912 rcu_read_lock();
2913 qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
2914 if (!qp)
2915 goto rcu_unlock;
2916
2917 packet->qp = qp;
2918
2919 /* Check for valid receive state. */
2920 spin_lock_irqsave(&qp->r_lock, flags);
2921 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
2922 ibp->rvp.n_pkt_drops++;
2923 goto r_unlock;
2924 }
2925
2926 if (packet->rhf & RHF_TID_ERR) {
2927 /* For TIDERR and RC QPs preemptively schedule a NAK */
2928 u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
2929
2930 /* Sanity check packet */
2931 if (tlen < 24)
2932 goto r_unlock;
2933
2934 /*
2935 * Check for GRH. We should never get packets with GRH in this
2936 * path.
2937 */
2938 if (lnh == HFI1_LRH_GRH)
2939 goto r_unlock;
2940
2941 if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode))
2942 goto r_unlock;
2943 }
2944
2945 /* handle TID RDMA READ */
2946 if (opcode == TID_OP(READ_RESP)) {
2947 ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
2948 ibpsn = mask_psn(ibpsn);
2949 ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
2950 ibpsn);
d72fe7d5
KW
2951 goto r_unlock;
2952 }
2953
2954 /*
2955 * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
2956 * processed. These are completed sequentially, so we can be sure that
2957 * the pointer will not change until the entire request has completed.
2958 */
2959 spin_lock(&qp->s_lock);
2960 qpriv = qp->priv;
2961 e = &qp->s_ack_queue[qpriv->r_tid_tail];
2962 req = ack_to_tid_req(e);
2963 flow = &req->flows[req->clear_tail];
2964
2965 switch (rcv_type) {
2966 case RHF_RCV_TYPE_EXPECTED:
2967 switch (rte) {
2968 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2969 if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
2970 u64 reg;
2971
2972 qpriv->s_flags |= HFI1_R_TID_SW_PSN;
2973 /*
2974 * The only sane way to get the amount of
2975 * progress is to read the HW flow state.
2976 */
2977 reg = read_uctxt_csr(dd, rcd->ctxt,
2978 RCV_TID_FLOW_TABLE +
2979 (8 * flow->idx));
2980 flow->flow_state.r_next_psn = mask_psn(reg);
2981 qpriv->r_next_psn_kdeth =
2982 flow->flow_state.r_next_psn;
2983 goto nak_psn;
2984 } else {
2985 /*
2986 * If the received PSN does not match the next
2987 * expected PSN, NAK the packet.
2988 * However, only do that if we know that a
2989 * NAK has already been sent. Otherwise, this
2990 * mismatch could be due to packets that were
2991 * already in flight.
2992 */
2993 if (psn != flow->flow_state.r_next_psn) {
2994 psn = flow->flow_state.r_next_psn;
2995 goto nak_psn;
2996 }
2997
2998 qpriv->s_nak_state = 0;
2999 /*
3000 * If SW PSN verification is successful and this
3001 * is the last packet in the segment, tell the
3002 * caller to process it as a normal packet.
3003 */
3004 if (psn == full_flow_psn(flow,
3005 flow->flow_state.lpsn))
3006 ret = false;
3007 qpriv->r_next_psn_kdeth =
3008 ++flow->flow_state.r_next_psn;
3009 }
3010 break;
3011
3012 case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
3013 goto nak_psn;
3014
3015 default:
3016 break;
3017 }
3018 break;
3019
3020 case RHF_RCV_TYPE_ERROR:
3021 switch (rte) {
3022 case RHF_RTE_ERROR_OP_CODE_ERR:
3023 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
3024 case RHF_RTE_ERROR_KHDR_HCRC_ERR:
3025 case RHF_RTE_ERROR_KHDR_KVER_ERR:
3026 case RHF_RTE_ERROR_CONTEXT_ERR:
3027 case RHF_RTE_ERROR_KHDR_TID_ERR:
3028 default:
3029 break;
3030 }
3031 default:
3032 break;
9905bf06
KW
3033 }
3034
d72fe7d5
KW
3035unlock:
3036 spin_unlock(&qp->s_lock);
9905bf06
KW
3037r_unlock:
3038 spin_unlock_irqrestore(&qp->r_lock, flags);
3039rcu_unlock:
3040 rcu_read_unlock();
3041drop:
3042 return ret;
d72fe7d5
KW
3043nak_psn:
3044 ibp->rvp.n_rc_seqnak++;
3045 if (!qpriv->s_nak_state) {
3046 qpriv->s_nak_state = IB_NAK_PSN_ERROR;
3047 /* We are NAK'ing the next expected PSN */
3048 qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
3049 qpriv->s_flags |= RVT_S_ACK_PENDING;
3050 if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
3051 qpriv->r_tid_ack = qpriv->r_tid_tail;
572f0c33 3052 hfi1_schedule_tid_send(qp);
d72fe7d5
KW
3053 }
3054 goto unlock;
9905bf06 3055}
b126078e
KW
3056
3057/*
3058 * "Rewind" the TID request information.
3059 * This means that we reset the state back to ACTIVE,
3060 * find the proper flow, set the flow index to that flow,
3061 * and reset the flow information.
3062 */
3063void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
3064 u32 *bth2)
3065{
3066 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3067 struct tid_rdma_flow *flow;
6e38fca6
KW
3068 struct hfi1_qp_priv *qpriv = qp->priv;
3069 int diff, delta_pkts;
3070 u32 tididx = 0, i;
b126078e
KW
3071 u16 fidx;
3072
3073 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3074 *bth2 = mask_psn(qp->s_psn);
3075 flow = find_flow_ib(req, *bth2, &fidx);
3ce5daa2
KW
3076 if (!flow) {
3077 trace_hfi1_msg_tid_restart_req(/* msg */
3078 qp, "!!!!!! Could not find flow to restart: bth2 ",
3079 (u64)*bth2);
3080 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
3081 wqe->psn, wqe->lpsn,
3082 req);
b126078e 3083 return;
3ce5daa2 3084 }
b126078e 3085 } else {
6e38fca6
KW
3086 fidx = req->acked_tail;
3087 flow = &req->flows[fidx];
3088 *bth2 = mask_psn(req->r_ack_psn);
b126078e
KW
3089 }
3090
6e38fca6
KW
3091 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3092 delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
3093 else
3094 delta_pkts = delta_psn(*bth2,
3095 full_flow_psn(flow,
3096 flow->flow_state.spsn));
3097
3ce5daa2 3098 trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
6e38fca6 3099 diff = delta_pkts + flow->resync_npkts;
b126078e
KW
3100
3101 flow->sent = 0;
3102 flow->pkt = 0;
3103 flow->tid_idx = 0;
3104 flow->tid_offset = 0;
3105 if (diff) {
3106 for (tididx = 0; tididx < flow->tidcnt; tididx++) {
3107 u32 tidentry = flow->tid_entry[tididx], tidlen,
3108 tidnpkts, npkts;
3109
3110 flow->tid_offset = 0;
3111 tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
3112 tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
3113 npkts = min_t(u32, diff, tidnpkts);
3114 flow->pkt += npkts;
3115 flow->sent += (npkts == tidnpkts ? tidlen :
3116 npkts * qp->pmtu);
3117 flow->tid_offset += npkts * qp->pmtu;
3118 diff -= npkts;
3119 if (!diff)
3120 break;
3121 }
3122 }
6e38fca6
KW
3123 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3124 rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
3125 flow->sent, 0);
3126 /*
3127 * Packet PSN is based on flow_state.spsn + flow->pkt. However,
3128 * during a RESYNC, the generation is incremented and the
3129 * sequence is reset to 0. Since we've adjusted the npkts in the
3130 * flow and the SGE has been sufficiently advanced, we have to
3131 * adjust flow->pkt in order to calculate the correct PSN.
3132 */
3133 flow->pkt -= flow->resync_npkts;
3134 }
b126078e
KW
3135
3136 if (flow->tid_offset ==
3137 EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
3138 tididx++;
3139 flow->tid_offset = 0;
3140 }
3141 flow->tid_idx = tididx;
6e38fca6
KW
3142 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3143 /* Move flow_idx to correct index */
3144 req->flow_idx = fidx;
3145 else
3146 req->clear_tail = fidx;
b126078e 3147
3ce5daa2
KW
3148 trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
3149 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
3150 wqe->lpsn, req);
b126078e 3151 req->state = TID_REQUEST_ACTIVE;
6e38fca6
KW
3152 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3153 /* Reset all the flows that we are going to resend */
3154 fidx = CIRC_NEXT(fidx, MAX_FLOWS);
3155 i = qpriv->s_tid_tail;
3156 do {
3157 for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
3158 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
3159 req->flows[fidx].sent = 0;
3160 req->flows[fidx].pkt = 0;
3161 req->flows[fidx].tid_idx = 0;
3162 req->flows[fidx].tid_offset = 0;
3163 req->flows[fidx].resync_npkts = 0;
3164 }
3165 if (i == qpriv->s_tid_cur)
3166 break;
3167 do {
3168 i = (++i == qp->s_size ? 0 : i);
3169 wqe = rvt_get_swqe_ptr(qp, i);
3170 } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
3171 req = wqe_to_tid_req(wqe);
3172 req->cur_seg = req->ack_seg;
3173 fidx = req->acked_tail;
3174 /* Pull req->clear_tail back */
3175 req->clear_tail = fidx;
3176 } while (1);
3177 }
b126078e 3178}
24b11923
KW
3179
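/*
 * Release all TID RDMA READ resources held by a QP: clear the HW flow
 * (if one is allocated) and free the TID entries of every TID RDMA READ
 * WQE between s_acked and s_head.
 */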
3180void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
3181{
3182 int i, ret;
3183 struct hfi1_qp_priv *qpriv = qp->priv;
3184 struct tid_flow_state *fs;
3185
3186 if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
3187 return;
3188
3189 /*
3190 * First, clear the flow to help prevent any delayed packets from
3191 * being delivered.
3192 */
3193 fs = &qpriv->flow_state;
3194 if (fs->index != RXE_NUM_TID_FLOWS)
3195 hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
3196
3197 for (i = qp->s_acked; i != qp->s_head;) {
3198 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
3199
3200 if (++i == qp->s_size)
3201 i = 0;
3202 /* Free only locally allocated TID entries */
3203 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
3204 continue;
3205 do {
3206 struct hfi1_swqe_priv *priv = wqe->priv;
3207
3208 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
3209 } while (!ret);
3210 }
3211}
a0b34f75
KW
3212
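/*
 * Decide whether the WQE about to be processed must wait for a previous
 * one.  In this path only one interlock is checked: a TID RDMA READ that
 * immediately follows an RDMA READ must wait until everything before it
 * has been acknowledged (qp->s_acked == qp->s_cur).  When an interlock
 * is required, HFI1_S_TID_WAIT_INTERLCK is set and true is returned.
 */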
3213bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
3214{
3215 struct rvt_swqe *prev;
3216 struct hfi1_qp_priv *priv = qp->priv;
3217 u32 s_prev;
3218
3219 s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
3220 prev = rvt_get_swqe_ptr(qp, s_prev);
3221
3222 switch (wqe->wr.opcode) {
3223 case IB_WR_SEND:
3224 case IB_WR_SEND_WITH_IMM:
3225 case IB_WR_SEND_WITH_INV:
3226 case IB_WR_ATOMIC_CMP_AND_SWP:
3227 case IB_WR_ATOMIC_FETCH_AND_ADD:
3228 case IB_WR_RDMA_WRITE:
3229 case IB_WR_RDMA_READ:
3230 break;
3231 case IB_WR_TID_RDMA_READ:
3232 switch (prev->wr.opcode) {
3233 case IB_WR_RDMA_READ:
3234 if (qp->s_acked != qp->s_cur)
3235 goto interlock;
3236 break;
3237 default:
3238 break;
3239 }
3240 default:
3241 break;
3242 }
3243 return false;
3244
3245interlock:
3246 priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
3247 return true;
3248}
f1ab4efa
KW
3249
3250/* Does @sge meet the alignment requirements for tid rdma? */
3ce5daa2
KW
3251static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
3252 struct rvt_sge *sge, int num_sge)
f1ab4efa
KW
3253{
3254 int i;
3255
3ce5daa2
KW
3256 for (i = 0; i < num_sge; i++, sge++) {
3257 trace_hfi1_sge_check_align(qp, i, sge);
f1ab4efa
KW
3258 if ((u64)sge->vaddr & ~PAGE_MASK ||
3259 sge->sge_length & ~PAGE_MASK)
3260 return false;
3ce5daa2 3261 }
f1ab4efa
KW
3262 return true;
3263}
3264
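/*
 * Called when a send WQE is set up: if TID RDMA has been negotiated with
 * the peer and the RDMA READ's SGEs meet the alignment requirements,
 * convert the WQE into a TID RDMA READ, allocate its flow array and
 * pre-compute the segment size, segment count and last PSN.
 */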
3265void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
3266{
3267 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
3268 struct hfi1_swqe_priv *priv = wqe->priv;
3269 struct tid_rdma_params *remote;
3270 enum ib_wr_opcode new_opcode;
3271 bool do_tid_rdma = false;
3272 struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
3273
3274 if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
3275 ppd->lid)
3276 return;
3277 if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
3278 return;
3279
3280 rcu_read_lock();
3281 remote = rcu_dereference(qpriv->tid_rdma.remote);
3282 /*
3283 * If TID RDMA is disabled by the negotiation, don't
3284 * use it.
3285 */
3286 if (!remote)
3287 goto exit;
3288
3289 if (wqe->wr.opcode == IB_WR_RDMA_READ) {
3ce5daa2
KW
3290 if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
3291 wqe->wr.num_sge)) {
f1ab4efa
KW
3292 new_opcode = IB_WR_TID_RDMA_READ;
3293 do_tid_rdma = true;
3294 }
3295 }
3296
3297 if (do_tid_rdma) {
3298 if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
3299 goto exit;
3300 wqe->wr.opcode = new_opcode;
3301 priv->tid_req.seg_len =
3302 min_t(u32, remote->max_len, wqe->length);
3303 priv->tid_req.total_segs =
3304 DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
3305 /* Compute the last PSN of the request */
3306 wqe->lpsn = wqe->psn;
3307 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3308 priv->tid_req.n_flows = remote->max_read;
3309 qpriv->tid_r_reqs++;
3310 wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
3311 }
3312
3313 priv->tid_req.cur_seg = 0;
3314 priv->tid_req.comp_seg = 0;
3315 priv->tid_req.ack_seg = 0;
3316 priv->tid_req.state = TID_REQUEST_INACTIVE;
3ce5daa2
KW
3317 trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
3318 wqe->psn, wqe->lpsn,
3319 &priv->tid_req);
f1ab4efa
KW
3320 }
3321exit:
3322 rcu_read_unlock();
3323}
c098bbb0
KW
3324
3325/* TID RDMA WRITE functions */
3326
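/*
 * Build the TID RDMA WRITE REQ header: the RETH describes the remaining
 * portion of the destination buffer, BTH1 is redirected to the peer's
 * TID RDMA (KDETH) QP, and the sender then waits for the TID RDMA WRITE
 * RESP carrying the TID entries (HFI1_S_WAIT_TID_RESP).
 */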
3327u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
3328 struct ib_other_headers *ohdr,
3329 u32 *bth1, u32 *bth2, u32 *len)
3330{
3331 struct hfi1_qp_priv *qpriv = qp->priv;
3332 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3333 struct tid_rdma_params *remote;
3334
3335 rcu_read_lock();
3336 remote = rcu_dereference(qpriv->tid_rdma.remote);
3337 /*
3338 * Set the number of flows to be used based on the negotiated
3339 * parameters.
3340 */
3341 req->n_flows = remote->max_write;
3342 req->state = TID_REQUEST_ACTIVE;
3343
3344 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
3345 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
3346 ohdr->u.tid_rdma.w_req.reth.vaddr =
3347 cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
3348 ohdr->u.tid_rdma.w_req.reth.rkey =
3349 cpu_to_be32(wqe->rdma_wr.rkey);
3350 ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
3351 ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
3352 *bth1 &= ~RVT_QPN_MASK;
3353 *bth1 |= remote->qp;
3354 qp->s_state = TID_OP(WRITE_REQ);
3355 qp->s_flags |= HFI1_S_WAIT_TID_RESP;
3356 *bth2 |= IB_BTH_REQ_ACK;
3357 *len = 0;
3358
3359 rcu_read_unlock();
3360 return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
3361}
07b92370
KW
3362
3363void hfi1_compute_tid_rdma_flow_wt(void)
3364{
3365 /*
3366 * Heuristic for computing the RNR timeout when waiting on the flow
3367 * queue. Rather than a computationally expensive exact estimate of when
3368 * a flow will be available, we assume that if a QP is at position N in
3369 * the flow queue it has to wait approximately (N + 1) * (number of
3370 * segments between two sync points), assuming PMTU of 4K. The rationale
3371 * for this is that flows are released and recycled at each sync point.
3372 */
3373 tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) /
3374 TID_RDMA_MAX_SEGMENT_SIZE;
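	/*
	 * As a rough illustration (assuming an 11-bit KDETH sequence space,
	 * i.e. MAX_TID_FLOW_PSN == 2048, and a 256 KiB maximum segment
	 * size): 2048 * 4096 / 262144 = 32, i.e. each position in the flow
	 * queue is assumed to cost about 32 segments of waiting.
	 */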
3375}
3376
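/*
 * Number of QPs ahead of this one in @queue: the QP's enqueue ticket
 * minus the queue's dequeue counter.
 */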
3377static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
3378 struct tid_queue *queue)
3379{
3380 return qpriv->tid_enqueue - queue->dequeue;
3381}
3382
3383/*
3384 * @qp: points to rvt_qp context.
3385 * @to_seg: desired RNR timeout in segments.
3386 * Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
3387 */
3388static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
3389{
3390 struct hfi1_qp_priv *qpriv = qp->priv;
3391 u64 timeout;
3392 u32 bytes_per_us;
3393 u8 i;
3394
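	/*
	 * Convert @to_seg into an absolute time in microseconds: bytes to
	 * transmit divided by the link's egress rate in bytes per
	 * microsecond.
	 */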
3395 bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
3396 timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
3397 /*
3398 * Find the next highest value in the RNR table to the required
3399 * timeout. This gives the responder some padding.
3400 */
3401 for (i = 1; i <= IB_AETH_CREDIT_MASK; i++)
3402 if (rvt_rnr_tbl_to_usec(i) >= timeout)
3403 return i;
3404 return 0;
3405}
3406
3407/**
3408 * Central place for resource allocation at the TID write responder.
3409 * It is called from the write_req and write_data interrupt handlers as
3410 * well as from the send thread when a queued QP is scheduled for
3411 * resource allocation.
3412 *
3413 * Iterates over (a) segments of a request and then (b) queued requests
3414 * themselves to allocate resources for up to local->max_write
3415 * segments across multiple requests. Allocation stops when we
3416 * hit a sync point and resumes after the data packets at the
3417 * sync point have been received.
3418 *
3419 * Resource allocation and sending of responses are decoupled. The
3420 * request/segment being allocated and being sent are tracked separately:
3421 * Resources are allocated for:
3422 * [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
3423 * The send thread sends:
3424 * [request: qp->s_tail_ack_queue, segment: req->cur_seg]
3425 */
3426static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
3427{
3428 struct tid_rdma_request *req;
3429 struct hfi1_qp_priv *qpriv = qp->priv;
3430 struct hfi1_ctxtdata *rcd = qpriv->rcd;
3431 struct tid_rdma_params *local = &qpriv->tid_rdma.local;
3432 struct rvt_ack_entry *e;
3433 u32 npkts, to_seg;
3434 bool last;
3435 int ret = 0;
3436
3437 lockdep_assert_held(&qp->s_lock);
3438
3439 while (1) {
3440 /*
3441 * Don't allocate more segments if a RNR NAK has already been
3442 * scheduled to avoid messing up qp->r_psn: the RNR NAK will
3443 * be sent only when all allocated segments have been sent.
3444 * However, if more segments are allocated before that, TID RDMA
3445 * WRITE RESP packets will be sent out for these new segments
3446 * before the RNR NAK packet. When the requester receives the
3447 * RNR NAK packet, it will restart with qp->s_last_psn + 1,
3448 * which does not match qp->r_psn and will be dropped.
3449 * Consequently, the requester will exhaust its retries and
3450 * put the qp into error state.
3451 */
3452 if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
3453 break;
3454
3455 /* No requests left to process */
3456 if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
3457 /* If all data has been received, clear the flow */
3458 if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
3459 !qpriv->alloc_w_segs)
3460 hfi1_kern_clear_hw_flow(rcd, qp);
3461 break;
3462 }
3463
3464 e = &qp->s_ack_queue[qpriv->r_tid_alloc];
3465 if (e->opcode != TID_OP(WRITE_REQ))
3466 goto next_req;
3467 req = ack_to_tid_req(e);
3468 /* Finished allocating for all segments of this request */
3469 if (req->alloc_seg >= req->total_segs)
3470 goto next_req;
3471
3472 /* Can allocate only a maximum of local->max_write for a QP */
3473 if (qpriv->alloc_w_segs >= local->max_write)
3474 break;
3475
3476 /* Don't allocate at a sync point with data packets pending */
3477 if (qpriv->sync_pt && qpriv->alloc_w_segs)
3478 break;
3479
3480 /* All data received at the sync point, continue */
3481 if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
3482 hfi1_kern_clear_hw_flow(rcd, qp);
3483 qpriv->sync_pt = false;
3484 if (qpriv->s_flags & HFI1_R_TID_SW_PSN)
3485 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
3486 }
3487
3488 /* Allocate flow if we don't have one */
3489 if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
3490 ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
3491 if (ret) {
3492 to_seg = tid_rdma_flow_wt *
3493 position_in_queue(qpriv,
3494 &rcd->flow_queue);
3495 break;
3496 }
3497 }
3498
3499 npkts = rvt_div_round_up_mtu(qp, req->seg_len);
3500
3501 /*
3502 * We are at a sync point if we run out of KDETH PSN space.
3503 * Last PSN of every generation is reserved for RESYNC.
3504 */
3505 if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
3506 qpriv->sync_pt = true;
3507 break;
3508 }
3509
3510 /*
3511 * If overtaking req->acked_tail, send an RNR NAK. Because the
3512 * QP is not queued in this case, and the issue can only be
3513 * caused by a delay in scheduling the second leg, which we
3514 * cannot estimate, we use a rather arbitrary RNR timeout of
3515 * (MAX_FLOWS / 2) segments.
3516 */
3517 if (!CIRC_SPACE(req->setup_head, req->acked_tail,
3518 MAX_FLOWS)) {
3519 ret = -EAGAIN;
3520 to_seg = MAX_FLOWS >> 1;
3521 qpriv->s_flags |= RVT_S_ACK_PENDING;
572f0c33 3522 hfi1_schedule_tid_send(qp);
07b92370
KW
3523 break;
3524 }
3525
3526 /* Try to allocate rcv array / TID entries */
3527 ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
3528 if (ret == -EAGAIN)
3529 to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
3530 if (ret)
3531 break;
3532
3533 qpriv->alloc_w_segs++;
3534 req->alloc_seg++;
3535 continue;
3536next_req:
3537 /* Begin processing the next request */
3538 if (++qpriv->r_tid_alloc >
3539 rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3540 qpriv->r_tid_alloc = 0;
3541 }
3542
3543 /*
3544 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
3545 * has failed (b) we are called from the rcv handler interrupt context
3546 * (c) an RNR NAK has not already been scheduled
3547 */
3548 if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
3549 goto send_rnr_nak;
3550
3551 return;
3552
3553send_rnr_nak:
3554 lockdep_assert_held(&qp->r_lock);
3555
3556 /* Set r_nak_state to prevent unrelated events from generating NAK's */
3557 qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
3558
3559 /* Pull back r_psn to the segment being RNR NAK'd */
3560 qp->r_psn = e->psn + req->alloc_seg;
3561 qp->r_ack_psn = qp->r_psn;
3562 /*
3563 * Pull back r_head_ack_queue to the ack entry following the request
3564 * being RNR NAK'd. This allows resources to be allocated to the request
3565 * if the queued QP is scheduled.
3566 */
3567 qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
3568 if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3569 qp->r_head_ack_queue = 0;
3570 qpriv->r_tid_head = qp->r_head_ack_queue;
3571 /*
3572 * These send side fields are used in make_rc_ack(). They are set in
3573 * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
3574 * for consistency
3575 */
3576 qp->s_nak_state = qp->r_nak_state;
3577 qp->s_ack_psn = qp->r_ack_psn;
3578 /*
3579 * Clear the ACK PENDING flag to prevent an unwanted ACK because we
3580 * have modified qp->s_ack_psn here.
3581 */
3582 qp->s_flags &= ~(RVT_S_ACK_PENDING);
3583
3584 /*
3585 * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
3586 * has actually been sent. The RVT_S_ACK_PENDING bit in qp->s_flags cannot
3587 * be used for this because qp->s_lock is dropped before calling
3588 * hfi1_send_rc_ack(), leading to inconsistency between the receive
3589 * interrupt handlers and the send thread in make_rc_ack()
3590 */
3591 qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
3592
3593 /*
3594 * Schedule the RNR NAK to be sent. RNR NAKs are scheduled from the receive
3595 * interrupt handlers but will be sent from the send engine behind any
3596 * previous responses that may have been scheduled
3597 */
3598 rc_defered_ack(rcd, qp);
3599}
3600
3601void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
3602{
3603 /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side) */
3604
3605 /*
3606 * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
3607 * (see hfi1_rc_rcv())
3608 * - Don't allow 0-length requests.
3609 * 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
3610 * - Set up struct tid_rdma_req with request info
3611 * - Prepare struct tid_rdma_flow array?
3612 * 3. Set qp->s_ack_state as per the state diagram in the design doc.
3613 * 4. Set RVT_S_RESP_PENDING in s_flags.
3614 * 5. Kick the send engine (hfi1_schedule_send())
3615 */
3616 struct hfi1_ctxtdata *rcd = packet->rcd;
3617 struct rvt_qp *qp = packet->qp;
3618 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
3619 struct ib_other_headers *ohdr = packet->ohdr;
3620 struct rvt_ack_entry *e;
3621 unsigned long flags;
3622 struct ib_reth *reth;
3623 struct hfi1_qp_priv *qpriv = qp->priv;
3624 struct tid_rdma_request *req;
3625 u32 bth0, psn, len, rkey, num_segs;
3626 bool is_fecn;
3627 u8 next;
3628 u64 vaddr;
3629 int diff;
3630
3631 bth0 = be32_to_cpu(ohdr->bth[0]);
3632 if (hfi1_ruc_check_hdr(ibp, packet))
3633 return;
3634
3635 is_fecn = process_ecn(qp, packet);
3636 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
3637
3638 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
3639 rvt_comm_est(qp);
3640
3641 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3642 goto nack_inv;
3643
3644 reth = &ohdr->u.tid_rdma.w_req.reth;
3645 vaddr = be64_to_cpu(reth->vaddr);
3646 len = be32_to_cpu(reth->length);
3647
3648 num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
3649 diff = delta_psn(psn, qp->r_psn);
3650 if (unlikely(diff)) {
3651 if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
3652 return;
3653 goto send_ack;
3654 }
3655
3656 /*
3657 * The resent request, which was previously RNR NAK'd, is inserted at the
3658 * location of the original request, which is one entry behind
3659 * r_head_ack_queue.
3660 */
3661 if (qpriv->rnr_nak_state)
3662 qp->r_head_ack_queue = qp->r_head_ack_queue ?
3663 qp->r_head_ack_queue - 1 :
3664 rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
3665
3666 /* We've verified the request, insert it into the ack queue. */
3667 next = qp->r_head_ack_queue + 1;
3668 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3669 next = 0;
3670 spin_lock_irqsave(&qp->s_lock, flags);
3671 if (unlikely(next == qp->s_acked_ack_queue)) {
3672 if (!qp->s_ack_queue[next].sent)
3673 goto nack_inv_unlock;
3674 update_ack_queue(qp, next);
3675 }
3676 e = &qp->s_ack_queue[qp->r_head_ack_queue];
3677 req = ack_to_tid_req(e);
3678
3679 /* Bring previously RNR NAK'd request back to life */
3680 if (qpriv->rnr_nak_state) {
3681 qp->r_nak_state = 0;
3682 qp->s_nak_state = 0;
3683 qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
3684 qp->r_psn = e->lpsn + 1;
3685 req->state = TID_REQUEST_INIT;
3686 goto update_head;
3687 }
3688
3689 if (e->rdma_sge.mr) {
3690 rvt_put_mr(e->rdma_sge.mr);
3691 e->rdma_sge.mr = NULL;
3692 }
3693
3694 /* The length needs to be a multiple of PAGE_SIZE */
3695 if (!len || len & ~PAGE_MASK)
3696 goto nack_inv_unlock;
3697
3698 rkey = be32_to_cpu(reth->rkey);
3699 qp->r_len = len;
3700
3701 if (e->opcode == TID_OP(WRITE_REQ) &&
3702 (req->setup_head != req->clear_tail ||
3703 req->clear_tail != req->acked_tail))
3704 goto nack_inv_unlock;
3705
3706 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
3707 rkey, IB_ACCESS_REMOTE_WRITE)))
3708 goto nack_acc;
3709
3710 qp->r_psn += num_segs - 1;
3711
3712 e->opcode = (bth0 >> 24) & 0xff;
3713 e->psn = psn;
3714 e->lpsn = qp->r_psn;
3715 e->sent = 0;
3716
3717 req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
3718 req->state = TID_REQUEST_INIT;
3719 req->cur_seg = 0;
3720 req->comp_seg = 0;
3721 req->ack_seg = 0;
3722 req->alloc_seg = 0;
3723 req->isge = 0;
3724 req->seg_len = qpriv->tid_rdma.local.max_len;
3725 req->total_len = len;
3726 req->total_segs = num_segs;
3727 req->r_flow_psn = e->psn;
3728 req->ss.sge = e->rdma_sge;
3729 req->ss.num_sge = 1;
3730
3731 req->flow_idx = req->setup_head;
3732 req->clear_tail = req->setup_head;
3733 req->acked_tail = req->setup_head;
3734
3735 qp->r_state = e->opcode;
3736 qp->r_nak_state = 0;
3737 /*
3738 * We need to increment the MSN here instead of when we
3739 * finish sending the result since a duplicate request would
3740 * increment it more than once.
3741 */
3742 qp->r_msn++;
3743 qp->r_psn++;
3744
3745 if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
3746 qpriv->r_tid_tail = qp->r_head_ack_queue;
3747 } else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
3748 struct tid_rdma_request *ptr;
3749
3750 e = &qp->s_ack_queue[qpriv->r_tid_tail];
3751 ptr = ack_to_tid_req(e);
3752
3753 if (e->opcode != TID_OP(WRITE_REQ) ||
3754 ptr->comp_seg == ptr->total_segs) {
3755 if (qpriv->r_tid_tail == qpriv->r_tid_ack)
3756 qpriv->r_tid_ack = qp->r_head_ack_queue;
3757 qpriv->r_tid_tail = qp->r_head_ack_queue;
3758 }
3759 }
3760update_head:
3761 qp->r_head_ack_queue = next;
3762 qpriv->r_tid_head = qp->r_head_ack_queue;
3763
3764 hfi1_tid_write_alloc_resources(qp, true);
3765
3766 /* Schedule the send tasklet. */
3767 qp->s_flags |= RVT_S_RESP_PENDING;
3768 hfi1_schedule_send(qp);
3769
3770 spin_unlock_irqrestore(&qp->s_lock, flags);
3771 if (is_fecn)
3772 goto send_ack;
3773 return;
3774
3775nack_inv_unlock:
3776 spin_unlock_irqrestore(&qp->s_lock, flags);
3777nack_inv:
3778 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
3779 qp->r_nak_state = IB_NAK_INVALID_REQUEST;
3780 qp->r_ack_psn = qp->r_psn;
3781 /* Queue NAK for later */
3782 rc_defered_ack(rcd, qp);
3783 return;
3784nack_acc:
3785 spin_unlock_irqrestore(&qp->s_lock, flags);
3786 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
3787 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
3788 qp->r_ack_psn = qp->r_psn;
3789send_ack:
3790 hfi1_send_rc_ack(packet, is_fecn);
3791}
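
/*
 * Illustrative sketch only (hypothetical helper, not used by the
 * driver): the handler above splits the incoming WRITE into segments of
 * at most the negotiated local max_len bytes and reserves one IB PSN
 * per segment, which is why qp->r_psn is advanced by num_segs - 1
 * before the final increment.
 */
static inline u32 tid_write_req_num_segs(u32 len, u32 max_len)
{
	/* e.g. len = 1 MB with max_len = 256 KB yields 4 segments */
	return DIV_ROUND_UP(len, max_len);
}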
38d46d36
KW
3792
3793u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
3794 struct ib_other_headers *ohdr, u32 *bth1,
3795 u32 bth2, u32 *len,
3796 struct rvt_sge_state **ss)
3797{
3798 struct hfi1_ack_priv *epriv = e->priv;
3799 struct tid_rdma_request *req = &epriv->tid_req;
3800 struct hfi1_qp_priv *qpriv = qp->priv;
3801 struct tid_rdma_flow *flow = NULL;
3802 u32 resp_len = 0, hdwords = 0;
3803 void *resp_addr = NULL;
3804 struct tid_rdma_params *remote;
3805
3806 flow = &req->flows[req->flow_idx];
3807 switch (req->state) {
3808 default:
3809 /*
3810 * Try to allocate resources here in case the QP was queued and was
3811 * later scheduled when resources became available
3812 */
3813 hfi1_tid_write_alloc_resources(qp, false);
3814
3815 /* We've already sent everything which is ready */
3816 if (req->cur_seg >= req->alloc_seg)
3817 goto done;
3818
3819 /*
3820 * Resources can be assigned, but responses cannot be sent in the
3821 * rnr_nak state until the resent request is received
3822 */
3823 if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
3824 goto done;
3825
3826 req->state = TID_REQUEST_ACTIVE;
3827 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3c759e00 3828 hfi1_add_tid_reap_timer(qp);
38d46d36
KW
3829 break;
3830
3831 case TID_REQUEST_RESEND_ACTIVE:
3832 case TID_REQUEST_RESEND:
3833 req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3834 if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
3835 req->state = TID_REQUEST_ACTIVE;
3836
3c759e00 3837 hfi1_mod_tid_reap_timer(qp);
38d46d36
KW
3838 break;
3839 }
3840 flow->flow_state.resp_ib_psn = bth2;
3841 resp_addr = (void *)flow->tid_entry;
3842 resp_len = sizeof(*flow->tid_entry) * flow->tidcnt;
3843 req->cur_seg++;
3844
3845 memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp));
3846 epriv->ss.sge.vaddr = resp_addr;
3847 epriv->ss.sge.sge_length = resp_len;
3848 epriv->ss.sge.length = epriv->ss.sge.sge_length;
3849 /*
3850 * We can safely zero these out. Since the first SGE covers the
3851 * entire packet, nothing else should even look at the MR.
3852 */
3853 epriv->ss.sge.mr = NULL;
3854 epriv->ss.sge.m = 0;
3855 epriv->ss.sge.n = 0;
3856
3857 epriv->ss.sg_list = NULL;
3858 epriv->ss.total_len = epriv->ss.sge.sge_length;
3859 epriv->ss.num_sge = 1;
3860
3861 *ss = &epriv->ss;
3862 *len = epriv->ss.total_len;
3863
3864 /* Construct the TID RDMA WRITE RESP packet header */
3865 rcu_read_lock();
3866 remote = rcu_dereference(qpriv->tid_rdma.remote);
3867
3868 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1);
3869 KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
3870 ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
3871 ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
3872 cpu_to_be32((flow->flow_state.generation <<
3873 HFI1_KDETH_BTH_SEQ_SHIFT) |
3874 (flow->flow_state.spsn &
3875 HFI1_KDETH_BTH_SEQ_MASK));
3876 ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
3877 cpu_to_be32(qpriv->tid_rdma.local.qp |
3878 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
3879 TID_RDMA_DESTQP_FLOW_SHIFT) |
3880 qpriv->rcd->ctxt);
3881 ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
3882 *bth1 = remote->qp;
3883 rcu_read_unlock();
3884 hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
3885 qpriv->pending_tid_w_segs++;
3886done:
3887 return hdwords;
3888}
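
/*
 * Sketch of the two packed words advertised in the WRITE RESP above
 * (hypothetical helpers that mirror the cpu_to_be32() construction in
 * hfi1_build_tid_rdma_write_resp()): the flow's generation and starting
 * sequence go into tid_flow_psn, while the TID RDMA QP number, HW flow
 * index and receive context are combined into tid_flow_qp.
 */
static inline u32 tid_flow_psn_pack(u32 generation, u32 spsn)
{
	return (generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
	       (spsn & HFI1_KDETH_BTH_SEQ_MASK);
}

static inline u32 tid_flow_qp_pack(u32 tid_qp, u32 flow_idx, u32 ctxt)
{
	return tid_qp |
	       ((flow_idx & TID_RDMA_DESTQP_FLOW_MASK) <<
		TID_RDMA_DESTQP_FLOW_SHIFT) |
	       ctxt;
}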
3c759e00
KW
3889
3890static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
3891{
3892 struct hfi1_qp_priv *qpriv = qp->priv;
3893
3894 lockdep_assert_held(&qp->s_lock);
3895 if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) {
3896 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
3897 qpriv->s_tid_timer.expires = jiffies +
3898 qpriv->tid_timer_timeout_jiffies;
3899 add_timer(&qpriv->s_tid_timer);
3900 }
3901}
3902
3903static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
3904{
3905 struct hfi1_qp_priv *qpriv = qp->priv;
3906
3907 lockdep_assert_held(&qp->s_lock);
3908 qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
3909 mod_timer(&qpriv->s_tid_timer, jiffies +
3910 qpriv->tid_timer_timeout_jiffies);
3911}
3912
3913static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
3914{
3915 struct hfi1_qp_priv *qpriv = qp->priv;
3916 int rval = 0;
3917
3918 lockdep_assert_held(&qp->s_lock);
3919 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
3920 rval = del_timer(&qpriv->s_tid_timer);
3921 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
3922 }
3923 return rval;
3924}
3925
3926void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
3927{
3928 struct hfi1_qp_priv *qpriv = qp->priv;
3929
3930 del_timer_sync(&qpriv->s_tid_timer);
3931 qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
3932}
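
/*
 * Reap-timer lifecycle in this file, roughly: hfi1_add_tid_reap_timer()
 * arms the timer when a WRITE RESP is first built for a request,
 * hfi1_mod_tid_reap_timer() pushes it out while TID WRITE DATA segments
 * are still pending, hfi1_stop_tid_reap_timer() disarms it once nothing
 * is outstanding, and hfi1_del_tid_reap_timer() is the synchronous
 * teardown path. If the timer ever fires (hfi1_tid_timeout() below),
 * the responder reclaims its HW flow and RcvArray resources and moves
 * the QP into an error state.
 */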
3933
3934static void hfi1_tid_timeout(struct timer_list *t)
3935{
3936 struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
3937 struct rvt_qp *qp = qpriv->owner;
3938 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
3939 unsigned long flags;
3940 u32 i;
3941
3942 spin_lock_irqsave(&qp->r_lock, flags);
3943 spin_lock(&qp->s_lock);
3944 if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
3945 dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
3946 qp->ibqp.qp_num, __func__, __LINE__);
3947 hfi1_stop_tid_reap_timer(qp);
3948 /*
3949 * Go through the entire ack queue and clear any outstanding
3950 * HW flow and RcvArray resources.
3951 */
3952 hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
3953 for (i = 0; i < rvt_max_atomic(rdi); i++) {
3954 struct tid_rdma_request *req =
3955 ack_to_tid_req(&qp->s_ack_queue[i]);
3956
3957 hfi1_kern_exp_rcv_clear_all(req);
3958 }
3959 spin_unlock(&qp->s_lock);
3960 if (qp->ibqp.event_handler) {
3961 struct ib_event ev;
3962
3963 ev.device = qp->ibqp.device;
3964 ev.element.qp = &qp->ibqp;
3965 ev.event = IB_EVENT_QP_FATAL;
3966 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
3967 }
3968 rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR);
3969 goto unlock_r_lock;
3970 }
3971 spin_unlock(&qp->s_lock);
3972unlock_r_lock:
3973 spin_unlock_irqrestore(&qp->r_lock, flags);
3974}
72a0ea99
KW
3975
3976void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
3977{
3978 /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requester side) */
3979
3980 /*
3981 * 1. Find matching SWQE
3982 * 2. Check that TIDENTRY array has enough space for a complete
3983 * segment. If not, put QP in error state.
3984 * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
3985 * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
3986 * 5. Set qp->s_state
3987 * 6. Kick the send engine (hfi1_schedule_send())
3988 */
3989 struct ib_other_headers *ohdr = packet->ohdr;
3990 struct rvt_qp *qp = packet->qp;
3991 struct hfi1_qp_priv *qpriv = qp->priv;
3992 struct hfi1_ctxtdata *rcd = packet->rcd;
3993 struct rvt_swqe *wqe;
3994 struct tid_rdma_request *req;
3995 struct tid_rdma_flow *flow;
3996 enum ib_wc_status status;
3997 u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
3998 bool is_fecn;
3999 unsigned long flags;
4000
4001 is_fecn = process_ecn(qp, packet);
4002 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4003 aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
4004 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
4005
4006 spin_lock_irqsave(&qp->s_lock, flags);
4007
4008 /* Ignore invalid responses */
4009 if (cmp_psn(psn, qp->s_next_psn) >= 0)
4010 goto ack_done;
4011
4012 /* Ignore duplicate responses. */
4013 if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0))
4014 goto ack_done;
4015
4016 if (unlikely(qp->s_acked == qp->s_tail))
4017 goto ack_done;
4018
4019 /*
4020 * If we are waiting for a particular packet sequence number
4021 * due to a request being resent, check for it. Otherwise,
4022 * ensure that we haven't missed anything.
4023 */
4024 if (qp->r_flags & RVT_R_RDMAR_SEQ) {
4025 if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
4026 goto ack_done;
4027 qp->r_flags &= ~RVT_R_RDMAR_SEQ;
4028 }
4029
4030 wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
4031 if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
4032 goto ack_op_err;
4033
4034 req = wqe_to_tid_req(wqe);
4035 /*
4036 * If we've lost ACKs and our acked_tail pointer is too far
4037 * behind, don't overwrite segments. Just drop the packet and
4038 * let the reliability protocol take care of it.
4039 */
4040 if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
4041 goto ack_done;
4042
4043 /*
4044 * The call to do_rc_ack() should be last in the chain of
4045 * packet checks because it will end up updating the QP state.
4046 * Therefore, anything that would prevent the packet from
4047 * being accepted as a successful response should be prior
4048 * to it.
4049 */
4050 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
4051 goto ack_done;
4052
4053 flow = &req->flows[req->setup_head];
4054 flow->pkt = 0;
4055 flow->tid_idx = 0;
4056 flow->tid_offset = 0;
4057 flow->sent = 0;
4058 flow->resync_npkts = 0;
4059 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
4060 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
4061 TID_RDMA_DESTQP_FLOW_MASK;
4062 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
4063 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
4064 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
4065 flow->flow_state.resp_ib_psn = psn;
4066 flow->length = min_t(u32, req->seg_len,
4067 (wqe->length - (req->comp_seg * req->seg_len)));
4068
4069 flow->npkts = rvt_div_round_up_mtu(qp, flow->length);
4070 flow->flow_state.lpsn = flow->flow_state.spsn +
4071 flow->npkts - 1;
4072 /* payload length = packet length - (header length + ICRC length) */
4073 pktlen = packet->tlen - (packet->hlen + 4);
4074 if (pktlen > sizeof(flow->tid_entry)) {
4075 status = IB_WC_LOC_LEN_ERR;
4076 goto ack_err;
4077 }
4078 memcpy(flow->tid_entry, packet->ebuf, pktlen);
4079 flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
4080
4081 req->comp_seg++;
4082 /*
4083 * Walk the TID_ENTRY list to make sure we have enough space for a
4084 * complete segment.
4085 */
4086 for (i = 0; i < flow->tidcnt; i++) {
4087 if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
4088 status = IB_WC_LOC_LEN_ERR;
4089 goto ack_err;
4090 }
4091 tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
4092 }
4093 if (tidlen * PAGE_SIZE < flow->length) {
4094 status = IB_WC_LOC_LEN_ERR;
4095 goto ack_err;
4096 }
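/*
 * Worked example (assuming 4 KB pages): a 256 KB segment needs the
 * advertised TID entries to cover at least 64 pages in total; if the
 * LEN fields sum to fewer pages, the response cannot hold a complete
 * segment and the QP is moved to an error state.
 */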
4097
4098 /*
4099 * If this is the first response for this request, set the initial
4100 * flow index to the current flow.
4101 */
4102 if (!cmp_psn(psn, wqe->psn)) {
4103 req->r_last_acked = mask_psn(wqe->psn - 1);
4104 /* Set acked flow index to head index */
4105 req->acked_tail = req->setup_head;
4106 }
4107
4108 /* advance circular buffer head */
4109 req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
4110 req->state = TID_REQUEST_ACTIVE;
4111
4112 /*
4113 * If all responses for this TID RDMA WRITE request have been received,
4114 * advance the pointer to the next one.
4115 * Since TID RDMA requests could be mixed in with regular IB requests,
4116 * they might not appear sequentially in the queue. Therefore, the
4117 * next request needs to be "found".
4118 */
4119 if (qpriv->s_tid_cur != qpriv->s_tid_head &&
4120 req->comp_seg == req->total_segs) {
4121 for (i = qpriv->s_tid_cur + 1; ; i++) {
4122 if (i == qp->s_size)
4123 i = 0;
4124 wqe = rvt_get_swqe_ptr(qp, i);
4125 if (i == qpriv->s_tid_head)
4126 break;
4127 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
4128 break;
4129 }
4130 qpriv->s_tid_cur = i;
4131 }
4132 qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
4133
572f0c33 4134 hfi1_schedule_tid_send(qp);
72a0ea99
KW
4135 goto ack_done;
4136
4137ack_op_err:
4138 status = IB_WC_LOC_QP_OP_ERR;
4139ack_err:
4140 rvt_error_qp(qp, status);
4141ack_done:
4142 spin_unlock_irqrestore(&qp->s_lock, flags);
4143 if (is_fecn)
4144 hfi1_send_rc_ack(packet, is_fecn);
4145}
539e1908
KW
4146
4147bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
4148 struct ib_other_headers *ohdr,
4149 u32 *bth1, u32 *bth2, u32 *len)
4150{
4151 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4152 struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
4153 struct tid_rdma_params *remote;
4154 struct rvt_qp *qp = req->qp;
4155 struct hfi1_qp_priv *qpriv = qp->priv;
4156 u32 tidentry = flow->tid_entry[flow->tid_idx];
4157 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
4158 struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
4159 u32 next_offset, om = KDETH_OM_LARGE;
4160 bool last_pkt;
4161
4162 if (!tidlen) {
4163 hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR);
4164 rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR);
4165 }
4166
4167 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
4168 flow->sent += *len;
4169 next_offset = flow->tid_offset + *len;
4170 last_pkt = (flow->tid_idx == (flow->tidcnt - 1) &&
4171 next_offset >= tidlen) || (flow->sent >= flow->length);
4172
4173 rcu_read_lock();
4174 remote = rcu_dereference(qpriv->tid_rdma.remote);
4175 KDETH_RESET(wd->kdeth0, KVER, 0x1);
4176 KDETH_SET(wd->kdeth0, SH, !last_pkt);
4177 KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
4178 KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
4179 KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
4180 KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
4181 KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
4182 KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
4183 wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
4184 rcu_read_unlock();
4185
4186 *bth1 = flow->tid_qpn;
4187 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
4188 HFI1_KDETH_BTH_SEQ_MASK) |
4189 (flow->flow_state.generation <<
4190 HFI1_KDETH_BTH_SEQ_SHIFT));
4191 if (last_pkt) {
4192 /* PSNs are zero-based, so +1 to count number of packets */
4193 if (flow->flow_state.lpsn + 1 +
4194 rvt_div_round_up_mtu(qp, req->seg_len) >
4195 MAX_TID_FLOW_PSN)
4196 req->state = TID_REQUEST_SYNC;
4197 *bth2 |= IB_BTH_REQ_ACK;
4198 }
4199
4200 if (next_offset >= tidlen) {
4201 flow->tid_offset = 0;
4202 flow->tid_idx++;
4203 } else {
4204 flow->tid_offset = next_offset;
4205 }
4206 return last_pkt;
4207}
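
/*
 * For illustration: with a 4 KB PMTU, a segment built from two TID
 * entries of 32 KB and 16 KB goes out as 12 TID RDMA WRITE DATA
 * packets. The KDETH SH (header suppression) bit stays set on every
 * packet except the last one of the segment, and only that last packet
 * requests an ACK via IB_BTH_REQ_ACK, exactly as constructed above.
 */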
d72fe7d5
KW
4208
4209void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
4210{
4211 struct rvt_qp *qp = packet->qp;
4212 struct hfi1_qp_priv *priv = qp->priv;
4213 struct hfi1_ctxtdata *rcd = priv->rcd;
4214 struct ib_other_headers *ohdr = packet->ohdr;
4215 struct rvt_ack_entry *e;
4216 struct tid_rdma_request *req;
4217 struct tid_rdma_flow *flow;
4218 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
4219 unsigned long flags;
4220 u32 psn, next;
4221 u8 opcode;
4222
4223 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4224 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
4225
4226 /*
4227 * All error handling should be done by now. If we are here, the packet
4228 * is either good or has been accepted by the error handler.
4229 */
4230 spin_lock_irqsave(&qp->s_lock, flags);
4231 e = &qp->s_ack_queue[priv->r_tid_tail];
4232 req = ack_to_tid_req(e);
4233 flow = &req->flows[req->clear_tail];
4234 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
4235 if (cmp_psn(psn, flow->flow_state.r_next_psn))
4236 goto send_nak;
4237 flow->flow_state.r_next_psn++;
4238 goto exit;
4239 }
4240 flow->flow_state.r_next_psn = mask_psn(psn + 1);
4241 hfi1_kern_exp_rcv_clear(req);
4242 priv->alloc_w_segs--;
4243 rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
4244 req->comp_seg++;
4245 priv->s_nak_state = 0;
4246
4247 /*
4248 * Release the flow if one of the following conditions has been met:
4249 * - The request has reached a sync point AND all outstanding
4250 * segments have been completed, or
4251 * - The entire request is complete and there are no more requests
4252 * (of any kind) in the queue.
4253 */
4254 if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
4255 priv->r_tid_ack = priv->r_tid_tail;
4256
4257 if (opcode == TID_OP(WRITE_DATA_LAST)) {
4258 for (next = priv->r_tid_tail + 1; ; next++) {
4259 if (next > rvt_size_atomic(&dev->rdi))
4260 next = 0;
4261 if (next == priv->r_tid_head)
4262 break;
4263 e = &qp->s_ack_queue[next];
4264 if (e->opcode == TID_OP(WRITE_REQ))
4265 break;
4266 }
4267 priv->r_tid_tail = next;
4268 if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi))
4269 qp->s_acked_ack_queue = 0;
4270 }
4271
4272 hfi1_tid_write_alloc_resources(qp, true);
4273
4274 /*
4275 * If we need to generate more responses, schedule the
4276 * send engine.
4277 */
4278 if (req->cur_seg < req->total_segs ||
4279 qp->s_tail_ack_queue != qp->r_head_ack_queue) {
4280 qp->s_flags |= RVT_S_RESP_PENDING;
4281 hfi1_schedule_send(qp);
4282 }
4283
4284 priv->pending_tid_w_segs--;
4285 if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
4286 if (priv->pending_tid_w_segs)
4287 hfi1_mod_tid_reap_timer(req->qp);
4288 else
4289 hfi1_stop_tid_reap_timer(req->qp);
4290 }
4291
4292done:
4293 priv->s_flags |= RVT_S_ACK_PENDING;
572f0c33 4294 hfi1_schedule_tid_send(qp);
d72fe7d5
KW
4295exit:
4296 priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
4297 spin_unlock_irqrestore(&qp->s_lock, flags);
4298 return;
4299
4300send_nak:
4301 if (!priv->s_nak_state) {
4302 priv->s_nak_state = IB_NAK_PSN_ERROR;
4303 priv->s_nak_psn = flow->flow_state.r_next_psn;
4304 priv->s_flags |= RVT_S_ACK_PENDING;
4305 if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
4306 priv->r_tid_ack = priv->r_tid_tail;
572f0c33 4307 hfi1_schedule_tid_send(qp);
d72fe7d5
KW
4308 }
4309 goto done;
4310}
0f75e325
KW
4311
4312static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
4313{
4314 return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) ==
4315 HFI1_KDETH_BTH_SEQ_MASK);
4316}
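/*
 * Example: with the driver's 11-bit KDETH sequence field (i.e.
 * HFI1_KDETH_BTH_SEQ_MASK == 0x7ff), a PSN whose low 11 bits are all
 * ones, such as 0x17ff, is the reserved last PSN of its generation and
 * is therefore treated as a RESYNC marker rather than a data PSN.
 */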
4317
4318u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
4319 struct ib_other_headers *ohdr, u16 iflow,
4320 u32 *bth1, u32 *bth2)
4321{
4322 struct hfi1_qp_priv *qpriv = qp->priv;
4323 struct tid_flow_state *fs = &qpriv->flow_state;
4324 struct tid_rdma_request *req = ack_to_tid_req(e);
4325 struct tid_rdma_flow *flow = &req->flows[iflow];
4326 struct tid_rdma_params *remote;
4327
4328 rcu_read_lock();
4329 remote = rcu_dereference(qpriv->tid_rdma.remote);
4330 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4331 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4332 *bth1 = remote->qp;
4333 rcu_read_unlock();
4334
4335 if (qpriv->resync) {
4336 *bth2 = mask_psn((fs->generation <<
4337 HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
4338 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4339 } else if (qpriv->s_nak_state) {
4340 *bth2 = mask_psn(qpriv->s_nak_psn);
4341 ohdr->u.tid_rdma.ack.aeth =
4342 cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
4343 (qpriv->s_nak_state <<
4344 IB_AETH_CREDIT_SHIFT));
4345 } else {
4346 *bth2 = full_flow_psn(flow, flow->flow_state.lpsn);
4347 ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4348 }
4349 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
4350 ohdr->u.tid_rdma.ack.tid_flow_qp =
4351 cpu_to_be32(qpriv->tid_rdma.local.qp |
4352 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
4353 TID_RDMA_DESTQP_FLOW_SHIFT) |
4354 qpriv->rcd->ctxt);
4355
4356 ohdr->u.tid_rdma.ack.tid_flow_psn = 0;
4357 ohdr->u.tid_rdma.ack.verbs_psn =
4358 cpu_to_be32(flow->flow_state.resp_ib_psn);
4359
4360 if (qpriv->resync) {
4361 /*
4362 * If the PSN before the currently expected KDETH PSN is the
4363 * RESYNC PSN, then we never received a good TID RDMA WRITE
4364 * DATA packet after a previous RESYNC.
4365 * In this case, the next expected KDETH PSN stays the same.
4366 */
4367 if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) {
4368 ohdr->u.tid_rdma.ack.tid_flow_psn =
4369 cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4370 } else {
4371 /*
4372 * Because the KDETH PSNs jump during a RESYNC, it's
4373 * not possible to infer (or compute) the previous value
4374 * of r_next_psn_kdeth in the case of back-to-back
4375 * RESYNC packets. Therefore, we save it.
4376 */
4377 qpriv->r_next_psn_kdeth_save =
4378 qpriv->r_next_psn_kdeth - 1;
4379 ohdr->u.tid_rdma.ack.tid_flow_psn =
4380 cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4381 qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1);
4382 }
4383 qpriv->resync = false;
4384 }
4385
4386 return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
4387}
9e93e967
KW
4388
4389void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
4390{
4391 struct ib_other_headers *ohdr = packet->ohdr;
4392 struct rvt_qp *qp = packet->qp;
4393 struct hfi1_qp_priv *qpriv = qp->priv;
4394 struct rvt_swqe *wqe;
4395 struct tid_rdma_request *req;
4396 struct tid_rdma_flow *flow;
4397 u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
4398 bool is_fecn;
4399 unsigned long flags;
4400 u16 fidx;
4401
4402 is_fecn = process_ecn(qp, packet);
4403 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4404 aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
4405 req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
4406 resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));
4407
4408 spin_lock_irqsave(&qp->s_lock, flags);
4409
4410 /* If we are waiting for an ACK to RESYNC, drop any other packets */
4411 if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
4412 cmp_psn(psn, qpriv->s_resync_psn))
4413 goto ack_op_err;
4414
4415 ack_psn = req_psn;
4416 if (hfi1_tid_rdma_is_resync_psn(psn))
4417 ack_kpsn = resync_psn;
4418 else
4419 ack_kpsn = psn;
4420 if (aeth >> 29) {
4421 ack_psn--;
4422 ack_kpsn--;
4423 }
4424
4425 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4426
4427 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
4428 goto ack_op_err;
4429
4430 req = wqe_to_tid_req(wqe);
4431 flow = &req->flows[req->acked_tail];
4432
4433 /* Drop stale ACK/NAK */
4434 if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0)
4435 goto ack_op_err;
4436
4437 while (cmp_psn(ack_kpsn,
4438 full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 &&
4439 req->ack_seg < req->cur_seg) {
4440 req->ack_seg++;
4441 /* advance acked segment pointer */
4442 req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
4443 req->r_last_acked = flow->flow_state.resp_ib_psn;
4444 if (req->ack_seg == req->total_segs) {
4445 req->state = TID_REQUEST_COMPLETE;
4446 wqe = do_rc_completion(qp, wqe,
4447 to_iport(qp->ibqp.device,
4448 qp->port_num));
4449 atomic_dec(&qpriv->n_tid_requests);
4450 if (qp->s_acked == qp->s_tail)
4451 break;
4452 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
4453 break;
4454 req = wqe_to_tid_req(wqe);
4455 }
4456 flow = &req->flows[req->acked_tail];
4457 }
4458
4459 switch (aeth >> 29) {
4460 case 0: /* ACK */
4461 if (qpriv->s_flags & RVT_S_WAIT_ACK)
4462 qpriv->s_flags &= ~RVT_S_WAIT_ACK;
4463 if (!hfi1_tid_rdma_is_resync_psn(psn)) {
829eaee5
KW
4464 /* Check if there is any pending TID ACK */
4465 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
4466 req->ack_seg < req->cur_seg)
4467 hfi1_mod_tid_retry_timer(qp);
4468 else
4469 hfi1_stop_tid_retry_timer(qp);
9e93e967
KW
4470 hfi1_schedule_send(qp);
4471 } else {
4472 u32 spsn, fpsn, last_acked, generation;
4473 struct tid_rdma_request *rptr;
4474
829eaee5
KW
4475 /* ACK(RESYNC) */
4476 hfi1_stop_tid_retry_timer(qp);
9e93e967
KW
4477 /* Allow new requests (see hfi1_make_tid_rdma_pkt) */
4478 qp->s_flags &= ~HFI1_S_WAIT_HALT;
4479 /*
4480 * Clear the RVT_S_SEND_ONE flag in case the TID RDMA
4481 * ACK is received after the TID retry timer has fired
4482 * again. In this case, do not send any more TID
4483 * RESYNC requests or wait for any more TID ACK packets.
4484 */
4485 qpriv->s_flags &= ~RVT_S_SEND_ONE;
4486 hfi1_schedule_send(qp);
4487
4488 if ((qp->s_acked == qpriv->s_tid_tail &&
4489 req->ack_seg == req->total_segs) ||
4490 qp->s_acked == qp->s_tail) {
4491 qpriv->s_state = TID_OP(WRITE_DATA_LAST);
4492 goto done;
4493 }
4494
4495 if (req->ack_seg == req->comp_seg) {
4496 qpriv->s_state = TID_OP(WRITE_DATA);
4497 goto done;
4498 }
4499
4500 /*
4501 * The PSN to start with is the next PSN after the
4502 * RESYNC PSN.
4503 */
4504 psn = mask_psn(psn + 1);
4505 generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
4506 spsn = 0;
4507
4508 /*
4509 * Update to the correct WQE when we get an ACK(RESYNC)
4510 * in the middle of a request.
4511 */
4512 if (delta_psn(ack_psn, wqe->lpsn))
4513 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4514 req = wqe_to_tid_req(wqe);
4515 flow = &req->flows[req->acked_tail];
4516 /*
4517 * RESYNC re-numbers the PSN ranges of all remaining
4518 * segments. Also, PSNs start from 0 in the middle of a
4519 * segment, so the first segment's size is less than the
4520 * default number of packets. flow->resync_npkts is used
4521 * to track the number of packets from the start of the
4522 * real segment to the point of the 0 PSN after the RESYNC
4523 * in order to later correctly rewind the SGE.
4524 */
4525 fpsn = full_flow_psn(flow, flow->flow_state.spsn);
4526 req->r_ack_psn = psn;
4527 flow->resync_npkts +=
4528 delta_psn(mask_psn(resync_psn + 1), fpsn);
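/*
 * Example: if this segment originally started at sequence 100 of the
 * old generation and the RESYNC PSN is that generation's last sequence,
 * resync_npkts grows by MAX_TID_FLOW_PSN - 100, i.e. how far past the
 * true start of the segment the renumbered PSN 0 lies, so the SGE can
 * later be rewound by the right amount.
 */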
4529 /*
4530 * Renumber all packet sequence number ranges
4531 * based on the new generation.
4532 */
4533 last_acked = qp->s_acked;
4534 rptr = req;
4535 while (1) {
4536 /* start from last acked segment */
4537 for (fidx = rptr->acked_tail;
4538 CIRC_CNT(rptr->setup_head, fidx,
4539 MAX_FLOWS);
4540 fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
4541 u32 lpsn;
4542 u32 gen;
4543
4544 flow = &rptr->flows[fidx];
4545 gen = flow->flow_state.generation;
4546 if (WARN_ON(gen == generation &&
4547 flow->flow_state.spsn !=
4548 spsn))
4549 continue;
4550 lpsn = flow->flow_state.lpsn;
4551 lpsn = full_flow_psn(flow, lpsn);
4552 flow->npkts =
4553 delta_psn(lpsn,
4554 mask_psn(resync_psn)
4555 );
4556 flow->flow_state.generation =
4557 generation;
4558 flow->flow_state.spsn = spsn;
4559 flow->flow_state.lpsn =
4560 flow->flow_state.spsn +
4561 flow->npkts - 1;
4562 flow->pkt = 0;
4563 spsn += flow->npkts;
4564 resync_psn += flow->npkts;
4565 }
4566 if (++last_acked == qpriv->s_tid_cur + 1)
4567 break;
4568 if (last_acked == qp->s_size)
4569 last_acked = 0;
4570 wqe = rvt_get_swqe_ptr(qp, last_acked);
4571 rptr = wqe_to_tid_req(wqe);
4572 }
4573 req->cur_seg = req->ack_seg;
4574 qpriv->s_tid_tail = qp->s_acked;
4575 qpriv->s_state = TID_OP(WRITE_REQ);
572f0c33 4576 hfi1_schedule_tid_send(qp);
9e93e967
KW
4577 }
4578done:
4579 qpriv->s_retry = qp->s_retry_cnt;
4580 break;
4581
4582 case 3: /* NAK */
829eaee5 4583 hfi1_stop_tid_retry_timer(qp);
9e93e967
KW
4584 switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
4585 IB_AETH_CREDIT_MASK) {
4586 case 0: /* PSN sequence error */
4587 flow = &req->flows[req->acked_tail];
4588 fspsn = full_flow_psn(flow, flow->flow_state.spsn);
4589 req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4590 req->cur_seg = req->ack_seg;
4591 qpriv->s_tid_tail = qp->s_acked;
4592 qpriv->s_state = TID_OP(WRITE_REQ);
4593 qpriv->s_retry = qp->s_retry_cnt;
572f0c33 4594 hfi1_schedule_tid_send(qp);
9e93e967
KW
4595 break;
4596
4597 default:
4598 break;
4599 }
4600 break;
4601
4602 default:
4603 break;
4604 }
4605
4606ack_op_err:
4607 spin_unlock_irqrestore(&qp->s_lock, flags);
4608}
829eaee5
KW
4609
4610void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
4611{
4612 struct hfi1_qp_priv *priv = qp->priv;
4613 struct ib_qp *ibqp = &qp->ibqp;
4614 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
4615
4616 lockdep_assert_held(&qp->s_lock);
4617 if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) {
4618 priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
4619 priv->s_tid_retry_timer.expires = jiffies +
4620 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
4621 add_timer(&priv->s_tid_retry_timer);
4622 }
4623}
4624
4625static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
4626{
4627 struct hfi1_qp_priv *priv = qp->priv;
4628 struct ib_qp *ibqp = &qp->ibqp;
4629 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
4630
4631 lockdep_assert_held(&qp->s_lock);
4632 priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
4633 mod_timer(&priv->s_tid_retry_timer, jiffies +
4634 priv->tid_retry_timeout_jiffies + rdi->busy_jiffies);
4635}
4636
4637static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
4638{
4639 struct hfi1_qp_priv *priv = qp->priv;
4640 int rval = 0;
4641
4642 lockdep_assert_held(&qp->s_lock);
4643 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4644 rval = del_timer(&priv->s_tid_retry_timer);
4645 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4646 }
4647 return rval;
4648}
4649
4650void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
4651{
4652 struct hfi1_qp_priv *priv = qp->priv;
4653
4654 del_timer_sync(&priv->s_tid_retry_timer);
4655 priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4656}
4657
4658static void hfi1_tid_retry_timeout(struct timer_list *t)
4659{
4660 struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
4661 struct rvt_qp *qp = priv->owner;
4662 struct rvt_swqe *wqe;
4663 unsigned long flags;
4664
4665 spin_lock_irqsave(&qp->r_lock, flags);
4666 spin_lock(&qp->s_lock);
4667 if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4668 hfi1_stop_tid_retry_timer(qp);
4669 if (!priv->s_retry) {
4670 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4671 hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
4672 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
4673 } else {
4674 priv->s_flags &= ~RVT_S_WAIT_ACK;
4675 /* Only send one packet (the RESYNC) */
4676 priv->s_flags |= RVT_S_SEND_ONE;
4677 /*
4678 * No additional request shall be made by this QP until
4679 * the RESYNC has completed.
4680 */
4681 qp->s_flags |= HFI1_S_WAIT_HALT;
4682 priv->s_state = TID_OP(RESYNC);
4683 priv->s_retry--;
572f0c33 4684 hfi1_schedule_tid_send(qp);
829eaee5
KW
4685 }
4686 }
4687 spin_unlock(&qp->s_lock);
4688 spin_unlock_irqrestore(&qp->r_lock, flags);
4689}
6e391c6a
KW
4690
4691u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
4692 struct ib_other_headers *ohdr, u32 *bth1,
4693 u32 *bth2, u16 fidx)
4694{
4695 struct hfi1_qp_priv *qpriv = qp->priv;
4696 struct tid_rdma_params *remote;
4697 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4698 struct tid_rdma_flow *flow = &req->flows[fidx];
4699 u32 generation;
4700
4701 rcu_read_lock();
4702 remote = rcu_dereference(qpriv->tid_rdma.remote);
4703 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4704 ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4705 *bth1 = remote->qp;
4706 rcu_read_unlock();
4707
4708 generation = kern_flow_generation_next(flow->flow_state.generation);
4709 *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
4710 qpriv->s_resync_psn = *bth2;
4711 *bth2 |= IB_BTH_REQ_ACK;
4712 KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
4713
4714 return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
4715}
7cf0ad67
KW
4716
4717void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
4718{
4719 struct ib_other_headers *ohdr = packet->ohdr;
4720 struct rvt_qp *qp = packet->qp;
4721 struct hfi1_qp_priv *qpriv = qp->priv;
4722 struct hfi1_ctxtdata *rcd = qpriv->rcd;
4723 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
4724 struct rvt_ack_entry *e;
4725 struct tid_rdma_request *req;
4726 struct tid_rdma_flow *flow;
4727 struct tid_flow_state *fs = &qpriv->flow_state;
4728 u32 psn, generation, idx, gen_next;
4729 bool is_fecn;
4730 unsigned long flags;
4731
4732 is_fecn = process_ecn(qp, packet);
4733 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4734
4735 generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
4736 spin_lock_irqsave(&qp->s_lock, flags);
4737
4738 gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
4739 generation : kern_flow_generation_next(fs->generation);
4740 /*
4741 * The RESYNC packet contains the "next" generation and can only
4742 * come from the current or the previous generation
4743 */
4744 if (generation != mask_generation(gen_next - 1) &&
4745 generation != gen_next)
4746 goto bail;
4747 /* Already processing a resync */
4748 if (qpriv->resync)
4749 goto bail;
4750
4751 spin_lock(&rcd->exp_lock);
4752 if (fs->index >= RXE_NUM_TID_FLOWS) {
4753 /*
4754 * If we don't have a flow, save the generation so it can be
4755 * applied when a new flow is allocated
4756 */
4757 fs->generation = generation;
4758 } else {
4759 /* Reprogram the QP flow with new generation */
4760 rcd->flows[fs->index].generation = generation;
4761 fs->generation = kern_setup_hw_flow(rcd, fs->index);
4762 }
4763 fs->psn = 0;
4764 /*
4765 * Disable SW PSN checking since a RESYNC is equivalent to a
4766 * sync point and the flow has been/will be reprogrammed
4767 */
4768 qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
4769
4770 /*
4771 * Reset all TID flow information with the new generation.
4772 * This is done for all requests and segments after the
4773 * last received segment
4774 */
4775 for (idx = qpriv->r_tid_tail; ; idx++) {
4776 u16 flow_idx;
4777
4778 if (idx > rvt_size_atomic(&dev->rdi))
4779 idx = 0;
4780 e = &qp->s_ack_queue[idx];
4781 if (e->opcode == TID_OP(WRITE_REQ)) {
4782 req = ack_to_tid_req(e);
4783
4784 /* start from last unacked segment */
4785 for (flow_idx = req->clear_tail;
4786 CIRC_CNT(req->setup_head, flow_idx,
4787 MAX_FLOWS);
4788 flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
4789 u32 lpsn;
4790 u32 next;
4791
4792 flow = &req->flows[flow_idx];
4793 lpsn = full_flow_psn(flow,
4794 flow->flow_state.lpsn);
4795 next = flow->flow_state.r_next_psn;
4796 flow->npkts = delta_psn(lpsn, next - 1);
4797 flow->flow_state.generation = fs->generation;
4798 flow->flow_state.spsn = fs->psn;
4799 flow->flow_state.lpsn =
4800 flow->flow_state.spsn + flow->npkts - 1;
4801 flow->flow_state.r_next_psn =
4802 full_flow_psn(flow,
4803 flow->flow_state.spsn);
4804 fs->psn += flow->npkts;
4805 }
4806 }
4807 if (idx == qp->s_tail_ack_queue)
4808 break;
4809 }
4810
4811 spin_unlock(&rcd->exp_lock);
4812 qpriv->resync = true;
4813 /* RESYNC request always gets a TID RDMA ACK. */
4814 qpriv->s_nak_state = 0;
4815 qpriv->s_flags |= RVT_S_ACK_PENDING;
572f0c33 4816 hfi1_schedule_tid_send(qp);
7cf0ad67
KW
4817bail:
4818 spin_unlock_irqrestore(&qp->s_lock, flags);
4819}
70dcb2e3
KW
4820
4821/*
4822 * Call this function when the last TID RDMA WRITE DATA packet for a request
4823 * is built.
4824 */
4825static void update_tid_tail(struct rvt_qp *qp)
4826 __must_hold(&qp->s_lock)
4827{
4828 struct hfi1_qp_priv *priv = qp->priv;
4829 u32 i;
4830 struct rvt_swqe *wqe;
4831
4832 lockdep_assert_held(&qp->s_lock);
4833 /* Can't move beyond s_tid_cur */
4834 if (priv->s_tid_tail == priv->s_tid_cur)
4835 return;
4836 for (i = priv->s_tid_tail + 1; ; i++) {
4837 if (i == qp->s_size)
4838 i = 0;
4839
4840 if (i == priv->s_tid_cur)
4841 break;
4842 wqe = rvt_get_swqe_ptr(qp, i);
4843 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
4844 break;
4845 }
4846 priv->s_tid_tail = i;
4847 priv->s_state = TID_OP(WRITE_RESP);
4848}
4849
4850int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
4851 __must_hold(&qp->s_lock)
4852{
4853 struct hfi1_qp_priv *priv = qp->priv;
4854 struct rvt_swqe *wqe;
4855 u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
4856 struct ib_other_headers *ohdr;
4857 struct rvt_sge_state *ss = &qp->s_sge;
4858 struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
4859 struct tid_rdma_request *req = ack_to_tid_req(e);
4860 bool last = false;
4861 u8 opcode = TID_OP(WRITE_DATA);
4862
4863 lockdep_assert_held(&qp->s_lock);
4864 /*
4865 * Prioritize the sending of the requests and responses over the
4866 * sending of the TID RDMA data packets.
4867 */
4868 if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
4869 atomic_read(&priv->n_requests) &&
4870 !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
4871 HFI1_S_ANY_WAIT_IO))) ||
4872 (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
4873 !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
4874 struct iowait_work *iowork;
4875
4876 iowork = iowait_get_ib_work(&priv->s_iowait);
4877 ps->s_txreq = get_waiting_verbs_txreq(iowork);
4878 if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
4879 priv->s_flags |= HFI1_S_TID_BUSY_SET;
4880 return 1;
4881 }
4882 }
4883
4884 ps->s_txreq = get_txreq(ps->dev, qp);
4885 if (!ps->s_txreq)
4886 goto bail_no_tx;
4887
4888 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
4889
24c5bfea
KW
4890 if ((priv->s_flags & RVT_S_ACK_PENDING) &&
4891 make_tid_rdma_ack(qp, ohdr, ps))
4892 return 1;
4893
70dcb2e3
KW
4894 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
4895 if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
4896 goto bail;
4897 /* We are in the error state, flush the work request. */
4898 if (qp->s_last == READ_ONCE(qp->s_head))
4899 goto bail;
4900 /* If DMAs are in progress, we can't flush immediately. */
4901 if (iowait_sdma_pending(&priv->s_iowait)) {
4902 qp->s_flags |= RVT_S_WAIT_DMA;
4903 goto bail;
4904 }
4905 clear_ahg(qp);
4906 wqe = rvt_get_swqe_ptr(qp, qp->s_last);
4907 hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
4908 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
4909 /* will get called again */
4910 goto done_free_tx;
4911 }
4912
4913 if (priv->s_flags & RVT_S_WAIT_ACK)
4914 goto bail;
4915
4916 /* Check whether there is anything to do. */
4917 if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
4918 goto bail;
4919 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
4920 req = wqe_to_tid_req(wqe);
4921 switch (priv->s_state) {
4922 case TID_OP(WRITE_REQ):
4923 case TID_OP(WRITE_RESP):
4924 priv->tid_ss.sge = wqe->sg_list[0];
4925 priv->tid_ss.sg_list = wqe->sg_list + 1;
4926 priv->tid_ss.num_sge = wqe->wr.num_sge;
4927 priv->tid_ss.total_len = wqe->length;
4928
4929 if (priv->s_state == TID_OP(WRITE_REQ))
4930 hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
4931 priv->s_state = TID_OP(WRITE_DATA);
4932 /* fall through */
4933
4934 case TID_OP(WRITE_DATA):
4935 /*
4936 * 1. Check whether a TID RDMA WRITE RESP is available.
4937 * 2. If no:
4938 * 2.1 If there are more segments and no TID RDMA WRITE RESP,
4939 * set HFI1_S_WAIT_TID_RESP.
4940 * 2.2 Return indicating no progress made.
4941 * 3. If yes:
4942 * 3.1 Build the TID RDMA WRITE DATA packet.
4943 * 3.2 If it is the last packet in the segment:
4944 * 3.2.1 Change the KDETH header bits.
4945 * 3.2.2 Advance the RESP pointers.
4946 * 3.3 Return indicating progress made.
4947 */
4948 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
4949 req = wqe_to_tid_req(wqe);
4950 len = wqe->length;
4951
4952 if (!req->comp_seg || req->cur_seg == req->comp_seg)
4953 goto bail;
4954
4955 last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
4956 &len);
4957
4958 if (last) {
4959 /* move pointer to next flow */
4960 req->clear_tail = CIRC_NEXT(req->clear_tail,
4961 MAX_FLOWS);
4962 if (++req->cur_seg < req->total_segs) {
4963 if (!CIRC_CNT(req->setup_head, req->clear_tail,
4964 MAX_FLOWS))
4965 qp->s_flags |= HFI1_S_WAIT_TID_RESP;
4966 } else {
4967 priv->s_state = TID_OP(WRITE_DATA_LAST);
4968 opcode = TID_OP(WRITE_DATA_LAST);
4969
4970 /* Advance the s_tid_tail now */
4971 update_tid_tail(qp);
4972 }
4973 }
4974 hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
4975 ss = &priv->tid_ss;
4976 break;
4977
4978 case TID_OP(RESYNC):
4979 /* Use generation from the most recently received response */
4980 wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
4981 req = wqe_to_tid_req(wqe);
4983 /* If there are no responses for this WQE, look at the previous one */
4983 if (!req->comp_seg) {
4984 wqe = rvt_get_swqe_ptr(qp,
4985 (!priv->s_tid_cur ? qp->s_size :
4986 priv->s_tid_cur) - 1);
4987 req = wqe_to_tid_req(wqe);
4988 }
4989 hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
4990 &bth2,
4991 CIRC_PREV(req->setup_head,
4992 MAX_FLOWS));
4993 ss = NULL;
4994 len = 0;
4995 opcode = TID_OP(RESYNC);
4996 break;
4997
4998 default:
4999 goto bail;
5000 }
5001 if (priv->s_flags & RVT_S_SEND_ONE) {
5002 priv->s_flags &= ~RVT_S_SEND_ONE;
5003 priv->s_flags |= RVT_S_WAIT_ACK;
5004 bth2 |= IB_BTH_REQ_ACK;
5005 }
5006 qp->s_len -= len;
5007 ps->s_txreq->hdr_dwords = hwords;
5008 ps->s_txreq->sde = priv->s_sde;
5009 ps->s_txreq->ss = ss;
5010 ps->s_txreq->s_cur_size = len;
5011 hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
5012 middle, ps);
5013 return 1;
5014done_free_tx:
5015 hfi1_put_txreq(ps->s_txreq);
5016 ps->s_txreq = NULL;
5017 return 1;
5018
5019bail:
5020 hfi1_put_txreq(ps->s_txreq);
5021bail_no_tx:
5022 ps->s_txreq = NULL;
5023 priv->s_flags &= ~RVT_S_BUSY;
5024 /*
5025 * If we didn't get a txreq, the QP will be woken up later to try
5026 * again; set the iowait flag so the wakeup knows which work item
5027 * to schedule.
5028 * (A better algorithm should be found to do this and generalize the
5029 * sleep/wakeup flags.)
5030 */
5031 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
5032 return 0;
5033}
24c5bfea
KW
5034
5035static int make_tid_rdma_ack(struct rvt_qp *qp,
5036 struct ib_other_headers *ohdr,
5037 struct hfi1_pkt_state *ps)
5038{
5039 struct rvt_ack_entry *e;
5040 struct hfi1_qp_priv *qpriv = qp->priv;
5041 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
5042 u32 hwords, next;
5043 u32 len = 0;
5044 u32 bth1 = 0, bth2 = 0;
5045 int middle = 0;
5046 u16 flow;
5047 struct tid_rdma_request *req, *nreq;
5048
5049 /* Don't send an ACK if we aren't supposed to. */
5050 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
5051 goto bail;
5052
5053 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
5054 hwords = 5;
5055
5056 e = &qp->s_ack_queue[qpriv->r_tid_ack];
5057 req = ack_to_tid_req(e);
5058 /*
5059 * In the RESYNC case, we are exactly one segment past the
5060 * previously sent ack or at the previously sent NAK. So to send
5061 * the resync ack, we go back one segment (which might be part of
5062 * the previous request) and let the do-while loop execute again.
5063 * The advantage of executing the do-while loop is that any data
5064 * received after the previous ack is automatically acked in the
5065 * RESYNC ack. It turns out that for the do-while loop we only need
5066 * to pull back qpriv->r_tid_ack, not the segment
5067 * indices/counters. The scheme works even if the previous request
5068 * was not a TID WRITE request.
5069 */
5070 if (qpriv->resync) {
5071 if (!req->ack_seg || req->ack_seg == req->total_segs)
5072 qpriv->r_tid_ack = !qpriv->r_tid_ack ?
5073 rvt_size_atomic(&dev->rdi) :
5074 qpriv->r_tid_ack - 1;
5075 e = &qp->s_ack_queue[qpriv->r_tid_ack];
5076 req = ack_to_tid_req(e);
5077 }
5078
5079 /*
5080 * If we've sent all the ACKs that we can, we are done
5081 * until we get more segments...
5082 */
5083 if (!qpriv->s_nak_state && !qpriv->resync &&
5084 req->ack_seg == req->comp_seg)
5085 goto bail;
5086
5087 do {
5088 /*
5089 * To deal with coalesced ACKs, the acked_tail pointer
5090 * into the flow array is used. The distance between it
5091 * and the clear_tail is the number of flows that are
5092 * being ACK'ed.
5093 */
5094 req->ack_seg +=
5095 /* Get up-to-date value */
5096 CIRC_CNT(req->clear_tail, req->acked_tail,
5097 MAX_FLOWS);
5098 /* Advance acked index */
5099 req->acked_tail = req->clear_tail;
5100
5101 /*
5102 * req->clear_tail points to the segment currently being
5103 * received. So, when sending an ACK, the previous
5104 * segment is being ACK'ed.
5105 */
5106 flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
5107 if (req->ack_seg != req->total_segs)
5108 break;
5109 req->state = TID_REQUEST_COMPLETE;
5110
5111 next = qpriv->r_tid_ack + 1;
5112 if (next > rvt_size_atomic(&dev->rdi))
5113 next = 0;
5114 qpriv->r_tid_ack = next;
5115 if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
5116 break;
5117 nreq = ack_to_tid_req(&qp->s_ack_queue[next]);
5118 if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg)
5119 break;
5120
5121 /* Move to the next ack entry now */
5122 e = &qp->s_ack_queue[qpriv->r_tid_ack];
5123 req = ack_to_tid_req(e);
5124 } while (1);
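/*
 * Example of a coalesced ACK: if three segments have completed since
 * the last ACK was sent, CIRC_CNT(clear_tail, acked_tail, MAX_FLOWS) is
 * 3, so ack_seg advances by three and the single TID ACK built below
 * covers all of them.
 */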
5125
5126 /*
5127 * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and
5128 * req could be pointing at the previous ack queue entry
5129 */
5130 if (qpriv->s_nak_state ||
5131 (qpriv->resync &&
5132 !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) &&
5133 (cmp_psn(qpriv->r_next_psn_kdeth - 1,
5134 full_flow_psn(&req->flows[flow],
5135 req->flows[flow].flow_state.lpsn)) > 0))) {
5136 /*
5137 * A NAK will implicitly acknowledge all previous TID RDMA
5138 * requests. Therefore, we NAK with the req->acked_tail
5139 * segment for the request at qpriv->r_tid_ack (same at
5140 * this point as the req->clear_tail segment for the
5141 * qpriv->r_tid_tail request)
5142 */
5143 e = &qp->s_ack_queue[qpriv->r_tid_ack];
5144 req = ack_to_tid_req(e);
5145 flow = req->acked_tail;
5146 }
5147
5148 hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1,
5149 &bth2);
5150 len = 0;
5151 qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5152 ps->s_txreq->hdr_dwords = hwords;
5153 ps->s_txreq->sde = qpriv->s_sde;
5154 ps->s_txreq->s_cur_size = len;
5155 ps->s_txreq->ss = NULL;
5156 hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle,
5157 ps);
5158 return 1;
5159bail:
5160 /*
5161 * Ensure s_rdma_ack_cnt changes are committed prior to clearing
5162 * RVT_S_ACK_PENDING
5163 */
5164 smp_wmb();
5165 qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5166 return 0;
5167}
572f0c33
KW
5168
5169static int hfi1_send_tid_ok(struct rvt_qp *qp)
5170{
5171 struct hfi1_qp_priv *priv = qp->priv;
5172
5173 return !(priv->s_flags & RVT_S_BUSY ||
5174 qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
5175 (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
5176 (priv->s_flags & RVT_S_RESP_PENDING) ||
5177 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
5178}
5179
5180void _hfi1_do_tid_send(struct work_struct *work)
5181{
5182 struct iowait_work *w = container_of(work, struct iowait_work, iowork);
5183 struct rvt_qp *qp = iowait_to_qp(w->iow);
5184
5185 hfi1_do_tid_send(qp);
5186}
5187
5188static void hfi1_do_tid_send(struct rvt_qp *qp)
5189{
5190 struct hfi1_pkt_state ps;
5191 struct hfi1_qp_priv *priv = qp->priv;
5192
5193 ps.dev = to_idev(qp->ibqp.device);
5194 ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
5195 ps.ppd = ppd_from_ibp(ps.ibp);
5196 ps.wait = iowait_get_tid_work(&priv->s_iowait);
5197 ps.in_thread = false;
5198 ps.timeout_int = qp->timeout_jiffies / 8;
5199
5200 spin_lock_irqsave(&qp->s_lock, ps.flags);
5201
5202 /* Return if we are already busy processing a work request. */
5203 if (!hfi1_send_tid_ok(qp)) {
5204 if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5205 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
5206 spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5207 return;
5208 }
5209
5210 priv->s_flags |= RVT_S_BUSY;
5211
5212 ps.timeout = jiffies + ps.timeout_int;
5213 ps.cpu = priv->s_sde ? priv->s_sde->cpu :
5214 cpumask_first(cpumask_of_node(ps.ppd->dd->node));
5215 ps.pkts_sent = false;
5216
5217 /* ensure a pre-built packet is handled */
5218 ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
5219 do {
5220 /* Check for a constructed packet to be sent. */
5221 if (ps.s_txreq) {
5222 if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5223 qp->s_flags |= RVT_S_BUSY;
5224 ps.wait = iowait_get_ib_work(&priv->s_iowait);
5225 }
5226 spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5227
5228 /*
5229 * If the packet cannot be sent now, return and
5230 * the send tasklet will be woken up later.
5231 */
5232 if (hfi1_verbs_send(qp, &ps))
5233 return;
5234
5235 /* allow other tasks to run */
5236 if (hfi1_schedule_send_yield(qp, &ps, true))
5237 return;
5238
5239 spin_lock_irqsave(&qp->s_lock, ps.flags);
5240 if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5241 qp->s_flags &= ~RVT_S_BUSY;
5242 priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
5243 ps.wait = iowait_get_tid_work(&priv->s_iowait);
5244 if (iowait_flag_set(&priv->s_iowait,
5245 IOWAIT_PENDING_IB))
5246 hfi1_schedule_send(qp);
5247 }
5248 }
5249 } while (hfi1_make_tid_rdma_pkt(qp, &ps));
5250 iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
5251 spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5252}
5253
5254static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
5255{
5256 struct hfi1_qp_priv *priv = qp->priv;
5257 struct hfi1_ibport *ibp =
5258 to_iport(qp->ibqp.device, qp->port_num);
5259 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
5260 struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
5261
5262 return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
5263 priv->s_sde ?
5264 priv->s_sde->cpu :
5265 cpumask_first(cpumask_of_node(dd->node)));
5266}
5267
5268/**
5269 * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
5270 * @qp: the QP
5271 *
5272 * This schedules qp progress on the TID RDMA state machine. Caller
5273 * should hold the s_lock.
5274 * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
5275 * the two state machines can step on each other with respect to the
5276 * RVT_S_BUSY flag.
5277 * Therefore, a modified test is used.
5278 * Return: true if the second leg is scheduled;
5279 * false if the second leg is not scheduled.
5280 */
5281bool hfi1_schedule_tid_send(struct rvt_qp *qp)
5282{
5283 lockdep_assert_held(&qp->s_lock);
5284 if (hfi1_send_tid_ok(qp)) {
5285 /*
5286 * The following call returns true if the qp is not on the
5287 * queue and false if the qp is already on the queue before
5288 * this call. Either way, the qp will be on the queue when the
5289 * call returns.
5290 */
5291 _hfi1_schedule_tid_send(qp);
5292 return true;
5293 }
5294 if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5295 iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
5296 IOWAIT_PENDING_TID);
5297 return false;
5298}
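
/*
 * Minimal usage sketch (hypothetical caller, mirroring the call sites
 * earlier in this file): the second leg is always kicked with
 * qp->s_lock held, typically right after setting the flag that tells
 * hfi1_make_tid_rdma_pkt()/make_tid_rdma_ack() what work is pending.
 */
static inline void example_kick_tid_ack(struct rvt_qp *qp)
	__must_hold(&qp->s_lock)
{
	struct hfi1_qp_priv *priv = qp->priv;

	priv->s_flags |= RVT_S_ACK_PENDING;	/* a TID ACK is due */
	/* Returns false if the work must wait for an iowait wakeup */
	hfi1_schedule_tid_send(qp);
}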