drivers/infiniband/hw/hfi1/tid_rdma.c
1// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
2/*
3 * Copyright(c) 2018 Intel Corporation.
4 *
5 */
6
7#include "hfi.h"
 8#include "qp.h"
 9#include "rc.h"
10#include "verbs.h"
11#include "tid_rdma.h"
 12#include "exp_rcv.h"
 13#include "trace.h"
 14
15/**
16 * DOC: TID RDMA READ protocol
17 *
18 * This is an end-to-end protocol at the hfi1 level between two nodes that
19 * improves performance by avoiding data copy on the requester side. It
20 * converts a qualified RDMA READ request into a TID RDMA READ request on
21 * the requester side and thereafter handles the request and response
22 * differently. To be qualified, the RDMA READ request should meet the
23 * following:
24 * -- The total data length should be greater than 256K;
25 * -- The total data length should be a multiple of 4K page size;
26 * -- Each local scatter-gather entry should be 4K page aligned;
27 * -- Each local scatter-gather entry should be a multiple of 4K page size;
28 */
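/*
 * Illustrative example: a 1 MB RDMA READ whose local scatter-gather
 * entries are all 4K aligned and sized in multiples of 4K qualifies for
 * conversion; a 128K request falls below the 256K threshold and is left
 * as an ordinary RDMA READ.
 */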
29
30#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
31#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
32#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
33#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
34#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
35#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
36
37/* Maximum number of packets within a flow generation. */
38#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
39
40#define GENERATION_MASK 0xFFFFF
41
42static u32 mask_generation(u32 a)
43{
44 return a & GENERATION_MASK;
45}
46
47/* Reserved generation value to set to unused flows for kernel contexts */
48#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
49
50/*
51 * J_KEY for kernel contexts when TID RDMA is used.
52 * See generate_jkey() in hfi.h for more information.
53 */
54#define TID_RDMA_JKEY 32
55#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
56#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
57
 58/* Maximum number of segments in flight per QP request. */
59#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
60#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
61#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
62 TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
63#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
64
65#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE)
 66
67#define TID_RDMA_DESTQP_FLOW_SHIFT 11
68#define TID_RDMA_DESTQP_FLOW_MASK 0x1f
69
70#define TID_FLOW_SW_PSN BIT(0)
71
72#define TID_OPFN_QP_CTXT_MASK 0xff
73#define TID_OPFN_QP_CTXT_SHIFT 56
74#define TID_OPFN_QP_KDETH_MASK 0xff
75#define TID_OPFN_QP_KDETH_SHIFT 48
76#define TID_OPFN_MAX_LEN_MASK 0x7ff
77#define TID_OPFN_MAX_LEN_SHIFT 37
78#define TID_OPFN_TIMEOUT_MASK 0x1f
79#define TID_OPFN_TIMEOUT_SHIFT 32
80#define TID_OPFN_RESERVED_MASK 0x3f
81#define TID_OPFN_RESERVED_SHIFT 26
82#define TID_OPFN_URG_MASK 0x1
83#define TID_OPFN_URG_SHIFT 25
84#define TID_OPFN_VER_MASK 0x7
85#define TID_OPFN_VER_SHIFT 22
86#define TID_OPFN_JKEY_MASK 0x3f
87#define TID_OPFN_JKEY_SHIFT 16
88#define TID_OPFN_MAX_READ_MASK 0x3f
89#define TID_OPFN_MAX_READ_SHIFT 10
90#define TID_OPFN_MAX_WRITE_MASK 0x3f
91#define TID_OPFN_MAX_WRITE_SHIFT 4
92
93/*
94 * OPFN TID layout
95 *
96 * 63 47 31 15
97 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
98 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
99 * N - the context Number
100 * K - the Kdeth_qp
101 * M - Max_len
102 * T - Timeout
103 * D - reserveD
104 * V - version
105 * U - Urg capable
106 * J - Jkey
107 * R - max_Read
108 * W - max_Write
109 * C - Capcode
110 */
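/*
 * Worked example (illustrative values, assuming PAGE_SIZE = 4K):
 * encoding max_len = 256K stores (256K >> 12) - 1 = 63 in the 11-bit
 * M field at bit 37; tid_rdma_opfn_decode() reverses it as
 * (63 + 1) << 12 = 256K. The other fields are packed and unpacked the
 * same way with their *_MASK/*_SHIFT pairs above.
 */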
111
37356e78 112static void tid_rdma_trigger_resume(struct work_struct *work);
113static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
114static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
115 gfp_t gfp);
116static void hfi1_init_trdma_req(struct rvt_qp *qp,
117 struct tid_rdma_request *req);
 118
119static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
120{
121 return
122 (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
123 TID_OPFN_QP_CTXT_SHIFT) |
124 ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
125 TID_OPFN_QP_KDETH_SHIFT) |
126 (((u64)((p->max_len >> PAGE_SHIFT) - 1) &
127 TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
128 (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
129 TID_OPFN_TIMEOUT_SHIFT) |
130 (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
131 (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
132 (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
133 TID_OPFN_MAX_READ_SHIFT) |
134 (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
135 TID_OPFN_MAX_WRITE_SHIFT);
136}
137
138static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
139{
140 p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
141 TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
142 p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
143 p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
144 TID_OPFN_MAX_WRITE_MASK;
145 p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
146 TID_OPFN_MAX_READ_MASK;
147 p->qp =
148 ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
149 << 16) |
150 ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
151 p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
152 p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
153}
154
155void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
156{
157 struct hfi1_qp_priv *priv = qp->priv;
158
159 p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
160 p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
161 p->jkey = priv->rcd->jkey;
162 p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
163 p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
164 p->timeout = qp->timeout;
165 p->urg = is_urg_masked(priv->rcd);
166}
167
168bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
169{
170 struct hfi1_qp_priv *priv = qp->priv;
171
172 *data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
173 return true;
174}
175
176bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
177{
178 struct hfi1_qp_priv *priv = qp->priv;
179 struct tid_rdma_params *remote, *old;
180 bool ret = true;
181
182 old = rcu_dereference_protected(priv->tid_rdma.remote,
183 lockdep_is_held(&priv->opfn.lock));
184 data &= ~0xfULL;
185 /*
186 * If data passed in is zero, return true so as not to continue the
187 * negotiation process
188 */
189 if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
190 goto null;
191 /*
192 * If kzalloc fails, return false. This will result in:
193 * * at the requester a new OPFN request being generated to retry
194 * the negotiation
195 * * at the responder, 0 being returned to the requester so as to
196 * disable TID RDMA at both the requester and the responder
197 */
198 remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
199 if (!remote) {
200 ret = false;
201 goto null;
202 }
203
204 tid_rdma_opfn_decode(remote, data);
205 priv->tid_timer_timeout_jiffies =
206 usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
207 1000UL) << 3) * 7);
208 trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
209 trace_hfi1_opfn_param(qp, 1, remote);
210 rcu_assign_pointer(priv->tid_rdma.remote, remote);
211 /*
212 * A TID RDMA READ request's segment size is not equal to
213 * remote->max_len only when the request's data length is smaller
214 * than remote->max_len. In that case, there will be only one segment.
215 * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
216 * during retry, it will lead to req->cur_seg = 0, which is exactly
217 * what is expected.
218 */
219 priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
220 priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
221 goto free;
222null:
223 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
224 priv->timeout_shift = 0;
225free:
226 if (old)
227 kfree_rcu(old, rcu_head);
228 return ret;
229}
230
231bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
232{
233 bool ret;
234
235 ret = tid_rdma_conn_reply(qp, *data);
236 *data = 0;
237 /*
238 * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
239 * TID RDMA could not be enabled. This will result in TID RDMA being
240 * disabled at the requester too.
241 */
242 if (ret)
243 (void)tid_rdma_conn_req(qp, data);
244 return ret;
245}
246
247void tid_rdma_conn_error(struct rvt_qp *qp)
248{
249 struct hfi1_qp_priv *priv = qp->priv;
250 struct tid_rdma_params *old;
251
252 old = rcu_dereference_protected(priv->tid_rdma.remote,
253 lockdep_is_held(&priv->opfn.lock));
254 RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
255 if (old)
256 kfree_rcu(old, rcu_head);
257}
258
259/* This is called at context initialization time */
260int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
261{
262 if (reinit)
263 return 0;
264
265 BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
266 BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
267 rcd->jkey = TID_RDMA_JKEY;
268 hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
 269	return hfi1_alloc_ctxt_rcv_groups(rcd);
270}
271
272/**
273 * qp_to_rcd - determine the receive context used by a qp
274 * @qp - the qp
275 *
276 * This routine returns the receive context associated
 277 * with a qp's qpn.
278 *
279 * Returns the context.
280 */
281static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
282 struct rvt_qp *qp)
283{
284 struct hfi1_ibdev *verbs_dev = container_of(rdi,
285 struct hfi1_ibdev,
286 rdi);
287 struct hfi1_devdata *dd = container_of(verbs_dev,
288 struct hfi1_devdata,
289 verbs_dev);
290 unsigned int ctxt;
291
292 if (qp->ibqp.qp_num == 0)
293 ctxt = 0;
294 else
295 ctxt = ((qp->ibqp.qp_num >> dd->qos_shift) %
296 (dd->n_krcv_queues - 1)) + 1;
297
298 return dd->rcd[ctxt];
299}
300
301int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
302 struct ib_qp_init_attr *init_attr)
303{
304 struct hfi1_qp_priv *qpriv = qp->priv;
 305	int i, ret;
306
307 qpriv->rcd = qp_to_rcd(rdi, qp);
308
309 spin_lock_init(&qpriv->opfn.lock);
310 INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
311 INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
312 qpriv->flow_state.psn = 0;
313 qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
314 qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
315 qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
316 INIT_LIST_HEAD(&qpriv->tid_wait);
 317
318 if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
319 struct hfi1_devdata *dd = qpriv->rcd->dd;
320
321 qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
322 sizeof(*qpriv->pages),
323 GFP_KERNEL, dd->node);
324 if (!qpriv->pages)
325 return -ENOMEM;
326 for (i = 0; i < qp->s_size; i++) {
327 struct hfi1_swqe_priv *priv;
328 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
329
330 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
331 dd->node);
332 if (!priv)
333 return -ENOMEM;
334
335 hfi1_init_trdma_req(qp, &priv->tid_req);
336 priv->tid_req.e.swqe = wqe;
337 wqe->priv = priv;
338 }
339 for (i = 0; i < rvt_max_atomic(rdi); i++) {
340 struct hfi1_ack_priv *priv;
341
342 priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
343 dd->node);
344 if (!priv)
345 return -ENOMEM;
346
347 hfi1_init_trdma_req(qp, &priv->tid_req);
348 priv->tid_req.e.ack = &qp->s_ack_queue[i];
349
350 ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
351 GFP_KERNEL);
352 if (ret) {
353 kfree(priv);
354 return ret;
355 }
356 qp->s_ack_queue[i].priv = priv;
357 }
358 }
359
360 return 0;
361}
362
363void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
364{
365 struct hfi1_qp_priv *qpriv = qp->priv;
366 struct rvt_swqe *wqe;
367 u32 i;
368
369 if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
370 for (i = 0; i < qp->s_size; i++) {
371 wqe = rvt_get_swqe_ptr(qp, i);
372 kfree(wqe->priv);
373 wqe->priv = NULL;
374 }
375 for (i = 0; i < rvt_max_atomic(rdi); i++) {
376 struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
377
378 if (priv)
379 hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
380 kfree(priv);
381 qp->s_ack_queue[i].priv = NULL;
382 }
383 cancel_work_sync(&qpriv->opfn.opfn_work);
384 kfree(qpriv->pages);
385 qpriv->pages = NULL;
386 }
 387}
388
389/* Flow and tid waiter functions */
390/**
391 * DOC: lock ordering
392 *
393 * There are two locks involved with the queuing
394 * routines: the qp s_lock and the exp_lock.
395 *
396 * Since the tid space allocation is called from
397 * the send engine, the qp s_lock is already held.
398 *
399 * The allocation routines will get the exp_lock.
400 *
401 * The first_qp() call is provided to allow the head of
402 * the rcd wait queue to be fetched under the exp_lock and
403 * followed by a drop of the exp_lock.
404 *
405 * Any qp in the wait list will have the qp reference count held
406 * to hold the qp in memory.
407 */
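/*
 * Typical allocation sequence under this ordering (a sketch; see
 * hfi1_kern_setup_hw_flow() below for a real instance):
 *
 *   qp->s_lock held by the send engine
 *     spin_lock(&rcd->exp_lock);
 *       reserve flow / TID resources, or queue the qp on failure
 *       dequeue_tid_waiter(rcd, queue, qp);
 *       fqp = first_qp(rcd, queue);       // takes a qp reference
 *     spin_unlock(&rcd->exp_lock);
 *   tid_rdma_schedule_tid_wakeup(fqp);    // reference dropped or handed off
 */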
408
409/*
410 * return head of rcd wait list
411 *
412 * Must hold the exp_lock.
413 *
414 * Get a reference to the QP to hold the QP in memory.
415 *
 416 * The caller must release the reference when the local
 417 * pointer is no longer being used.
418 */
419static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
420 struct tid_queue *queue)
421 __must_hold(&rcd->exp_lock)
422{
423 struct hfi1_qp_priv *priv;
424
425 lockdep_assert_held(&rcd->exp_lock);
426 priv = list_first_entry_or_null(&queue->queue_head,
427 struct hfi1_qp_priv,
428 tid_wait);
429 if (!priv)
430 return NULL;
431 rvt_get_qp(priv->owner);
432 return priv->owner;
433}
434
435/**
436 * kernel_tid_waiters - determine rcd wait
437 * @rcd: the receive context
438 * @qp: the head of the qp being processed
439 *
440 * This routine will return false IFF
 441 * the list is empty or the head of the
 442 * list is the indicated qp and HFI1_S_WAIT_TID_SPACE is set.
443 *
444 * Must hold the qp s_lock and the exp_lock.
445 *
446 * Return:
 447 * false if either of the conditions below is satisfied:
448 * 1. The list is empty or
449 * 2. The indicated qp is at the head of the list and the
450 * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
451 * true is returned otherwise.
452 */
453static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
454 struct tid_queue *queue, struct rvt_qp *qp)
455 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
456{
457 struct rvt_qp *fqp;
458 bool ret = true;
459
460 lockdep_assert_held(&qp->s_lock);
461 lockdep_assert_held(&rcd->exp_lock);
462 fqp = first_qp(rcd, queue);
463 if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
464 ret = false;
465 rvt_put_qp(fqp);
466 return ret;
467}
468
469/**
470 * dequeue_tid_waiter - dequeue the qp from the list
 471 * @qp - the qp to remove from the wait list
472 *
473 * This routine removes the indicated qp from the
474 * wait list if it is there.
475 *
476 * This should be done after the hardware flow and
477 * tid array resources have been allocated.
478 *
479 * Must hold the qp s_lock and the rcd exp_lock.
480 *
481 * It assumes the s_lock to protect the s_flags
482 * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
483 */
484static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
485 struct tid_queue *queue, struct rvt_qp *qp)
486 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
487{
488 struct hfi1_qp_priv *priv = qp->priv;
489
490 lockdep_assert_held(&qp->s_lock);
491 lockdep_assert_held(&rcd->exp_lock);
492 if (list_empty(&priv->tid_wait))
493 return;
494 list_del_init(&priv->tid_wait);
495 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
496 queue->dequeue++;
497 rvt_put_qp(qp);
498}
499
500/**
501 * queue_qp_for_tid_wait - suspend QP on tid space
502 * @rcd: the receive context
503 * @qp: the qp
504 *
505 * The qp is inserted at the tail of the rcd
506 * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
507 *
508 * Must hold the qp s_lock and the exp_lock.
509 */
510static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
511 struct tid_queue *queue, struct rvt_qp *qp)
512 __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
513{
514 struct hfi1_qp_priv *priv = qp->priv;
515
516 lockdep_assert_held(&qp->s_lock);
517 lockdep_assert_held(&rcd->exp_lock);
518 if (list_empty(&priv->tid_wait)) {
519 qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
520 list_add_tail(&priv->tid_wait, &queue->queue_head);
521 priv->tid_enqueue = ++queue->enqueue;
 522		rcd->dd->verbs_dev.n_tidwait++;
523 trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
524 rvt_get_qp(qp);
525 }
526}
527
528/**
529 * __trigger_tid_waiter - trigger tid waiter
530 * @qp: the qp
531 *
532 * This is a private entrance to schedule the qp
533 * assuming the caller is holding the qp->s_lock.
534 */
535static void __trigger_tid_waiter(struct rvt_qp *qp)
536 __must_hold(&qp->s_lock)
537{
538 lockdep_assert_held(&qp->s_lock);
539 if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
540 return;
541 trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
542 hfi1_schedule_send(qp);
543}
544
545/**
546 * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
547 * @qp - the qp
548 *
 549 * trigger a schedule of a waiting qp in a deadlock
550 * safe manner. The qp reference is held prior
551 * to this call via first_qp().
552 *
553 * If the qp trigger was already scheduled (!rval)
 554 * then the reference is dropped, otherwise the resume
555 * or the destroy cancel will dispatch the reference.
556 */
557static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
558{
559 struct hfi1_qp_priv *priv;
560 struct hfi1_ibport *ibp;
561 struct hfi1_pportdata *ppd;
562 struct hfi1_devdata *dd;
563 bool rval;
564
565 if (!qp)
566 return;
567
568 priv = qp->priv;
569 ibp = to_iport(qp->ibqp.device, qp->port_num);
570 ppd = ppd_from_ibp(ibp);
571 dd = dd_from_ibdev(qp->ibqp.device);
572
573 rval = queue_work_on(priv->s_sde ?
574 priv->s_sde->cpu :
575 cpumask_first(cpumask_of_node(dd->node)),
576 ppd->hfi1_wq,
577 &priv->tid_rdma.trigger_work);
578 if (!rval)
579 rvt_put_qp(qp);
580}
581
582/**
583 * tid_rdma_trigger_resume - field a trigger work request
584 * @work - the work item
585 *
586 * Complete the off qp trigger processing by directly
587 * calling the progress routine.
588 */
589static void tid_rdma_trigger_resume(struct work_struct *work)
590{
591 struct tid_rdma_qp_params *tr;
592 struct hfi1_qp_priv *priv;
593 struct rvt_qp *qp;
594
595 tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
596 priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
597 qp = priv->owner;
598 spin_lock_irq(&qp->s_lock);
599 if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
600 spin_unlock_irq(&qp->s_lock);
601 hfi1_do_send(priv->owner, true);
602 } else {
603 spin_unlock_irq(&qp->s_lock);
604 }
605 rvt_put_qp(qp);
606}
607
608/**
 609 * _tid_rdma_flush_wait - unwind any tid space wait
610 *
611 * This is called when resetting a qp to
612 * allow a destroy or reset to get rid
613 * of any tid space linkage and reference counts.
614 */
615static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
616 __must_hold(&qp->s_lock)
617{
618 struct hfi1_qp_priv *priv;
619
620 if (!qp)
621 return;
622 lockdep_assert_held(&qp->s_lock);
623 priv = qp->priv;
624 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
625 spin_lock(&priv->rcd->exp_lock);
626 if (!list_empty(&priv->tid_wait)) {
627 list_del_init(&priv->tid_wait);
628 qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
629 queue->dequeue++;
630 rvt_put_qp(qp);
631 }
632 spin_unlock(&priv->rcd->exp_lock);
633}
634
635void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
636 __must_hold(&qp->s_lock)
637{
638 struct hfi1_qp_priv *priv = qp->priv;
639
640 _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
 641	_tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
642}
643
644/* Flow functions */
645/**
646 * kern_reserve_flow - allocate a hardware flow
647 * @rcd - the context to use for allocation
648 * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
649 * signify "don't care".
650 *
651 * Use a bit mask based allocation to reserve a hardware
652 * flow for use in receiving KDETH data packets. If a preferred flow is
653 * specified the function will attempt to reserve that flow again, if
654 * available.
655 *
656 * The exp_lock must be held.
657 *
658 * Return:
 659 * On success: a positive value between 0 and RXE_NUM_TID_FLOWS - 1
660 * On failure: -EAGAIN
661 */
662static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
663 __must_hold(&rcd->exp_lock)
664{
665 int nr;
666
667 /* Attempt to reserve the preferred flow index */
668 if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
669 !test_and_set_bit(last, &rcd->flow_mask))
670 return last;
671
672 nr = ffz(rcd->flow_mask);
673 BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
674 (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
675 if (nr > (RXE_NUM_TID_FLOWS - 1))
676 return -EAGAIN;
677 set_bit(nr, &rcd->flow_mask);
678 return nr;
679}
680
681static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
682 u32 flow_idx)
683{
684 u64 reg;
685
686 reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
687 RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
688 RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
689 RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
690 RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
691 RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
692
693 if (generation != KERN_GENERATION_RESERVED)
694 reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
695
696 write_uctxt_csr(rcd->dd, rcd->ctxt,
697 RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
698}
699
700static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
701 __must_hold(&rcd->exp_lock)
702{
703 u32 generation = rcd->flows[flow_idx].generation;
704
705 kern_set_hw_flow(rcd, generation, flow_idx);
706 return generation;
707}
708
709static u32 kern_flow_generation_next(u32 gen)
710{
711 u32 generation = mask_generation(gen + 1);
712
713 if (generation == KERN_GENERATION_RESERVED)
714 generation = mask_generation(generation + 1);
715 return generation;
716}
717
718static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
719 __must_hold(&rcd->exp_lock)
720{
721 rcd->flows[flow_idx].generation =
722 kern_flow_generation_next(rcd->flows[flow_idx].generation);
723 kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
724}
725
726int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
727{
728 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
729 struct tid_flow_state *fs = &qpriv->flow_state;
730 struct rvt_qp *fqp;
731 unsigned long flags;
732 int ret = 0;
733
734 /* The QP already has an allocated flow */
735 if (fs->index != RXE_NUM_TID_FLOWS)
736 return ret;
737
738 spin_lock_irqsave(&rcd->exp_lock, flags);
739 if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
740 goto queue;
741
742 ret = kern_reserve_flow(rcd, fs->last_index);
743 if (ret < 0)
744 goto queue;
745 fs->index = ret;
746 fs->last_index = fs->index;
747
748 /* Generation received in a RESYNC overrides default flow generation */
749 if (fs->generation != KERN_GENERATION_RESERVED)
750 rcd->flows[fs->index].generation = fs->generation;
751 fs->generation = kern_setup_hw_flow(rcd, fs->index);
752 fs->psn = 0;
753 fs->flags = 0;
754 dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
755 /* get head before dropping lock */
756 fqp = first_qp(rcd, &rcd->flow_queue);
757 spin_unlock_irqrestore(&rcd->exp_lock, flags);
758
759 tid_rdma_schedule_tid_wakeup(fqp);
760 return 0;
761queue:
762 queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
763 spin_unlock_irqrestore(&rcd->exp_lock, flags);
764 return -EAGAIN;
765}
766
767void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
768{
769 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
770 struct tid_flow_state *fs = &qpriv->flow_state;
771 struct rvt_qp *fqp;
772 unsigned long flags;
773
774 if (fs->index >= RXE_NUM_TID_FLOWS)
775 return;
776 spin_lock_irqsave(&rcd->exp_lock, flags);
777 kern_clear_hw_flow(rcd, fs->index);
778 clear_bit(fs->index, &rcd->flow_mask);
779 fs->index = RXE_NUM_TID_FLOWS;
780 fs->psn = 0;
781 fs->generation = KERN_GENERATION_RESERVED;
782
783 /* get head before dropping lock */
784 fqp = first_qp(rcd, &rcd->flow_queue);
785 spin_unlock_irqrestore(&rcd->exp_lock, flags);
786
787 if (fqp == qp) {
788 __trigger_tid_waiter(fqp);
789 rvt_put_qp(fqp);
790 } else {
791 tid_rdma_schedule_tid_wakeup(fqp);
792 }
793}
794
795void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
796{
797 int i;
798
799 for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
800 rcd->flows[i].generation = mask_generation(prandom_u32());
801 kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
802 }
803}
804
805/* TID allocation functions */
806static u8 trdma_pset_order(struct tid_rdma_pageset *s)
807{
808 u8 count = s->count;
809
810 return ilog2(count) + 1;
811}
812
813/**
 814 * tid_rdma_find_phys_blocks_4k - get groups based on mr info
815 * @npages - number of pages
816 * @pages - pointer to an array of page structs
817 * @list - page set array to return
818 *
819 * This routine returns the number of groups associated with
820 * the current sge information. This implementation is based
821 * on the expected receive find_phys_blocks() adjusted to
822 * use the MR information vs. the pfn.
823 *
824 * Return:
825 * the number of RcvArray entries
826 */
827static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
828 struct page **pages,
829 u32 npages,
830 struct tid_rdma_pageset *list)
831{
832 u32 pagecount, pageidx, setcount = 0, i;
833 void *vaddr, *this_vaddr;
834
835 if (!npages)
836 return 0;
837
838 /*
839 * Look for sets of physically contiguous pages in the user buffer.
840 * This will allow us to optimize Expected RcvArray entry usage by
841 * using the bigger supported sizes.
842 */
843 vaddr = page_address(pages[0]);
 844	trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
845 for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
846 this_vaddr = i < npages ? page_address(pages[i]) : NULL;
847 trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
848 this_vaddr);
849 /*
850 * If the vaddr's are not sequential, pages are not physically
851 * contiguous.
852 */
853 if (this_vaddr != (vaddr + PAGE_SIZE)) {
854 /*
855 * At this point we have to loop over the set of
 856			 * physically contiguous pages and break them down into
857 * sizes supported by the HW.
858 * There are two main constraints:
859 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
860 * If the total set size is bigger than that
861 * program only a MAX_EXPECTED_BUFFER chunk.
862 * 2. The buffer size has to be a power of two. If
 863			 *    it is not, round down to the closest power of
864 * 2 and program that size.
865 */
866 while (pagecount) {
867 int maxpages = pagecount;
868 u32 bufsize = pagecount * PAGE_SIZE;
869
870 if (bufsize > MAX_EXPECTED_BUFFER)
871 maxpages =
872 MAX_EXPECTED_BUFFER >>
873 PAGE_SHIFT;
874 else if (!is_power_of_2(bufsize))
875 maxpages =
876 rounddown_pow_of_two(bufsize) >>
877 PAGE_SHIFT;
878
879 list[setcount].idx = pageidx;
880 list[setcount].count = maxpages;
881 trace_hfi1_tid_pageset(flow->req->qp, setcount,
882 list[setcount].idx,
883 list[setcount].count);
884 pagecount -= maxpages;
885 pageidx += maxpages;
886 setcount++;
887 }
888 pageidx = i;
889 pagecount = 1;
890 vaddr = this_vaddr;
891 } else {
892 vaddr += PAGE_SIZE;
893 pagecount++;
894 }
895 }
 896	/* ensure we always return an even number of sets */
897 if (setcount & 1)
898 list[setcount++].count = 0;
899 return setcount;
900}
901
902/**
903 * tid_flush_pages - dump out pages into pagesets
904 * @list - list of pagesets
905 * @idx - pointer to current page index
906 * @pages - number of pages to dump
 907 * @sets - current number of pagesets
908 *
 909 * This routine flushes out accumulated pages.
910 *
 911 * To ensure an even number of sets the
912 * code may add a filler.
913 *
 914 * This can happen when pages is not
915 * a power of 2 or pages is a power of 2
916 * less than the maximum pages.
917 *
918 * Return:
919 * The new number of sets
920 */
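/*
 * Worked example (illustrative, starting from an empty list and
 * assuming MAX_EXPECTED_PAGES >= 16): flushing pages = 10 emits a set
 * of 8 (rounded down to a power of two) and a set of 2, leaving an even
 * set count; flushing pages = 2 emits one set of 2 plus a zero-count
 * filler.
 */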
921
922static u32 tid_flush_pages(struct tid_rdma_pageset *list,
923 u32 *idx, u32 pages, u32 sets)
924{
925 while (pages) {
926 u32 maxpages = pages;
927
928 if (maxpages > MAX_EXPECTED_PAGES)
929 maxpages = MAX_EXPECTED_PAGES;
930 else if (!is_power_of_2(maxpages))
931 maxpages = rounddown_pow_of_two(maxpages);
932 list[sets].idx = *idx;
933 list[sets++].count = maxpages;
934 *idx += maxpages;
935 pages -= maxpages;
936 }
937 /* might need a filler */
938 if (sets & 1)
939 list[sets++].count = 0;
940 return sets;
941}
942
943/**
 944 * tid_rdma_find_phys_blocks_8k - get groups based on mr info
945 * @pages - pointer to an array of page structs
946 * @npages - number of pages
947 * @list - page set array to return
948 *
949 * This routine parses an array of pages to compute pagesets
950 * in an 8k compatible way.
951 *
952 * pages are tested two at a time, i, i + 1 for contiguous
953 * pages and i - 1 and i contiguous pages.
954 *
 955 * If any condition is false, any accumulated pages are flushed and
956 * v0,v1 are emitted as separate PAGE_SIZE pagesets
957 *
958 * Otherwise, the current 8k is totaled for a future flush.
959 *
960 * Return:
961 * The number of pagesets
962 * list set with the returned number of pagesets
963 *
964 */
965static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
966 struct page **pages,
967 u32 npages,
968 struct tid_rdma_pageset *list)
969{
970 u32 idx, sets = 0, i;
971 u32 pagecnt = 0;
972 void *v0, *v1, *vm1;
973
974 if (!npages)
975 return 0;
976 for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
977 /* get a new v0 */
978 v0 = page_address(pages[i]);
 979		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
980 v1 = i + 1 < npages ?
981 page_address(pages[i + 1]) : NULL;
 982		trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
983 /* compare i, i + 1 vaddr */
984 if (v1 != (v0 + PAGE_SIZE)) {
985 /* flush out pages */
986 sets = tid_flush_pages(list, &idx, pagecnt, sets);
987 /* output v0,v1 as two pagesets */
988 list[sets].idx = idx++;
989 list[sets++].count = 1;
990 if (v1) {
991 list[sets].count = 1;
992 list[sets++].idx = idx++;
993 } else {
994 list[sets++].count = 0;
995 }
996 vm1 = NULL;
997 pagecnt = 0;
998 continue;
999 }
1000 /* i,i+1 consecutive, look at i-1,i */
1001 if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
1002 /* flush out pages */
1003 sets = tid_flush_pages(list, &idx, pagecnt, sets);
1004 pagecnt = 0;
1005 }
1006 /* pages will always be a multiple of 8k */
1007 pagecnt += 2;
1008 /* save i-1 */
1009 vm1 = v1;
1010 /* move to next pair */
1011 }
1012 /* dump residual pages at end */
1013 sets = tid_flush_pages(list, &idx, npages - idx, sets);
1014 /* by design cannot be odd sets */
1015 WARN_ON(sets & 1);
1016 return sets;
1017}
1018
1019/**
1020 * Find pages for one segment of a sge array represented by @ss. The function
1021 * does not check the sge, the sge must have been checked for alignment with a
1022 * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
1023 * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
1024 * copy maintained in @ss->sge, the original sge is not modified.
1025 *
1026 * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
1027 * releasing the MR reference count at the same time. Otherwise, we'll "leak"
1028 * references to the MR. This difference requires that we keep track of progress
1029 * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
1030 * structure.
1031 */
1032static u32 kern_find_pages(struct tid_rdma_flow *flow,
1033 struct page **pages,
1034 struct rvt_sge_state *ss, bool *last)
1035{
1036 struct tid_rdma_request *req = flow->req;
1037 struct rvt_sge *sge = &ss->sge;
1038 u32 length = flow->req->seg_len;
1039 u32 len = PAGE_SIZE;
1040 u32 i = 0;
1041
1042 while (length && req->isge < ss->num_sge) {
1043 pages[i++] = virt_to_page(sge->vaddr);
1044
1045 sge->vaddr += len;
1046 sge->length -= len;
1047 sge->sge_length -= len;
1048 if (!sge->sge_length) {
1049 if (++req->isge < ss->num_sge)
1050 *sge = ss->sg_list[req->isge - 1];
1051 } else if (sge->length == 0 && sge->mr->lkey) {
1052 if (++sge->n >= RVT_SEGSZ) {
1053 ++sge->m;
1054 sge->n = 0;
1055 }
1056 sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
1057 sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
1058 }
1059 length -= len;
1060 }
1061
1062 flow->length = flow->req->seg_len - length;
1063 *last = req->isge == ss->num_sge ? false : true;
1064 return i;
1065}
1066
1067static void dma_unmap_flow(struct tid_rdma_flow *flow)
1068{
1069 struct hfi1_devdata *dd;
1070 int i;
1071 struct tid_rdma_pageset *pset;
1072
1073 dd = flow->req->rcd->dd;
1074 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1075 i++, pset++) {
1076 if (pset->count && pset->addr) {
1077 dma_unmap_page(&dd->pcidev->dev,
1078 pset->addr,
1079 PAGE_SIZE * pset->count,
1080 DMA_FROM_DEVICE);
1081 pset->mapped = 0;
1082 }
1083 }
1084}
1085
1086static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
1087{
1088 int i;
1089 struct hfi1_devdata *dd = flow->req->rcd->dd;
1090 struct tid_rdma_pageset *pset;
1091
1092 for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1093 i++, pset++) {
1094 if (pset->count) {
1095 pset->addr = dma_map_page(&dd->pcidev->dev,
1096 pages[pset->idx],
1097 0,
1098 PAGE_SIZE * pset->count,
1099 DMA_FROM_DEVICE);
1100
1101 if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
1102 dma_unmap_flow(flow);
1103 return -ENOMEM;
1104 }
1105 pset->mapped = 1;
1106 }
1107 }
1108 return 0;
1109}
1110
1111static inline bool dma_mapped(struct tid_rdma_flow *flow)
1112{
1113 return !!flow->pagesets[0].mapped;
1114}
1115
1116/*
1117 * Get pages pointers and identify contiguous physical memory chunks for a
1118 * segment. All segments are of length flow->req->seg_len.
1119 */
1120static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
1121 struct page **pages,
1122 struct rvt_sge_state *ss, bool *last)
1123{
1124 u8 npages;
1125
1126 /* Reuse previously computed pagesets, if any */
1127 if (flow->npagesets) {
1128 trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
1129 flow);
1130 if (!dma_mapped(flow))
1131 return dma_map_flow(flow, pages);
1132 return 0;
1133 }
1134
1135 npages = kern_find_pages(flow, pages, ss, last);
1136
1137 if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
1138 flow->npagesets =
1139 tid_rdma_find_phys_blocks_4k(flow, pages, npages,
1140 flow->pagesets);
1141 else
1142 flow->npagesets =
1143 tid_rdma_find_phys_blocks_8k(flow, pages, npages,
1144 flow->pagesets);
1145
1146 return dma_map_flow(flow, pages);
1147}
1148
1149static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
1150 struct hfi1_ctxtdata *rcd, char *s,
1151 struct tid_group *grp, u8 cnt)
1152{
1153 struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
1154
1155 WARN_ON_ONCE(flow->tnode_cnt >=
1156 (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
1157 if (WARN_ON_ONCE(cnt & 1))
1158 dd_dev_err(rcd->dd,
1159 "unexpected odd allocation cnt %u map 0x%x used %u",
1160 cnt, grp->map, grp->used);
1161
1162 node->grp = grp;
1163 node->map = grp->map;
1164 node->cnt = cnt;
1165 trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
1166 grp->base, grp->map, grp->used, cnt);
1167}
1168
1169/*
1170 * Try to allocate pageset_count TID's from TID groups for a context
1171 *
1172 * This function allocates TID's without moving groups between lists or
 1173 * modifying grp->map. This is done as follows, being cognizant of the lists
1174 * between which the TID groups will move:
1175 * 1. First allocate complete groups of 8 TID's since this is more efficient,
1176 * these groups will move from group->full without affecting used
1177 * 2. If more TID's are needed allocate from used (will move from used->full or
1178 * stay in used)
1179 * 3. If we still don't have the required number of TID's go back and look again
1180 * at a complete group (will move from group->used)
1181 */
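/*
 * Illustrative example (assuming a group size of 8): for
 * npagesets = 20, step 1 takes two complete groups (16 TIDs), step 2
 * takes the remaining 4 from partially used groups, and step 3 is
 * reached only if the used list cannot cover the shortfall.
 */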
1182static int kern_alloc_tids(struct tid_rdma_flow *flow)
1183{
1184 struct hfi1_ctxtdata *rcd = flow->req->rcd;
1185 struct hfi1_devdata *dd = rcd->dd;
1186 u32 ngroups, pageidx = 0;
1187 struct tid_group *group = NULL, *used;
1188 u8 use;
1189
1190 flow->tnode_cnt = 0;
1191 ngroups = flow->npagesets / dd->rcv_entries.group_size;
1192 if (!ngroups)
1193 goto used_list;
1194
1195 /* First look at complete groups */
1196 list_for_each_entry(group, &rcd->tid_group_list.list, list) {
1197 kern_add_tid_node(flow, rcd, "complete groups", group,
1198 group->size);
1199
1200 pageidx += group->size;
1201 if (!--ngroups)
1202 break;
1203 }
1204
1205 if (pageidx >= flow->npagesets)
1206 goto ok;
1207
1208used_list:
1209 /* Now look at partially used groups */
1210 list_for_each_entry(used, &rcd->tid_used_list.list, list) {
1211 use = min_t(u32, flow->npagesets - pageidx,
1212 used->size - used->used);
1213 kern_add_tid_node(flow, rcd, "used groups", used, use);
1214
1215 pageidx += use;
1216 if (pageidx >= flow->npagesets)
1217 goto ok;
1218 }
1219
1220 /*
 1221	 * Look again at a complete group, continuing from where we left off.
1222 * However, if we are at the head, we have reached the end of the
1223 * complete groups list from the first loop above
1224 */
1225 if (group && &group->list == &rcd->tid_group_list.list)
1226 goto bail_eagain;
1227 group = list_prepare_entry(group, &rcd->tid_group_list.list,
1228 list);
1229 if (list_is_last(&group->list, &rcd->tid_group_list.list))
1230 goto bail_eagain;
1231 group = list_next_entry(group, list);
1232 use = min_t(u32, flow->npagesets - pageidx, group->size);
1233 kern_add_tid_node(flow, rcd, "complete continue", group, use);
1234 pageidx += use;
1235 if (pageidx >= flow->npagesets)
1236 goto ok;
1237bail_eagain:
1238 trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
1239 (u64)flow->npagesets);
1240 return -EAGAIN;
1241ok:
1242 return 0;
1243}
1244
1245static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
1246 u32 *pset_idx)
1247{
1248 struct hfi1_ctxtdata *rcd = flow->req->rcd;
1249 struct hfi1_devdata *dd = rcd->dd;
1250 struct kern_tid_node *node = &flow->tnode[grp_num];
1251 struct tid_group *grp = node->grp;
1252 struct tid_rdma_pageset *pset;
1253 u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
1254 u32 rcventry, npages = 0, pair = 0, tidctrl;
1255 u8 i, cnt = 0;
1256
1257 for (i = 0; i < grp->size; i++) {
1258 rcventry = grp->base + i;
1259
1260 if (node->map & BIT(i) || cnt >= node->cnt) {
1261 rcv_array_wc_fill(dd, rcventry);
1262 continue;
1263 }
1264 pset = &flow->pagesets[(*pset_idx)++];
1265 if (pset->count) {
1266 hfi1_put_tid(dd, rcventry, PT_EXPECTED,
1267 pset->addr, trdma_pset_order(pset));
1268 } else {
1269 hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
1270 }
1271 npages += pset->count;
1272
1273 rcventry -= rcd->expected_base;
1274 tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
1275 /*
 1276		 * A single TID entry will be used to cover a rcvarray pair (with
1277 * tidctrl 0x3), if ALL these are true (a) the bit pos is even
1278 * (b) the group map shows current and the next bits as free
 1279		 * indicating two consecutive rcvarray entries are available (c)
1280 * we actually need 2 more entries
1281 */
1282 pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
1283 node->cnt >= cnt + 2;
1284 if (!pair) {
1285 if (!pset->count)
1286 tidctrl = 0x1;
1287 flow->tid_entry[flow->tidcnt++] =
1288 EXP_TID_SET(IDX, rcventry >> 1) |
1289 EXP_TID_SET(CTRL, tidctrl) |
1290 EXP_TID_SET(LEN, npages);
1291 trace_hfi1_tid_entry_alloc(/* entry */
1292 flow->req->qp, flow->tidcnt - 1,
1293 flow->tid_entry[flow->tidcnt - 1]);
1294
1295 /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
1296 flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
1297 npages = 0;
1298 }
1299
1300 if (grp->used == grp->size - 1)
1301 tid_group_move(grp, &rcd->tid_used_list,
1302 &rcd->tid_full_list);
1303 else if (!grp->used)
1304 tid_group_move(grp, &rcd->tid_group_list,
1305 &rcd->tid_used_list);
1306
1307 grp->used++;
1308 grp->map |= BIT(i);
1309 cnt++;
1310 }
1311}
1312
1313static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
1314{
1315 struct hfi1_ctxtdata *rcd = flow->req->rcd;
1316 struct hfi1_devdata *dd = rcd->dd;
1317 struct kern_tid_node *node = &flow->tnode[grp_num];
1318 struct tid_group *grp = node->grp;
1319 u32 rcventry;
1320 u8 i, cnt = 0;
1321
1322 for (i = 0; i < grp->size; i++) {
1323 rcventry = grp->base + i;
1324
1325 if (node->map & BIT(i) || cnt >= node->cnt) {
1326 rcv_array_wc_fill(dd, rcventry);
1327 continue;
1328 }
1329
1330 hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
1331
1332 grp->used--;
1333 grp->map &= ~BIT(i);
1334 cnt++;
1335
1336 if (grp->used == grp->size - 1)
1337 tid_group_move(grp, &rcd->tid_full_list,
1338 &rcd->tid_used_list);
1339 else if (!grp->used)
1340 tid_group_move(grp, &rcd->tid_used_list,
1341 &rcd->tid_group_list);
1342 }
1343 if (WARN_ON_ONCE(cnt & 1)) {
1344 struct hfi1_ctxtdata *rcd = flow->req->rcd;
1345 struct hfi1_devdata *dd = rcd->dd;
1346
1347 dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
1348 cnt, grp->map, grp->used);
1349 }
1350}
1351
1352static void kern_program_rcvarray(struct tid_rdma_flow *flow)
1353{
1354 u32 pset_idx = 0;
1355 int i;
1356
1357 flow->npkts = 0;
1358 flow->tidcnt = 0;
1359 for (i = 0; i < flow->tnode_cnt; i++)
1360 kern_program_rcv_group(flow, i, &pset_idx);
 1361	trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
1362}
1363
1364/**
1365 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
1366 * TID RDMA request
1367 *
1368 * @req: TID RDMA request for which the segment/flow is being set up
1369 * @ss: sge state, maintains state across successive segments of a sge
1370 * @last: set to true after the last sge segment has been processed
1371 *
1372 * This function
1373 * (1) finds a free flow entry in the flow circular buffer
 1374 * (2) finds pages and contiguous physical chunks constituting one segment
1375 * of an sge
1376 * (3) allocates TID group entries for those chunks
1377 * (4) programs rcvarray entries in the hardware corresponding to those
1378 * TID's
1379 * (5) computes a tidarray with formatted TID entries which can be sent
1380 * to the sender
1381 * (6) Reserves and programs HW flows.
 1382 * (7) It also manages queuing the QP when TID/flow resources are not
1383 * available.
1384 *
1385 * @req points to struct tid_rdma_request of which the segments are a part. The
1386 * function uses qp, rcd and seg_len members of @req. In the absence of errors,
1387 * req->flow_idx is the index of the flow which has been prepared in this
1388 * invocation of function call. With flow = &req->flows[req->flow_idx],
1389 * flow->tid_entry contains the TID array which the sender can use for TID RDMA
1390 * sends and flow->npkts contains number of packets required to send the
1391 * segment.
1392 *
1393 * hfi1_check_sge_align should be called prior to calling this function and if
1394 * it signals error TID RDMA cannot be used for this sge and this function
1395 * should not be called.
1396 *
1397 * For the queuing, caller must hold the flow->req->qp s_lock from the send
1398 * engine and the function will procure the exp_lock.
1399 *
1400 * Return:
1401 * The function returns -EAGAIN if sufficient number of TID/flow resources to
1402 * map the segment could not be allocated. In this case the function should be
1403 * called again with previous arguments to retry the TID allocation. There are
1404 * no other error returns. The function returns 0 on success.
1405 */
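/*
 * Typical caller: hfi1_build_tid_rdma_read_req() below, which marks the
 * request TID_REQUEST_QUEUED and bails out when this call fails, so the
 * same segment is retried on a later pass of the send engine.
 */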
1406int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
1407 struct rvt_sge_state *ss, bool *last)
1408 __must_hold(&req->qp->s_lock)
1409{
1410 struct tid_rdma_flow *flow = &req->flows[req->setup_head];
1411 struct hfi1_ctxtdata *rcd = req->rcd;
1412 struct hfi1_qp_priv *qpriv = req->qp->priv;
1413 unsigned long flags;
1414 struct rvt_qp *fqp;
1415 u16 clear_tail = req->clear_tail;
1416
1417 lockdep_assert_held(&req->qp->s_lock);
1418 /*
1419 * We return error if either (a) we don't have space in the flow
1420 * circular buffer, or (b) we already have max entries in the buffer.
1421 * Max entries depend on the type of request we are processing and the
1422 * negotiated TID RDMA parameters.
1423 */
1424 if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
1425 CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
1426 req->n_flows)
1427 return -EINVAL;
1428
1429 /*
1430 * Get pages, identify contiguous physical memory chunks for the segment
1431 * If we can not determine a DMA address mapping we will treat it just
1432 * like if we ran out of space above.
1433 */
1434 if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
1435 hfi1_wait_kmem(flow->req->qp);
1436 return -ENOMEM;
1437 }
1438
1439 spin_lock_irqsave(&rcd->exp_lock, flags);
1440 if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
1441 goto queue;
1442
1443 /*
1444 * At this point we know the number of pagesets and hence the number of
1445 * TID's to map the segment. Allocate the TID's from the TID groups. If
1446 * we cannot allocate the required number we exit and try again later
1447 */
1448 if (kern_alloc_tids(flow))
1449 goto queue;
1450 /*
1451 * Finally program the TID entries with the pagesets, compute the
1452 * tidarray and enable the HW flow
1453 */
1454 kern_program_rcvarray(flow);
1455
1456 /*
1457 * Setup the flow state with relevant information.
1458 * This information is used for tracking the sequence of data packets
1459 * for the segment.
1460 * The flow is setup here as this is the most accurate time and place
1461 * to do so. Doing at a later time runs the risk of the flow data in
1462 * qpriv getting out of sync.
1463 */
1464 memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
1465 flow->idx = qpriv->flow_state.index;
1466 flow->flow_state.generation = qpriv->flow_state.generation;
1467 flow->flow_state.spsn = qpriv->flow_state.psn;
1468 flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
1469 flow->flow_state.r_next_psn =
1470 full_flow_psn(flow, flow->flow_state.spsn);
1471 qpriv->flow_state.psn += flow->npkts;
1472
1473 dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
1474 /* get head before dropping lock */
1475 fqp = first_qp(rcd, &rcd->rarr_queue);
1476 spin_unlock_irqrestore(&rcd->exp_lock, flags);
1477 tid_rdma_schedule_tid_wakeup(fqp);
1478
1479 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
1480 return 0;
1481queue:
1482 queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
1483 spin_unlock_irqrestore(&rcd->exp_lock, flags);
1484 return -EAGAIN;
1485}
1486
1487static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
1488{
1489 flow->npagesets = 0;
1490}
1491
1492/*
1493 * This function is called after one segment has been successfully sent to
1494 * release the flow and TID HW/SW resources for that segment. The segments for a
1495 * TID RDMA request are setup and cleared in FIFO order which is managed using a
1496 * circular buffer.
1497 */
1498int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
1499 __must_hold(&req->qp->s_lock)
1500{
1501 struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
1502 struct hfi1_ctxtdata *rcd = req->rcd;
1503 unsigned long flags;
1504 int i;
1505 struct rvt_qp *fqp;
1506
1507 lockdep_assert_held(&req->qp->s_lock);
1508 /* Exit if we have nothing in the flow circular buffer */
1509 if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
1510 return -EINVAL;
1511
1512 spin_lock_irqsave(&rcd->exp_lock, flags);
1513
1514 for (i = 0; i < flow->tnode_cnt; i++)
1515 kern_unprogram_rcv_group(flow, i);
1516 /* To prevent double unprogramming */
1517 flow->tnode_cnt = 0;
1518 /* get head before dropping lock */
1519 fqp = first_qp(rcd, &rcd->rarr_queue);
1520 spin_unlock_irqrestore(&rcd->exp_lock, flags);
1521
1522 dma_unmap_flow(flow);
1523
1524 hfi1_tid_rdma_reset_flow(flow);
1525 req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
1526
1527 if (fqp == req->qp) {
1528 __trigger_tid_waiter(fqp);
1529 rvt_put_qp(fqp);
1530 } else {
1531 tid_rdma_schedule_tid_wakeup(fqp);
1532 }
1533
1534 return 0;
1535}
1536
1537/*
1538 * This function is called to release all the tid entries for
1539 * a request.
1540 */
1541void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
1542 __must_hold(&req->qp->s_lock)
1543{
1544 /* Use memory barrier for proper ordering */
1545 while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
1546 if (hfi1_kern_exp_rcv_clear(req))
1547 break;
1548 }
1549}
1550
1551/**
 1552 * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
1553 * @req - the tid rdma request to be cleaned
1554 */
1555static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
1556{
1557 kfree(req->flows);
1558 req->flows = NULL;
1559}
1560
1561/**
1562 * __trdma_clean_swqe - clean up for large sized QPs
 1563 * @qp: the queue pair
1564 * @wqe: the send wqe
1565 */
1566void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
1567{
1568 struct hfi1_swqe_priv *p = wqe->priv;
1569
1570 hfi1_kern_exp_rcv_free_flows(&p->tid_req);
1571}
1572
1573/*
1574 * This can be called at QP create time or in the data path.
1575 */
1576static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
1577 gfp_t gfp)
1578{
1579 struct tid_rdma_flow *flows;
1580 int i;
1581
1582 if (likely(req->flows))
1583 return 0;
1584 flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
1585 req->rcd->numa_id);
1586 if (!flows)
1587 return -ENOMEM;
1588 /* mini init */
1589 for (i = 0; i < MAX_FLOWS; i++) {
1590 flows[i].req = req;
1591 flows[i].npagesets = 0;
1592 flows[i].pagesets[0].mapped = 0;
1593 }
1594 req->flows = flows;
1595 return 0;
1596}
1597
1598static void hfi1_init_trdma_req(struct rvt_qp *qp,
1599 struct tid_rdma_request *req)
1600{
1601 struct hfi1_qp_priv *qpriv = qp->priv;
1602
1603 /*
1604 * Initialize various TID RDMA request variables.
1605 * These variables are "static", which is why they
 1606	 * can be pre-initialized here before the WRs have
1607 * even been submitted.
1608 * However, non-NULL values for these variables do not
1609 * imply that this WQE has been enabled for TID RDMA.
1610 * Drivers should check the WQE's opcode to determine
1611 * if a request is a TID RDMA one or not.
1612 */
1613 req->qp = qp;
1614 req->rcd = qpriv->rcd;
1615}
1616
1617u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
1618 void *context, int vl, int mode, u64 data)
1619{
1620 struct hfi1_devdata *dd = context;
1621
1622 return dd->verbs_dev.n_tidwait;
1623}
 1624
1625static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
1626 u32 psn, u16 *fidx)
1627{
1628 u16 head, tail;
1629 struct tid_rdma_flow *flow;
1630
1631 head = req->setup_head;
1632 tail = req->clear_tail;
1633 for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1634 tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1635 flow = &req->flows[tail];
1636 if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
1637 cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
1638 if (fidx)
1639 *fidx = tail;
1640 return flow;
1641 }
1642 }
1643 return NULL;
1644}
1645
1646static struct tid_rdma_flow *
1647__find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail,
1648 u32 psn, u16 *fidx)
1649{
1650 for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1651 tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1652 struct tid_rdma_flow *flow = &req->flows[tail];
1653 u32 spsn, lpsn;
1654
1655 spsn = full_flow_psn(flow, flow->flow_state.spsn);
1656 lpsn = full_flow_psn(flow, flow->flow_state.lpsn);
1657
1658 if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) {
1659 if (fidx)
1660 *fidx = tail;
1661 return flow;
1662 }
1663 }
1664 return NULL;
1665}
1666
1667static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req,
1668 u32 psn, u16 *fidx)
1669{
1670 return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn,
1671 fidx);
1672}
1673
1674/* TID RDMA READ functions */
1675u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
1676 struct ib_other_headers *ohdr, u32 *bth1,
1677 u32 *bth2, u32 *len)
1678{
1679 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1680 struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
1681 struct rvt_qp *qp = req->qp;
1682 struct hfi1_qp_priv *qpriv = qp->priv;
1683 struct hfi1_swqe_priv *wpriv = wqe->priv;
1684 struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
1685 struct tid_rdma_params *remote;
1686 u32 req_len = 0;
1687 void *req_addr = NULL;
1688
1689 /* This is the IB psn used to send the request */
1690 *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
 1691	trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
1692
1693 /* TID Entries for TID RDMA READ payload */
1694 req_addr = &flow->tid_entry[flow->tid_idx];
1695 req_len = sizeof(*flow->tid_entry) *
1696 (flow->tidcnt - flow->tid_idx);
1697
1698 memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
1699 wpriv->ss.sge.vaddr = req_addr;
1700 wpriv->ss.sge.sge_length = req_len;
1701 wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
1702 /*
1703 * We can safely zero these out. Since the first SGE covers the
1704 * entire packet, nothing else should even look at the MR.
1705 */
1706 wpriv->ss.sge.mr = NULL;
1707 wpriv->ss.sge.m = 0;
1708 wpriv->ss.sge.n = 0;
1709
1710 wpriv->ss.sg_list = NULL;
1711 wpriv->ss.total_len = wpriv->ss.sge.sge_length;
1712 wpriv->ss.num_sge = 1;
1713
1714 /* Construct the TID RDMA READ REQ packet header */
1715 rcu_read_lock();
1716 remote = rcu_dereference(qpriv->tid_rdma.remote);
1717
1718 KDETH_RESET(rreq->kdeth0, KVER, 0x1);
1719 KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
1720 rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
1721 req->cur_seg * req->seg_len + flow->sent);
1722 rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
1723 rreq->reth.length = cpu_to_be32(*len);
1724 rreq->tid_flow_psn =
1725 cpu_to_be32((flow->flow_state.generation <<
1726 HFI1_KDETH_BTH_SEQ_SHIFT) |
1727 ((flow->flow_state.spsn + flow->pkt) &
1728 HFI1_KDETH_BTH_SEQ_MASK));
1729 rreq->tid_flow_qp =
1730 cpu_to_be32(qpriv->tid_rdma.local.qp |
1731 ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
1732 TID_RDMA_DESTQP_FLOW_SHIFT) |
1733 qpriv->rcd->ctxt);
1734 rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
1735 *bth1 &= ~RVT_QPN_MASK;
1736 *bth1 |= remote->qp;
1737 *bth2 |= IB_BTH_REQ_ACK;
1738 rcu_read_unlock();
1739
1740 /* We are done with this segment */
1741 flow->sent += *len;
1742 req->cur_seg++;
1743 qp->s_state = TID_OP(READ_REQ);
1744 req->ack_pending++;
1745 req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
1746 qpriv->pending_tid_r_segs++;
1747 qp->s_num_rd_atomic++;
1748
1749 /* Set the TID RDMA READ request payload size */
1750 *len = req_len;
1751
1752 return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
1753}
1754
1755/*
1756 * @len: contains the data length to read upon entry and the read request
1757 * payload length upon exit.
1758 */
1759u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
1760 struct ib_other_headers *ohdr, u32 *bth1,
1761 u32 *bth2, u32 *len)
1762 __must_hold(&qp->s_lock)
1763{
1764 struct hfi1_qp_priv *qpriv = qp->priv;
1765 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1766 struct tid_rdma_flow *flow = NULL;
1767 u32 hdwords = 0;
1768 bool last;
1769 bool retry = true;
1770 u32 npkts = rvt_div_round_up_mtu(qp, *len);
1771
1772 trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
1773 wqe->lpsn, req);
1774 /*
1775 * Check sync conditions. Make sure that there are no pending
1776 * segments before freeing the flow.
1777 */
1778sync_check:
1779 if (req->state == TID_REQUEST_SYNC) {
1780 if (qpriv->pending_tid_r_segs)
1781 goto done;
1782
1783 hfi1_kern_clear_hw_flow(req->rcd, qp);
1784 req->state = TID_REQUEST_ACTIVE;
1785 }
1786
1787 /*
1788 * If the request for this segment is resent, the tid resources should
1789 * have been allocated before. In this case, req->flow_idx should
1790 * fall behind req->setup_head.
1791 */
1792 if (req->flow_idx == req->setup_head) {
1793 retry = false;
1794 if (req->state == TID_REQUEST_RESEND) {
1795 /*
1796 * This is the first new segment for a request whose
1797 * earlier segments have been re-sent. We need to
1798 * set up the sge pointer correctly.
1799 */
1800 restart_sge(&qp->s_sge, wqe, req->s_next_psn,
1801 qp->pmtu);
1802 req->isge = 0;
1803 req->state = TID_REQUEST_ACTIVE;
1804 }
1805
1806 /*
1807 * Check sync. The last PSN of each generation is reserved for
1808 * RESYNC.
1809 */
1810 if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
1811 req->state = TID_REQUEST_SYNC;
1812 goto sync_check;
1813 }
1814
1815 /* Allocate the flow if not yet */
1816 if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
1817 goto done;
1818
1819 /*
1820 * The following call will advance req->setup_head after
1821 * allocating the tid entries.
1822 */
1823 if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
1824 req->state = TID_REQUEST_QUEUED;
1825
1826 /*
1827 * We don't have resources for this segment. The QP has
1828 * already been queued.
1829 */
1830 goto done;
1831 }
1832 }
1833
1834 /* req->flow_idx should only be one slot behind req->setup_head */
1835 flow = &req->flows[req->flow_idx];
1836 flow->pkt = 0;
1837 flow->tid_idx = 0;
1838 flow->sent = 0;
1839 if (!retry) {
1840 /* Set the first and last IB PSN for the flow in use.*/
1841 flow->flow_state.ib_spsn = req->s_next_psn;
1842 flow->flow_state.ib_lpsn =
1843 flow->flow_state.ib_spsn + flow->npkts - 1;
1844 }
1845
1846 /* Calculate the next segment start psn.*/
1847 req->s_next_psn += flow->npkts;
1848
1849 /* Build the packet header */
1850 hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
1851done:
1852 return hdwords;
1853}
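
/*
 * Editor's sketch (not part of the driver): MAX_FLOWS is a power of two, so
 * the per-request flow ring indices (setup_head, flow_idx, clear_tail) wrap
 * with a simple mask, exactly as in the code above. Hypothetical helpers
 * showing only that arithmetic:
 */
static inline u16 example_flow_ring_next(u16 idx)
{
	return (idx + 1) & (MAX_FLOWS - 1);
}

static inline u16 example_flow_ring_occupancy(u16 setup_head, u16 clear_tail)
{
	/* number of flow slots that are set up but not yet cleared */
	return (setup_head - clear_tail) & (MAX_FLOWS - 1);
}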
1854
1855/*
1856 * Validate and accept the TID RDMA READ request parameters.
1857 * Return 0 if the request is accepted successfully;
1858 * Return 1 otherwise.
1859 */
1860static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
1861 struct rvt_ack_entry *e,
1862 struct hfi1_packet *packet,
1863 struct ib_other_headers *ohdr,
1864 u32 bth0, u32 psn, u64 vaddr, u32 len)
1865{
1866 struct hfi1_qp_priv *qpriv = qp->priv;
1867 struct tid_rdma_request *req;
1868 struct tid_rdma_flow *flow;
1869 u32 flow_psn, i, tidlen = 0, pktlen, tlen;
1870
1871 req = ack_to_tid_req(e);
1872
1873 /* Validate the payload first */
1874 flow = &req->flows[req->setup_head];
1875
1876 /* payload length = packet length - (header length + ICRC length) */
1877 pktlen = packet->tlen - (packet->hlen + 4);
1878 if (pktlen > sizeof(flow->tid_entry))
1879 return 1;
1880 memcpy(flow->tid_entry, packet->ebuf, pktlen);
1881 flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
1882
1883 /*
1884 * Walk the TID_ENTRY list to make sure we have enough space for a
1885 * complete segment. Also calculate the number of required packets.
1886 */
1887 flow->npkts = rvt_div_round_up_mtu(qp, len);
1888 for (i = 0; i < flow->tidcnt; i++) {
1889 trace_hfi1_tid_entry_rcv_read_req(qp, i,
1890 flow->tid_entry[i]);
1891 tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
1892 if (!tlen)
1893 return 1;
1894
1895 /*
1896 * For a tid pair (tidctrl == 3), the buffer size of the pair
1897 * should be the sum of the buffer size described by each
1898 * tid entry. However, only the first entry needs to be
1899 * specified in the request (see WFR HAS Section 8.5.7.1).
1900 */
1901 tidlen += tlen;
1902 }
1903 if (tidlen * PAGE_SIZE < len)
1904 return 1;
1905
1906 /* Empty the flow array */
1907 req->clear_tail = req->setup_head;
1908 flow->pkt = 0;
1909 flow->tid_idx = 0;
1910 flow->tid_offset = 0;
1911 flow->sent = 0;
1912 flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
1913 flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
1914 TID_RDMA_DESTQP_FLOW_MASK;
1915 flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
1916 flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
1917 flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
1918 flow->length = len;
1919
1920 flow->flow_state.lpsn = flow->flow_state.spsn +
1921 flow->npkts - 1;
1922 flow->flow_state.ib_spsn = psn;
1923 flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
1924
1925 trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
1926 /* Set the initial flow index to the current flow. */
1927 req->flow_idx = req->setup_head;
1928
1929 /* advance circular buffer head */
1930 req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
1931
1932 /*
1933 * Compute last PSN for request.
1934 */
1935 e->opcode = (bth0 >> 24) & 0xff;
1936 e->psn = psn;
1937 e->lpsn = psn + flow->npkts - 1;
1938 e->sent = 0;
1939
1940 req->n_flows = qpriv->tid_rdma.local.max_read;
1941 req->state = TID_REQUEST_ACTIVE;
1942 req->cur_seg = 0;
1943 req->comp_seg = 0;
1944 req->ack_seg = 0;
1945 req->isge = 0;
1946 req->seg_len = qpriv->tid_rdma.local.max_len;
1947 req->total_len = len;
1948 req->total_segs = 1;
1949 req->r_flow_psn = e->psn;
1950
1951 trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
1952 req);
1953 return 0;
1954}
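
/*
 * Editor's sketch (not part of the driver): the capacity check performed in
 * tid_rdma_rcv_read_request() above, reduced to its arithmetic. Each TID
 * entry advertises its length in pages, so the sum of the LEN fields times
 * PAGE_SIZE must cover the requested segment length. The helper below is a
 * hypothetical stand-alone restatement of that check.
 */
static inline bool example_tid_entries_cover_len(const u32 *entries, u32 tidcnt,
						 u32 len)
{
	u64 bytes = 0;
	u32 i;

	for (i = 0; i < tidcnt; i++) {
		u32 pages = EXP_TID_GET(entries[i], LEN);

		if (!pages)	/* a zero-length entry is invalid */
			return false;
		bytes += (u64)pages * PAGE_SIZE;
	}
	return bytes >= len;
}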
1955
1956static int tid_rdma_rcv_error(struct hfi1_packet *packet,
1957 struct ib_other_headers *ohdr,
1958 struct rvt_qp *qp, u32 psn, int diff)
1959{
1960 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1961 struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
1962 struct rvt_ack_entry *e;
1963 struct tid_rdma_request *req;
1964 unsigned long flags;
1965 u8 prev;
1966 bool old_req;
1967
1968 trace_hfi1_rsp_tid_rcv_error(qp, psn);
1969 trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
1970 if (diff > 0) {
1971 /* sequence error */
1972 if (!qp->r_nak_state) {
1973 ibp->rvp.n_rc_seqnak++;
1974 qp->r_nak_state = IB_NAK_PSN_ERROR;
1975 qp->r_ack_psn = qp->r_psn;
1976 rc_defered_ack(rcd, qp);
1977 }
1978 goto done;
1979 }
1980
1981 ibp->rvp.n_rc_dupreq++;
1982
1983 spin_lock_irqsave(&qp->s_lock, flags);
1984 e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
1985 if (!e || e->opcode != TID_OP(READ_REQ))
1986 goto unlock;
1987
1988 req = ack_to_tid_req(e);
1989 req->r_flow_psn = psn;
1990 trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
1991 if (e->opcode == TID_OP(READ_REQ)) {
1992 struct ib_reth *reth;
1993 u32 offset;
1994 u32 len;
1995 u32 rkey;
1996 u64 vaddr;
1997 int ok;
1998 u32 bth0;
1999
2000 reth = &ohdr->u.tid_rdma.r_req.reth;
2001 /*
2002 * The requester always restarts from the start of the original
2003 * request.
2004 */
2005 offset = delta_psn(psn, e->psn) * qp->pmtu;
2006 len = be32_to_cpu(reth->length);
2007 if (psn != e->psn || len != req->total_len)
2008 goto unlock;
2009
2010 if (e->rdma_sge.mr) {
2011 rvt_put_mr(e->rdma_sge.mr);
2012 e->rdma_sge.mr = NULL;
2013 }
2014
2015 rkey = be32_to_cpu(reth->rkey);
2016 vaddr = get_ib_reth_vaddr(reth);
2017
2018 qp->r_len = len;
2019 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
2020 IB_ACCESS_REMOTE_READ);
2021 if (unlikely(!ok))
2022 goto unlock;
2023
2024 /*
2025 * If all the response packets for the current request have
2026 * been sent out and this request is complete (old_request
2027 * == false), the TID flow may be unusable (the
2028 * req->clear_tail is advanced). However, when an earlier
2029 * request is received, this request will not be complete any
2030 * more (qp->s_tail_ack_queue is moved back, see below).
2031 * Consequently, we need to update the TID flow info every time
2032 * a duplicate request is received.
2033 */
2034 bth0 = be32_to_cpu(ohdr->bth[0]);
2035 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
2036 vaddr, len))
2037 goto unlock;
2038
2039 /*
2040 * True if the request is already scheduled (between
2041 * qp->s_tail_ack_queue and qp->r_head_ack_queue);
2042 */
2043 if (old_req)
2044 goto unlock;
2045 }
2046 /* Re-process old requests.*/
2047 if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
2048 qp->s_acked_ack_queue = prev;
2049 qp->s_tail_ack_queue = prev;
2050 /*
2051 * Since the qp->s_tail_ack_queue is modified, the
2052 * qp->s_ack_state must be changed to re-initialize
2053 * qp->s_ack_rdma_sge; Otherwise, we will end up in
2054 * wrong memory region.
2055 */
2056 qp->s_ack_state = OP(ACKNOWLEDGE);
2057 qp->r_state = e->opcode;
2058 qp->r_nak_state = 0;
2059 qp->s_flags |= RVT_S_RESP_PENDING;
2060 hfi1_schedule_send(qp);
2061unlock:
2062 spin_unlock_irqrestore(&qp->s_lock, flags);
2063done:
2064 return 1;
2065}
2066
2067void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
2068{
2069 /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
2070
2071 /*
2072 * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
2073 * (see hfi1_rc_rcv())
2074 * 2. Put TID RDMA READ REQ into the response queue (s_ack_queue)
2075 * - Setup struct tid_rdma_req with request info
2076 * - Initialize struct tid_rdma_flow info;
2077 * - Copy TID entries;
2078 * 3. Set the qp->s_ack_state.
2079 * 4. Set RVT_S_RESP_PENDING in s_flags.
2080 * 5. Kick the send engine (hfi1_schedule_send())
2081 */
2082 struct hfi1_ctxtdata *rcd = packet->rcd;
2083 struct rvt_qp *qp = packet->qp;
2084 struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
2085 struct ib_other_headers *ohdr = packet->ohdr;
2086 struct rvt_ack_entry *e;
2087 unsigned long flags;
2088 struct ib_reth *reth;
2089 struct hfi1_qp_priv *qpriv = qp->priv;
2090 u32 bth0, psn, len, rkey;
2091 bool is_fecn;
2092 u8 next;
2093 u64 vaddr;
2094 int diff;
2095 u8 nack_state = IB_NAK_INVALID_REQUEST;
2096
2097 bth0 = be32_to_cpu(ohdr->bth[0]);
2098 if (hfi1_ruc_check_hdr(ibp, packet))
2099 return;
2100
2101 is_fecn = process_ecn(qp, packet);
2102 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2103 trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
2104
2105 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2106 rvt_comm_est(qp);
2107
2108 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2109 goto nack_inv;
2110
2111 reth = &ohdr->u.tid_rdma.r_req.reth;
2112 vaddr = be64_to_cpu(reth->vaddr);
2113 len = be32_to_cpu(reth->length);
2114 /* The length needs to be in multiples of PAGE_SIZE */
2115 if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
2116 goto nack_inv;
2117
2118 diff = delta_psn(psn, qp->r_psn);
2119 if (unlikely(diff)) {
2120 if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
2121 return;
2122 goto send_ack;
2123 }
2124
2125 /* We've verified the request, insert it into the ack queue. */
2126 next = qp->r_head_ack_queue + 1;
2127 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
2128 next = 0;
2129 spin_lock_irqsave(&qp->s_lock, flags);
2130 if (unlikely(next == qp->s_tail_ack_queue)) {
2131 if (!qp->s_ack_queue[next].sent) {
2132 nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2133 goto nack_inv_unlock;
2134 }
2135 update_ack_queue(qp, next);
2136 }
2137 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2138 if (e->rdma_sge.mr) {
2139 rvt_put_mr(e->rdma_sge.mr);
2140 e->rdma_sge.mr = NULL;
2141 }
2142
2143 rkey = be32_to_cpu(reth->rkey);
2144 qp->r_len = len;
2145
2146 if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
2147 rkey, IB_ACCESS_REMOTE_READ)))
2148 goto nack_acc;
2149
2150 /* Accept the request parameters */
2151 if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
2152 len))
2153 goto nack_inv_unlock;
2154
2155 qp->r_state = e->opcode;
2156 qp->r_nak_state = 0;
2157 /*
2158 * We need to increment the MSN here instead of when we
2159 * finish sending the result since a duplicate request would
2160 * increment it more than once.
2161 */
2162 qp->r_msn++;
2163 qp->r_psn += e->lpsn - e->psn + 1;
2164
2165 qp->r_head_ack_queue = next;
2166
2167 /* Schedule the send tasklet. */
2168 qp->s_flags |= RVT_S_RESP_PENDING;
2169 hfi1_schedule_send(qp);
2170
2171 spin_unlock_irqrestore(&qp->s_lock, flags);
2172 if (is_fecn)
2173 goto send_ack;
2174 return;
2175
2176nack_inv_unlock:
2177 spin_unlock_irqrestore(&qp->s_lock, flags);
2178nack_inv:
2179 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2180 qp->r_nak_state = nack_state;
2181 qp->r_ack_psn = qp->r_psn;
2182 /* Queue NAK for later */
2183 rc_defered_ack(rcd, qp);
2184 return;
2185nack_acc:
2186 spin_unlock_irqrestore(&qp->s_lock, flags);
2187 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2188 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2189 qp->r_ack_psn = qp->r_psn;
2190send_ack:
2191 hfi1_send_rc_ack(packet, is_fecn);
2192}
2193
2194u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
2195 struct ib_other_headers *ohdr, u32 *bth0,
2196 u32 *bth1, u32 *bth2, u32 *len, bool *last)
2197{
2198 struct hfi1_ack_priv *epriv = e->priv;
2199 struct tid_rdma_request *req = &epriv->tid_req;
2200 struct hfi1_qp_priv *qpriv = qp->priv;
2201 struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
2202 u32 tidentry = flow->tid_entry[flow->tid_idx];
2203 u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
2204 struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
2205 u32 next_offset, om = KDETH_OM_LARGE;
2206 bool last_pkt;
2207 u32 hdwords = 0;
2208 struct tid_rdma_params *remote;
2209
2210 *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
2211 flow->sent += *len;
2212 next_offset = flow->tid_offset + *len;
2213 last_pkt = (flow->sent >= flow->length);
2214
2215 trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
2216 trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
2217
2218 rcu_read_lock();
2219 remote = rcu_dereference(qpriv->tid_rdma.remote);
2220 if (!remote) {
2221 rcu_read_unlock();
2222 goto done;
2223 }
2224 KDETH_RESET(resp->kdeth0, KVER, 0x1);
2225 KDETH_SET(resp->kdeth0, SH, !last_pkt);
2226 KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
2227 KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
2228 KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
2229 KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
2230 KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
2231 KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
2232 resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
2233 rcu_read_unlock();
2234
2235 resp->aeth = rvt_compute_aeth(qp);
2236 resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
2237 flow->pkt));
2238
2239 *bth0 = TID_OP(READ_RESP) << 24;
2240 *bth1 = flow->tid_qpn;
2241 *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
2242 HFI1_KDETH_BTH_SEQ_MASK) |
2243 (flow->flow_state.generation <<
2244 HFI1_KDETH_BTH_SEQ_SHIFT));
2245 *last = last_pkt;
2246 if (last_pkt)
2247 /* Advance to next flow */
2248 req->clear_tail = (req->clear_tail + 1) &
2249 (MAX_FLOWS - 1);
2250
2251 if (next_offset >= tidlen) {
2252 flow->tid_offset = 0;
2253 flow->tid_idx++;
2254 } else {
2255 flow->tid_offset = next_offset;
2256 }
2257
2258 hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
2259
2260done:
2261 return hdwords;
2262}
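
/*
 * Editor's sketch (not part of the driver): the KDETH OFFSET field written
 * above is scaled by an offset multiplier (OM). This builder always uses
 * KDETH_OM_LARGE, sets the OM flag accordingly, and stores tid_offset / om;
 * the receiver recovers the byte offset by multiplying again. A hypothetical
 * round trip of that encoding (byte_offset is assumed to be a multiple of om):
 */
static inline u32 example_kdeth_encode_offset(u32 byte_offset, u32 om)
{
	return byte_offset / om;
}

static inline u32 example_kdeth_decode_offset(u32 field, u32 om)
{
	return field * om;
}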
2263
2264static inline struct tid_rdma_request *
2265find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
2266 __must_hold(&qp->s_lock)
2267{
2268 struct rvt_swqe *wqe;
2269 struct tid_rdma_request *req = NULL;
2270 u32 i, end;
2271
2272 end = qp->s_cur + 1;
2273 if (end == qp->s_size)
2274 end = 0;
2275 for (i = qp->s_acked; i != end;) {
2276 wqe = rvt_get_swqe_ptr(qp, i);
2277 if (cmp_psn(psn, wqe->psn) >= 0 &&
2278 cmp_psn(psn, wqe->lpsn) <= 0) {
2279 if (wqe->wr.opcode == opcode)
2280 req = wqe_to_tid_req(wqe);
2281 break;
2282 }
2283 if (++i == qp->s_size)
2284 i = 0;
2285 }
2286
2287 return req;
2288}
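
/*
 * Editor's sketch (not part of the driver): the lookup above relies on
 * cmp_psn(), which compares 24-bit IB PSNs circularly. A hypothetical
 * equivalent sign-extends the 24-bit difference so that PSNs "ahead" of the
 * reference compare greater even across a wrap, and a range test follows
 * directly from it.
 */
static inline int example_cmp_psn(u32 a, u32 b)
{
	/* shift up, then arithmetic-shift down to sign-extend bit 23 */
	return (((int)a - (int)b) << 8) >> 8;
}

static inline bool example_psn_in_range(u32 psn, u32 first, u32 last)
{
	return example_cmp_psn(psn, first) >= 0 &&
	       example_cmp_psn(psn, last) <= 0;
}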
2289
2290void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
2291{
2292 /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side) */
2293
2294 /*
2295 * 1. Find matching SWQE
2296 * 2. Check that the entire segment has been read.
2297 * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
2298 * 4. Free the TID flow resources.
2299 * 5. Kick the send engine (hfi1_schedule_send())
2300 */
2301 struct ib_other_headers *ohdr = packet->ohdr;
2302 struct rvt_qp *qp = packet->qp;
2303 struct hfi1_qp_priv *priv = qp->priv;
2304 struct hfi1_ctxtdata *rcd = packet->rcd;
2305 struct tid_rdma_request *req;
2306 struct tid_rdma_flow *flow;
2307 u32 opcode, aeth;
2308 bool is_fecn;
2309 unsigned long flags;
2310 u32 kpsn, ipsn;
2311
2312 trace_hfi1_sender_rcv_tid_read_resp(qp);
2313 is_fecn = process_ecn(qp, packet);
2314 kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2315 aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
2316 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2317
2318 spin_lock_irqsave(&qp->s_lock, flags);
2319 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2320 req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
2321 if (unlikely(!req))
2322 goto ack_op_err;
2323
2324 flow = &req->flows[req->clear_tail];
2325 /* When header suppression is disabled */
2326 if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
2327 goto ack_done;
2328 req->ack_pending--;
2329 priv->pending_tid_r_segs--;
2330 qp->s_num_rd_atomic--;
2331 if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
2332 !qp->s_num_rd_atomic) {
2333 qp->s_flags &= ~(RVT_S_WAIT_FENCE |
2334 RVT_S_WAIT_ACK);
2335 hfi1_schedule_send(qp);
2336 }
2337 if (qp->s_flags & RVT_S_WAIT_RDMAR) {
2338 qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
2339 hfi1_schedule_send(qp);
2340 }
2341
2342 trace_hfi1_ack(qp, ipsn);
2343 trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
2344 req->e.swqe->psn, req->e.swqe->lpsn,
2345 req);
2346 trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
2347
2348 /* Release the tid resources */
2349 hfi1_kern_exp_rcv_clear(req);
2350
2351 if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
2352 goto ack_done;
2353
2354 /* If not done yet, build next read request */
2355 if (++req->comp_seg >= req->total_segs) {
2356 priv->tid_r_comp++;
2357 req->state = TID_REQUEST_COMPLETE;
2358 }
2359
2360 /*
2361 * Clear the hw flow under two conditions:
2362 * 1. This request is a sync point and it is complete;
2363 * 2. Current request is completed and there are no more requests.
2364 */
2365 if ((req->state == TID_REQUEST_SYNC &&
2366 req->comp_seg == req->cur_seg) ||
2367 priv->tid_r_comp == priv->tid_r_reqs) {
2368 hfi1_kern_clear_hw_flow(priv->rcd, qp);
2369 if (req->state == TID_REQUEST_SYNC)
2370 req->state = TID_REQUEST_ACTIVE;
2371 }
2372
2373 hfi1_schedule_send(qp);
2374 goto ack_done;
2375
2376ack_op_err:
2377 /*
2378 * The test (qp->s_last == qp->s_acked) indicates that the send engine has finished its cleanup
2379 * after sending the request and it's now safe to put the QP into error
2380 * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
2381 * == qp->s_head), it would be unsafe to complete the wqe pointed by
2382 * qp->s_acked here. Putting the qp into error state will safely flush
2383 * all remaining requests.
2384 */
2385 if (qp->s_last == qp->s_acked)
2386 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
2387
2388ack_done:
2389 spin_unlock_irqrestore(&qp->s_lock, flags);
2390 if (is_fecn)
2391 hfi1_send_rc_ack(packet, is_fecn);
2392}
2393
2394void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
2395 __must_hold(&qp->s_lock)
2396{
2397 u32 n = qp->s_acked;
2398 struct rvt_swqe *wqe;
2399 struct tid_rdma_request *req;
2400 struct hfi1_qp_priv *priv = qp->priv;
2401
2402 lockdep_assert_held(&qp->s_lock);
2403 /* Free any TID entries */
2404 while (n != qp->s_tail) {
2405 wqe = rvt_get_swqe_ptr(qp, n);
2406 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2407 req = wqe_to_tid_req(wqe);
2408 hfi1_kern_exp_rcv_clear_all(req);
2409 }
2410
2411 if (++n == qp->s_size)
2412 n = 0;
2413 }
2414 /* Free flow */
2415 hfi1_kern_clear_hw_flow(priv->rcd, qp);
2416}
2417
2418static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
2419 struct hfi1_packet *packet, u8 rcv_type,
2420 u8 opcode)
2421{
2422 struct rvt_qp *qp = packet->qp;
2423 u32 ipsn;
2424 struct ib_other_headers *ohdr = packet->ohdr;
2425
2426 if (rcv_type >= RHF_RCV_TYPE_IB)
2427 goto done;
2428
2429 spin_lock(&qp->s_lock);
2430 /*
2431 * For TID READ response, error out QP after freeing the tid
2432 * resources.
2433 */
2434 if (opcode == TID_OP(READ_RESP)) {
2435 ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2436 if (cmp_psn(ipsn, qp->s_last_psn) > 0 &&
2437 cmp_psn(ipsn, qp->s_psn) < 0) {
2438 hfi1_kern_read_tid_flow_free(qp);
2439 spin_unlock(&qp->s_lock);
2440 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2441 goto done;
2442 }
2443 }
2444
2445 spin_unlock(&qp->s_lock);
2446done:
2447 return true;
2448}
2449
2450static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
2451 struct rvt_qp *qp, struct rvt_swqe *wqe)
2452{
2453 struct tid_rdma_request *req;
2454 struct tid_rdma_flow *flow;
2455
2456 /* Start from the right segment */
2457 qp->r_flags |= RVT_R_RDMAR_SEQ;
2458 req = wqe_to_tid_req(wqe);
2459 flow = &req->flows[req->clear_tail];
2460 hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
2461 if (list_empty(&qp->rspwait)) {
2462 qp->r_flags |= RVT_R_RSP_SEND;
2463 rvt_get_qp(qp);
2464 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2465 }
2466}
2467
2468/*
2469 * Handle the KDETH eflags for TID RDMA READ response.
2470 *
2471 * Return false if the last packet for a segment has been received and it is
2472 * time to process the response normally; otherwise, return true.
2473 *
2474 * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
2475 */
2476static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2477 struct hfi1_packet *packet, u8 rcv_type,
2478 u8 rte, u32 psn, u32 ibpsn)
2479 __must_hold(&packet->qp->r_lock) __must_hold(RCU)
2480{
2481 struct hfi1_pportdata *ppd = rcd->ppd;
2482 struct hfi1_devdata *dd = ppd->dd;
2483 struct hfi1_ibport *ibp;
2484 struct rvt_swqe *wqe;
2485 struct tid_rdma_request *req;
2486 struct tid_rdma_flow *flow;
2487 u32 ack_psn;
2488 struct rvt_qp *qp = packet->qp;
2489 struct hfi1_qp_priv *priv = qp->priv;
2490 bool ret = true;
2491 int diff = 0;
2492 u32 fpsn;
2493
2494 lockdep_assert_held(&qp->r_lock);
2495 /* If the psn is out of valid range, drop the packet */
2496 if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
2497 cmp_psn(ibpsn, qp->s_psn) > 0)
2498 return ret;
2499
2500 spin_lock(&qp->s_lock);
2501 /*
2502 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
2503 * requests and implicitly NAK RDMA read and atomic requests issued
2504 * before the NAK'ed request.
2505 */
2506 ack_psn = ibpsn - 1;
2507 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2508 ibp = to_iport(qp->ibqp.device, qp->port_num);
2509
2510 /* Complete WQEs that the PSN finishes. */
2511 while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
2512 /*
2513 * If this request is a RDMA read or atomic, and the NACK is
2514 * for a later operation, this NACK NAKs the RDMA read or
2515 * atomic.
2516 */
2517 if (wqe->wr.opcode == IB_WR_RDMA_READ ||
2518 wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
2519 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2520 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
2521 /* Retry this request. */
2522 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
2523 qp->r_flags |= RVT_R_RDMAR_SEQ;
2524 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2525 restart_tid_rdma_read_req(rcd, qp,
2526 wqe);
2527 } else {
2528 hfi1_restart_rc(qp, qp->s_last_psn + 1,
2529 0);
2530 if (list_empty(&qp->rspwait)) {
2531 qp->r_flags |= RVT_R_RSP_SEND;
2532 rvt_get_qp(qp);
2533 list_add_tail(/* wait */
2534 &qp->rspwait,
2535 &rcd->qp_wait_list);
2536 }
2537 }
2538 }
2539 /*
2540 * No need to process the NAK since we are
2541 * restarting an earlier request.
2542 */
2543 break;
2544 }
2545
2546 wqe = do_rc_completion(qp, wqe, ibp);
2547 if (qp->s_acked == qp->s_tail)
2548 break;
2549 }
2550
2551 /* Handle the eflags for the request */
2552 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
2553 goto s_unlock;
2554
2555 req = wqe_to_tid_req(wqe);
2556 switch (rcv_type) {
2557 case RHF_RCV_TYPE_EXPECTED:
2558 switch (rte) {
2559 case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2560 /*
2561 * On the first occurrence of a Flow Sequence error,
2562 * the flag TID_FLOW_SW_PSN is set.
2563 *
2564 * After that, the flow is *not* reprogrammed and the
2565 * protocol falls back to SW PSN checking. This is done
2566 * to prevent continuous Flow Sequence errors for any
2567 * packets that could be still in the fabric.
2568 */
2569 flow = find_flow(req, psn, NULL);
2570 if (!flow) {
2571 /*
2572 * We can't find the IB PSN matching the
2573 * received KDETH PSN. The only thing we can
2574 * do at this point is report the error to
2575 * the QP.
2576 */
2577 hfi1_kern_read_tid_flow_free(qp);
2578 spin_unlock(&qp->s_lock);
2579 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2580 return ret;
2581 }
2582 if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
2583 diff = cmp_psn(psn,
2584 priv->flow_state.r_next_psn);
2585 if (diff > 0) {
2586 if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2587 restart_tid_rdma_read_req(rcd,
2588 qp,
2589 wqe);
2590
2591 /* Drop the packet.*/
2592 goto s_unlock;
2593 } else if (diff < 0) {
2594 /*
2595 * If a response packet for a restarted
2596 * request has come back, reset the
2597 * restart flag.
2598 */
2599 if (qp->r_flags & RVT_R_RDMAR_SEQ)
2600 qp->r_flags &=
2601 ~RVT_R_RDMAR_SEQ;
2602
2603 /* Drop the packet.*/
2604 goto s_unlock;
2605 }
2606
2607 /*
2608 * If SW PSN verification is successful and
2609 * this is the last packet in the segment, tell
2610 * the caller to process it as a normal packet.
2611 */
2612 fpsn = full_flow_psn(flow,
2613 flow->flow_state.lpsn);
2614 if (cmp_psn(fpsn, psn) == 0) {
2615 ret = false;
2616 if (qp->r_flags & RVT_R_RDMAR_SEQ)
2617 qp->r_flags &=
2618 ~RVT_R_RDMAR_SEQ;
2619 }
2620 priv->flow_state.r_next_psn++;
2621 } else {
2622 u64 reg;
2623 u32 last_psn;
2624
2625 /*
2626 * The only sane way to get the amount of
2627 * progress is to read the HW flow state.
2628 */
2629 reg = read_uctxt_csr(dd, rcd->ctxt,
2630 RCV_TID_FLOW_TABLE +
2631 (8 * flow->idx));
2632 last_psn = mask_psn(reg);
2633
2634 priv->flow_state.r_next_psn = last_psn;
2635 priv->flow_state.flags |= TID_FLOW_SW_PSN;
2636 /*
2637 * If no request has been restarted yet,
2638 * restart the current one.
2639 */
2640 if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2641 restart_tid_rdma_read_req(rcd, qp,
2642 wqe);
2643 }
2644
2645 break;
2646
2647 case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
2648 /*
2649 * Since the TID flow is able to ride through
2650 * generation mismatch, drop this stale packet.
2651 */
2652 break;
2653
2654 default:
2655 break;
2656 }
2657 break;
2658
2659 case RHF_RCV_TYPE_ERROR:
2660 switch (rte) {
2661 case RHF_RTE_ERROR_OP_CODE_ERR:
2662 case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
2663 case RHF_RTE_ERROR_KHDR_HCRC_ERR:
2664 case RHF_RTE_ERROR_KHDR_KVER_ERR:
2665 case RHF_RTE_ERROR_CONTEXT_ERR:
2666 case RHF_RTE_ERROR_KHDR_TID_ERR:
2667 default:
2668 break;
2669 }
2670 default:
2671 break;
2672 }
2673s_unlock:
2674 spin_unlock(&qp->s_lock);
2675 return ret;
2676}
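
/*
 * Editor's sketch (not part of the driver): the software PSN check used above
 * once TID_FLOW_SW_PSN is set, reduced to its decision table. seg_last_psn
 * stands for full_flow_psn(flow, flow->flow_state.lpsn); side effects such as
 * restarting the request or advancing r_next_psn are omitted. The return
 * convention matches handle_read_kdeth_eflags(): true means drop/handled,
 * false means hand the packet back for normal processing.
 */
static inline bool example_sw_psn_check(u32 psn, u32 r_next_psn, u32 seg_last_psn)
{
	int diff = cmp_psn(psn, r_next_psn);

	if (diff > 0)	/* a packet was lost: restart, then drop this one */
		return true;
	if (diff < 0)	/* stale packet from before a restart: drop it */
		return true;
	/* in sequence: only the last packet of the segment is handed back */
	return cmp_psn(psn, seg_last_psn) != 0;
}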
2677
2678bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2679 struct hfi1_pportdata *ppd,
2680 struct hfi1_packet *packet)
2681{
2682 struct hfi1_ibport *ibp = &ppd->ibport_data;
2683 struct hfi1_devdata *dd = ppd->dd;
2684 struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
2685 u8 rcv_type = rhf_rcv_type(packet->rhf);
2686 u8 rte = rhf_rcv_type_err(packet->rhf);
2687 struct ib_header *hdr = packet->hdr;
2688 struct ib_other_headers *ohdr = NULL;
2689 int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
2690 u16 lid = be16_to_cpu(hdr->lrh[1]);
2691 u8 opcode;
2692 u32 qp_num, psn, ibpsn;
2693 struct rvt_qp *qp;
2694 unsigned long flags;
2695 bool ret = true;
2696
2697 trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
2698 packet->rhf);
2699 if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
2700 return ret;
2701
2702 packet->ohdr = &hdr->u.oth;
2703 ohdr = packet->ohdr;
2704 trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
2705
2706 /* Get the destination QP number. */
2707 qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
2708 RVT_QPN_MASK;
2709 if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
2710 goto drop;
2711
2712 psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2713 opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2714
2715 rcu_read_lock();
2716 qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
2717 if (!qp)
2718 goto rcu_unlock;
2719
2720 packet->qp = qp;
2721
2722 /* Check for valid receive state. */
2723 spin_lock_irqsave(&qp->r_lock, flags);
2724 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
2725 ibp->rvp.n_pkt_drops++;
2726 goto r_unlock;
2727 }
2728
2729 if (packet->rhf & RHF_TID_ERR) {
2730 /* For TIDERR and RC QPs preemptively schedule a NAK */
2731 u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
2732
2733 /* Sanity check packet */
2734 if (tlen < 24)
2735 goto r_unlock;
2736
2737 /*
2738 * Check for GRH. We should never get packets with GRH in this
2739 * path.
2740 */
2741 if (lnh == HFI1_LRH_GRH)
2742 goto r_unlock;
2743
2744 if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode))
2745 goto r_unlock;
2746 }
2747
2748 /* handle TID RDMA READ */
2749 if (opcode == TID_OP(READ_RESP)) {
2750 ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
2751 ibpsn = mask_psn(ibpsn);
2752 ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
2753 ibpsn);
2754 }
2755
2756r_unlock:
2757 spin_unlock_irqrestore(&qp->r_lock, flags);
2758rcu_unlock:
2759 rcu_read_unlock();
2760drop:
2761 return ret;
2762}
2763
2764/*
2765 * "Rewind" the TID request information.
2766 * This means that we reset the state back to ACTIVE,
2767 * find the proper flow, set the flow index to that flow,
2768 * and reset the flow information.
2769 */
2770void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
2771 u32 *bth2)
2772{
2773 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
2774 struct tid_rdma_flow *flow;
2775 int diff;
2776 u32 tididx = 0;
2777 u16 fidx;
2778
2779 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2780 *bth2 = mask_psn(qp->s_psn);
2781 flow = find_flow_ib(req, *bth2, &fidx);
2782 if (!flow) {
2783 trace_hfi1_msg_tid_restart_req(/* msg */
2784 qp, "!!!!!! Could not find flow to restart: bth2 ",
2785 (u64)*bth2);
2786 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
2787 wqe->psn, wqe->lpsn,
2788 req);
2789 return;
2790 }
2791 } else {
2792 return;
2793 }
2794
2795 trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
2796 diff = delta_psn(*bth2, flow->flow_state.ib_spsn);
2797
2798 flow->sent = 0;
2799 flow->pkt = 0;
2800 flow->tid_idx = 0;
2801 flow->tid_offset = 0;
2802 if (diff) {
2803 for (tididx = 0; tididx < flow->tidcnt; tididx++) {
2804 u32 tidentry = flow->tid_entry[tididx], tidlen,
2805 tidnpkts, npkts;
2806
2807 flow->tid_offset = 0;
2808 tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
2809 tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
2810 npkts = min_t(u32, diff, tidnpkts);
2811 flow->pkt += npkts;
2812 flow->sent += (npkts == tidnpkts ? tidlen :
2813 npkts * qp->pmtu);
2814 flow->tid_offset += npkts * qp->pmtu;
2815 diff -= npkts;
2816 if (!diff)
2817 break;
2818 }
2819 }
2820
2821 if (flow->tid_offset ==
2822 EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
2823 tididx++;
2824 flow->tid_offset = 0;
2825 }
2826 flow->tid_idx = tididx;
2827 /* Move flow_idx to correct index */
2828 req->flow_idx = fidx;
2829
2830 trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
2831 trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
2832 wqe->lpsn, req);
2833 req->state = TID_REQUEST_ACTIVE;
2834}
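
/*
 * Editor's sketch (not part of the driver): the rewind loop above walks the
 * flow's TID entries one MTU-sized packet at a time. The per-entry packet
 * count it depends on is just a round-up division of the entry's byte length
 * (LEN is in pages) by the path MTU; a hypothetical restatement:
 */
static inline u32 example_tid_entry_npkts(u32 entry_pages, u32 pmtu)
{
	u32 tidlen = entry_pages * PAGE_SIZE;

	return (tidlen + pmtu - 1) / pmtu;	/* DIV_ROUND_UP(tidlen, pmtu) */
}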
2835
2836void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
2837{
2838 int i, ret;
2839 struct hfi1_qp_priv *qpriv = qp->priv;
2840 struct tid_flow_state *fs;
2841
2842 if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
2843 return;
2844
2845 /*
2846 * First, clear the flow to help prevent any delayed packets from
2847 * being delivered.
2848 */
2849 fs = &qpriv->flow_state;
2850 if (fs->index != RXE_NUM_TID_FLOWS)
2851 hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
2852
2853 for (i = qp->s_acked; i != qp->s_head;) {
2854 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
2855
2856 if (++i == qp->s_size)
2857 i = 0;
2858 /* Free only locally allocated TID entries */
2859 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
2860 continue;
2861 do {
2862 struct hfi1_swqe_priv *priv = wqe->priv;
2863
2864 ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
2865 } while (!ret);
2866 }
2867}
2868
2869bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
2870{
2871 struct rvt_swqe *prev;
2872 struct hfi1_qp_priv *priv = qp->priv;
2873 u32 s_prev;
2874
2875 s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
2876 prev = rvt_get_swqe_ptr(qp, s_prev);
2877
2878 switch (wqe->wr.opcode) {
2879 case IB_WR_SEND:
2880 case IB_WR_SEND_WITH_IMM:
2881 case IB_WR_SEND_WITH_INV:
2882 case IB_WR_ATOMIC_CMP_AND_SWP:
2883 case IB_WR_ATOMIC_FETCH_AND_ADD:
2884 case IB_WR_RDMA_WRITE:
2885 case IB_WR_RDMA_READ:
2886 break;
2887 case IB_WR_TID_RDMA_READ:
2888 switch (prev->wr.opcode) {
2889 case IB_WR_RDMA_READ:
2890 if (qp->s_acked != qp->s_cur)
2891 goto interlock;
2892 break;
2893 default:
2894 break;
2895 }
2896 default:
2897 break;
2898 }
2899 return false;
2900
2901interlock:
2902 priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
2903 return true;
2904}
2905
2906/* Does @sge meet the alignment requirements for tid rdma? */
2907static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
2908 struct rvt_sge *sge, int num_sge)
2909{
2910 int i;
2911
2912 for (i = 0; i < num_sge; i++, sge++) {
2913 trace_hfi1_sge_check_align(qp, i, sge);
2914 if ((u64)sge->vaddr & ~PAGE_MASK ||
2915 sge->sge_length & ~PAGE_MASK)
2916 return false;
2917 }
2918 return true;
2919}
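
/*
 * Editor's sketch (not part of the driver): the test above accepts an SGE
 * only when both its starting address and its length are page aligned, which
 * is what lets the buffer be described entirely by whole expected-receive
 * (TID) pages. A hypothetical single-SGE form of the same test:
 */
static inline bool example_sge_is_tid_capable(u64 vaddr, u32 length)
{
	return !(vaddr & ~PAGE_MASK) && !(length & ~PAGE_MASK);
}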
2920
2921void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
2922{
2923 struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
2924 struct hfi1_swqe_priv *priv = wqe->priv;
2925 struct tid_rdma_params *remote;
2926 enum ib_wr_opcode new_opcode;
2927 bool do_tid_rdma = false;
2928 struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
2929
2930 if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
2931 ppd->lid)
2932 return;
2933 if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
2934 return;
2935
2936 rcu_read_lock();
2937 remote = rcu_dereference(qpriv->tid_rdma.remote);
2938 /*
2939 * If TID RDMA is disabled by the negotiation, don't
2940 * use it.
2941 */
2942 if (!remote)
2943 goto exit;
2944
2945 if (wqe->wr.opcode == IB_WR_RDMA_READ) {
2946 if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
2947 wqe->wr.num_sge)) {
2948 new_opcode = IB_WR_TID_RDMA_READ;
2949 do_tid_rdma = true;
2950 }
2951 }
2952
2953 if (do_tid_rdma) {
2954 if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
2955 goto exit;
2956 wqe->wr.opcode = new_opcode;
2957 priv->tid_req.seg_len =
2958 min_t(u32, remote->max_len, wqe->length);
2959 priv->tid_req.total_segs =
2960 DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
2961 /* Compute the last PSN of the request */
2962 wqe->lpsn = wqe->psn;
2963 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2964 priv->tid_req.n_flows = remote->max_read;
2965 qpriv->tid_r_reqs++;
2966 wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
2967 }
2968
2969 priv->tid_req.cur_seg = 0;
2970 priv->tid_req.comp_seg = 0;
2971 priv->tid_req.ack_seg = 0;
2972 priv->tid_req.state = TID_REQUEST_INACTIVE;
2973 trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
2974 wqe->psn, wqe->lpsn,
2975 &priv->tid_req);
2976 }
2977exit:
2978 rcu_read_unlock();
2979}
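
/*
 * Editor's sketch (not part of the driver): the segmentation arithmetic used
 * in setup_tid_rdma_wqe() above. A request is cut into segments no larger
 * than the negotiated remote max_len, and a TID RDMA READ consumes one IB PSN
 * per MTU-sized packet of the whole request. Parameter names are hypothetical.
 */
static inline void example_tid_read_segmentation(u32 length, u32 remote_max_len,
						 u32 pmtu, u32 first_psn,
						 u32 *seg_len, u32 *total_segs,
						 u32 *last_psn)
{
	*seg_len = remote_max_len < length ? remote_max_len : length;
	*total_segs = (length + *seg_len - 1) / *seg_len;	/* DIV_ROUND_UP */
	*last_psn = first_psn + (length + pmtu - 1) / pmtu - 1;
}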
2980
2981/* TID RDMA WRITE functions */
2982
2983u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
2984 struct ib_other_headers *ohdr,
2985 u32 *bth1, u32 *bth2, u32 *len)
2986{
2987 struct hfi1_qp_priv *qpriv = qp->priv;
2988 struct tid_rdma_request *req = wqe_to_tid_req(wqe);
2989 struct tid_rdma_params *remote;
2990
2991 rcu_read_lock();
2992 remote = rcu_dereference(qpriv->tid_rdma.remote);
2993 /*
2994 * Set the number of flow to be used based on negotiated
2995 * parameters.
2996 */
2997 req->n_flows = remote->max_write;
2998 req->state = TID_REQUEST_ACTIVE;
2999
3000 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
3001 KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
3002 ohdr->u.tid_rdma.w_req.reth.vaddr =
3003 cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
3004 ohdr->u.tid_rdma.w_req.reth.rkey =
3005 cpu_to_be32(wqe->rdma_wr.rkey);
3006 ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
3007 ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
3008 *bth1 &= ~RVT_QPN_MASK;
3009 *bth1 |= remote->qp;
3010 qp->s_state = TID_OP(WRITE_REQ);
3011 qp->s_flags |= HFI1_S_WAIT_TID_RESP;
3012 *bth2 |= IB_BTH_REQ_ACK;
3013 *len = 0;
3014
3015 rcu_read_unlock();
3016 return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
3017}