/*
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x
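/*
 * e.g. OP(SEND_ONLY) expands to IB_OPCODE_RC_SEND_ONLY, the 8-bit RC
 * opcode carried in the high byte of bth[0].
 */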

static void rc_timeout(unsigned long arg);

static u32 restart_sge(struct hfi1_sge_state *ss, struct hfi1_swqe *wqe,
		       u32 psn, u32 pmtu)
{
	u32 len;

	len = delta_psn(psn, wqe->psn) * pmtu;
	ss->sge = wqe->sg_list[0];
	ss->sg_list = wqe->sg_list + 1;
	ss->num_sge = wqe->wr.num_sge;
	ss->total_len = wqe->length;
	hfi1_skip_sge(ss, len, 0);
	return wqe->length - len;
}

static void start_timer(struct hfi1_qp *qp)
{
	qp->s_flags |= HFI1_S_TIMER;
	qp->s_timer.function = rc_timeout;
	/* 4.096 usec. * (1 << qp->timeout) */
	qp->s_timer.expires = jiffies + qp->timeout_jiffies;
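	/*
	 * e.g. with qp->timeout == 14 this comes to
	 * 4.096 usec * 2^14 ~= 67 msec before the retry fires.
	 */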
	add_timer(&qp->s_timer);
}

/**
 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @dev: the device for this QP
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp,
		       struct hfi1_other_headers *ohdr, u32 pmtu)
{
	struct hfi1_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	int middle = 0;

	/* Don't send an ACK if we aren't supposed to. */
	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
		goto bail;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->rdma_sge.mr) {
			hfi1_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		/* FALLTHROUGH */
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 */
		if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
			qp->s_tail_ack_queue = 0;
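		/*
		 * s_ack_queue has HFI1_MAX_RDMA_ATOMIC + 1 slots; the
		 * spare slot lets head == tail mean "empty" rather than
		 * "full", hence the wrap at > rather than >=.
		 */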
		/* FALLTHROUGH */
	case OP(SEND_ONLY):
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & HFI1_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/*
			 * If a RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester re-sends it.
			 */
			len = e->rdma_sge.sge_length;
			if (len && !e->rdma_sge.mr) {
				qp->s_tail_ack_queue = qp->r_head_ack_queue;
				goto bail;
			}
			/* Copy SGE state in case we need to resend */
			qp->s_rdma_mr = e->rdma_sge.mr;
			if (qp->s_rdma_mr)
				hfi1_get_mr(qp->s_rdma_mr);
			qp->s_ack_rdma_sge.sge = e->rdma_sge;
			qp->s_ack_rdma_sge.num_sge = 1;
			qp->s_cur_sge = &qp->s_ack_rdma_sge;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				e->sent = 1;
			}
			ohdr->u.aeth = hfi1_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = mask_psn(qp->s_ack_rdma_psn++);
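			/*
			 * Note: PSNs are only 24 bits on the wire, so
			 * mask_psn() truncates the 32-bit running counter
			 * before it is placed in bth[2].
			 */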
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			qp->s_cur_sge = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = hfi1_compute_aeth(qp);
			ohdr->u.at.atomic_ack_eth[0] =
				cpu_to_be32(e->atomic_data >> 32);
			ohdr->u.at.atomic_ack_eth[1] =
				cpu_to_be32(e->atomic_data);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = mask_psn(e->psn);
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		qp->s_cur_sge = &qp->s_ack_rdma_sge;
		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
		if (qp->s_rdma_mr)
			hfi1_get_mr(qp->s_rdma_mr);
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
		} else {
			ohdr->u.aeth = hfi1_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = mask_psn(qp->s_ack_rdma_psn++);
		break;

	default:
normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_flags &= ~HFI1_S_ACK_PENDING;
		qp->s_cur_sge = NULL;
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
					    (qp->s_nak_state <<
					     HFI1_AETH_CREDIT_SHIFT));
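		/*
		 * AETH layout: the byte above HFI1_AETH_CREDIT_SHIFT
		 * carries the syndrome (NAK code or credit count); the
		 * low 24 bits carry the responder's MSN.
		 */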
		else
			ohdr->u.aeth = hfi1_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = mask_psn(qp->s_ack_psn);
	}
	qp->s_rdma_ack_cnt++;
	qp->s_hdrwords = hwords;
	qp->s_cur_size = len;
	hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle);
	return 1;

bail:
	qp->s_ack_state = OP(ACKNOWLEDGE);
	/*
	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
	 * HFI1_S_RESP_PENDING
	 */
	smp_wmb();
	qp->s_flags &= ~(HFI1_S_RESP_PENDING
				| HFI1_S_ACK_PENDING
				| HFI1_S_AHG_VALID);
	return 0;
}

/**
 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 *
 * Return 1 if constructed; otherwise, return 0.
 */
int hfi1_make_rc_req(struct hfi1_qp *qp)
{
	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
	struct hfi1_other_headers *ohdr;
	struct hfi1_sge_state *ss;
	struct hfi1_swqe *wqe;
	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	u32 hwords = 5;
	u32 len;
	u32 bth0 = 0;
	u32 bth2;
	u32 pmtu = qp->pmtu;
	char newreq;
	unsigned long flags;
	int ret = 0;
	int middle = 0;
	int delta;

	ohdr = &qp->s_hdr->ibh.u.oth;
	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
		ohdr = &qp->s_hdr->ibh.u.l.oth;

	/*
	 * The lock is needed to synchronize between the sending tasklet,
	 * the receive interrupt handler, and timeout re-sends.
	 */
	spin_lock_irqsave(&qp->s_lock, flags);

	/* Sending responses has higher priority than sending requests. */
	if ((qp->s_flags & HFI1_S_RESP_PENDING) &&
	    make_rc_ack(dev, qp, ohdr, pmtu))
		goto done;

	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) {
		if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
			goto bail;
		/* We are in the error state, flush the work request. */
		if (qp->s_last == qp->s_head)
			goto bail;
		/* If DMAs are in progress, we can't flush immediately. */
		if (atomic_read(&qp->s_iowait.sdma_busy)) {
			qp->s_flags |= HFI1_S_WAIT_DMA;
			goto bail;
		}
		wqe = get_swqe_ptr(qp, qp->s_last);
		hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
		/* will get called again */
		goto done;
	}

	if (qp->s_flags & (HFI1_S_WAIT_RNR | HFI1_S_WAIT_ACK))
		goto bail;

	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
		if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
			qp->s_flags |= HFI1_S_WAIT_PSN;
			goto bail;
		}
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
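		/*
		 * psn/hpsn now describe an empty range (hpsn == psn - 1),
		 * i.e. no packets from an earlier pass remain in flight.
		 */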
	}

	/* Send a request. */
	wqe = get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK))
			goto bail;
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == qp->s_head)
				goto bail;
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic) {
				qp->s_flags |= HFI1_S_WAIT_FENCE;
				goto bail;
			}
			wqe->psn = qp->s_next_psn;
			newreq = 1;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = mask_psn(qp->s_psn);
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
			/* If no credit, return. */
			if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
			    cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
				goto bail;
			}
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
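				/*
				 * One PSN is consumed per pMTU-sized
				 * fragment, so lpsn becomes the PSN of
				 * the last packet of this request.
				 */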
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND)
				qp->s_state = OP(SEND_ONLY);
			else {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			if (newreq && !(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			/* FALLTHROUGH */
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
			    cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
				goto bail;
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->rdma_wr.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
				qp->s_state = OP(RDMA_WRITE_ONLY);
			else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= IB_BTH_SOLICITED;
			}
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= HFI1_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
				/*
				 * Adjust s_next_psn to count the
				 * expected number of responses.
				 */
				if (len > pmtu)
					qp->s_next_psn += (len - 1) / pmtu;
				wqe->lpsn = qp->s_next_psn++;
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->rdma_wr.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= HFI1_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
				wqe->lpsn = wqe->psn;
			}
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
				qp->s_state = OP(COMPARE_SWAP);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->atomic_wr.swap);
				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
					wqe->atomic_wr.compare_add);
			} else {
				qp->s_state = OP(FETCH_ADD);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->atomic_wr.compare_add);
				ohdr->u.atomic_eth.compare_data = 0;
			}
			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
				wqe->atomic_wr.remote_addr >> 32);
			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
				wqe->atomic_wr.remote_addr);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->atomic_wr.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_sge.total_len = wqe->length;
		qp->s_len = wqe->length;
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			qp->s_psn = wqe->lpsn + 1;
		else {
			qp->s_psn++;
			if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
				qp->s_next_psn = qp->s_psn;
		}
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
		 * thread to indicate a SEND needs to be restarted from an
		 * earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND)
			qp->s_state = OP(SEND_LAST);
		else {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= IB_BTH_SOLICITED;
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
		 * thread to indicate a RDMA write needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
			qp->s_state = OP(RDMA_WRITE_LAST);
		else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
		}
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
		 * thread to indicate a RDMA read needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
		ohdr->u.rc.reth.vaddr =
			cpu_to_be64(wqe->rdma_wr.remote_addr + len);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->rdma_wr.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	}
	qp->s_sending_hpsn = bth2;
	delta = delta_psn(bth2, wqe->psn);
	if (delta && delta % HFI1_PSN_CREDIT == 0)
		bth2 |= IB_BTH_REQ_ACK;
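	/*
	 * Requesting an ACK every HFI1_PSN_CREDIT packets lets s_acked
	 * advance on long transfers instead of only at the end of the
	 * message.
	 */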
	if (qp->s_flags & HFI1_S_SEND_ONE) {
		qp->s_flags &= ~HFI1_S_SEND_ONE;
		qp->s_flags |= HFI1_S_WAIT_ACK;
		bth2 |= IB_BTH_REQ_ACK;
	}
	qp->s_len -= len;
	qp->s_hdrwords = hwords;
	qp->s_cur_sge = ss;
	qp->s_cur_size = len;
	hfi1_make_ruc_header(
		qp,
		ohdr,
		bth0 | (qp->s_state << 24),
		bth2,
		middle);
done:
	ret = 1;
	goto unlock;

bail:
	qp->s_flags &= ~HFI1_S_BUSY;
unlock:
	spin_unlock_irqrestore(&qp->s_lock, flags);
	return ret;
}

/**
 * hfi1_send_rc_ack - Construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and tasklet.
 */
void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp,
		      int is_fecn)
{
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	u64 pbc, pbc_flags = 0;
	u16 lrh0;
	u16 sc5;
	u32 bth0;
	u32 hwords;
	u32 vl, plen;
	struct send_context *sc;
	struct pio_buf *pbuf;
	struct hfi1_ib_header hdr;
	struct hfi1_other_headers *ohdr;
	unsigned long flags;

	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
	if (qp->s_flags & HFI1_S_RESP_PENDING)
		goto queue_ack;

	/* Ensure s_rdma_ack_cnt changes are committed */
	smp_read_barrier_depends();
	if (qp->s_rdma_ack_cnt)
		goto queue_ack;

	/* Construct the header */
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
	hwords = 6;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
					&qp->remote_ah_attr.grh, hwords, 0);
		ohdr = &hdr.u.l.oth;
		lrh0 = HFI1_LRH_GRH;
	} else {
		ohdr = &hdr.u.oth;
		lrh0 = HFI1_LRH_BTH;
	}
	/* read pkey_index w/o lock (it's atomic) */
	bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
	if (qp->s_mig_state == IB_MIG_MIGRATED)
		bth0 |= IB_BTH_MIG_REQ;
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
					   (qp->r_nak_state <<
					    HFI1_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = hfi1_compute_aeth(qp);
	sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
	/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
	pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
	lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
	hdr.lrh[0] = cpu_to_be16(lrh0);
	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
	hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
	ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));

	/* Don't try to send ACKs if the link isn't ACTIVE */
	if (driver_lstate(ppd) != IB_PORT_ACTIVE)
		return;

	sc = rcd->sc;
	plen = 2 /* PBC */ + hwords;
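	/* plen is in 32-bit dwords; the 8-byte PBC accounts for 2 of them. */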
	vl = sc_to_vlt(ppd->dd, sc5);
	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);

	pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
	if (!pbuf) {
		/*
		 * We have no room to send at the moment. Pass
		 * responsibility for sending the ACK to the send tasklet
		 * so that when enough buffer space becomes available,
		 * the ACK is sent ahead of other outgoing packets.
		 */
		goto queue_ack;
	}

	trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);

	/* write the pbc and data */
	ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);

	return;

queue_ack:
	this_cpu_inc(*ibp->rc_qacks);
	spin_lock_irqsave(&qp->s_lock, flags);
	qp->s_flags |= HFI1_S_ACK_PENDING | HFI1_S_RESP_PENDING;
	qp->s_nak_state = qp->r_nak_state;
	qp->s_ack_psn = qp->r_ack_psn;
	if (is_fecn)
		qp->s_flags |= HFI1_S_ECN;

	/* Schedule the send tasklet. */
	hfi1_schedule_send(qp);
	spin_unlock_irqrestore(&qp->s_lock, flags);
}

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from hfi1_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct hfi1_qp *qp, u32 psn)
{
	u32 n = qp->s_acked;
	struct hfi1_swqe *wqe = get_swqe_ptr(qp, n);
	u32 opcode;

	qp->s_cur = n;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (cmp_psn(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}

	/* Find the work request opcode corresponding to the given PSN. */
	opcode = wqe->wr.opcode;
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = get_swqe_ptr(qp, n);
		diff = cmp_psn(psn, wqe->psn);
		if (diff < 0)
			break;
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}
		opcode = wqe->wr.opcode;
	}

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See hfi1_make_rc_req().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since there is only ever
		 * one outstanding RDMA read or atomic op at a time.
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	qp->s_psn = psn;
	/*
	 * Set HFI1_S_WAIT_PSN as rc_complete() may start the timer
	 * asynchronously before the send tasklet can get scheduled.
	 * Doing it in hfi1_make_rc_req() is too late.
	 */
	if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
		qp->s_flags |= HFI1_S_WAIT_PSN;
	qp->s_flags &= ~HFI1_S_AHG_VALID;
}

/*
 * Back up requester to resend the last un-ACKed request.
 * The QP r_lock and s_lock should be held and interrupts disabled.
 */
static void restart_rc(struct hfi1_qp *qp, u32 psn, int wait)
{
	struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
	struct hfi1_ibport *ibp;

	if (qp->s_retry == 0) {
		if (qp->s_mig_state == IB_MIG_ARMED) {
			hfi1_migrate_qp(qp);
			qp->s_retry = qp->s_retry_cnt;
		} else if (qp->s_last == qp->s_acked) {
			hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
			hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			return;
		} else /* need to handle delayed completion */
			return;
	} else
		qp->s_retry--;

	ibp = to_iport(qp->ibqp.device, qp->port_num);
	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		ibp->n_rc_resends++;
	else
		ibp->n_rc_resends += delta_psn(qp->s_psn, psn);

	qp->s_flags &= ~(HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR |
			 HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_PSN |
			 HFI1_S_WAIT_ACK);
	if (wait)
		qp->s_flags |= HFI1_S_SEND_ONE;
	reset_psn(qp, psn);
}

/*
 * This is called from s_timer for missing responses.
 */
static void rc_timeout(unsigned long arg)
{
	struct hfi1_qp *qp = (struct hfi1_qp *)arg;
	struct hfi1_ibport *ibp;
	unsigned long flags;

	spin_lock_irqsave(&qp->r_lock, flags);
	spin_lock(&qp->s_lock);
	if (qp->s_flags & HFI1_S_TIMER) {
		ibp = to_iport(qp->ibqp.device, qp->port_num);
		ibp->n_rc_timeouts++;
		qp->s_flags &= ~HFI1_S_TIMER;
		del_timer(&qp->s_timer);
		trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1);
		restart_rc(qp, qp->s_last_psn + 1, 1);
		hfi1_schedule_send(qp);
	}
	spin_unlock(&qp->s_lock);
	spin_unlock_irqrestore(&qp->r_lock, flags);
}

/*
 * This is called from s_timer for RNR timeouts.
 */
void hfi1_rc_rnr_retry(unsigned long arg)
{
	struct hfi1_qp *qp = (struct hfi1_qp *)arg;
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);
	if (qp->s_flags & HFI1_S_WAIT_RNR) {
		qp->s_flags &= ~HFI1_S_WAIT_RNR;
		del_timer(&qp->s_timer);
		hfi1_schedule_send(qp);
	}
	spin_unlock_irqrestore(&qp->s_lock, flags);
}

/*
 * Set qp->s_sending_psn to the next PSN after the given one.
 * This would be psn+1 except when RDMA reads are present.
 */
static void reset_sending_psn(struct hfi1_qp *qp, u32 psn)
{
	struct hfi1_swqe *wqe;
	u32 n = qp->s_last;

	/* Find the work request corresponding to the given PSN. */
	for (;;) {
		wqe = get_swqe_ptr(qp, n);
		if (cmp_psn(psn, wqe->lpsn) <= 0) {
			if (wqe->wr.opcode == IB_WR_RDMA_READ)
				qp->s_sending_psn = wqe->lpsn + 1;
			else
				qp->s_sending_psn = psn + 1;
			break;
		}
		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
	}
}

/*
 * This should be called with the QP s_lock held and interrupts disabled.
 */
void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr)
{
	struct hfi1_other_headers *ohdr;
	struct hfi1_swqe *wqe;
	struct ib_wc wc;
	unsigned i;
	u32 opcode;
	u32 psn;

	if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
		return;

	/* Find out where the BTH is */
	if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
		ohdr = &hdr->u.oth;
	else
		ohdr = &hdr->u.l.oth;

	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		WARN_ON(!qp->s_rdma_ack_cnt);
		qp->s_rdma_ack_cnt--;
		return;
	}

	psn = be32_to_cpu(ohdr->bth[2]);
	reset_sending_psn(qp, psn);

	/*
	 * Start timer after a packet requesting an ACK has been sent and
	 * there are still requests that haven't been acked.
	 */
	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
	    !(qp->s_flags &
	      (HFI1_S_TIMER | HFI1_S_WAIT_RNR | HFI1_S_WAIT_PSN)) &&
	    (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
		start_timer(qp);

	while (qp->s_last != qp->s_acked) {
		wqe = get_swqe_ptr(qp, qp->s_last);
		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
			break;
		for (i = 0; i < wqe->wr.num_sge; i++) {
			struct hfi1_sge *sge = &wqe->sg_list[i];

			hfi1_put_mr(sge->mr);
		}
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			memset(&wc, 0, sizeof(wc));
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
			wc.byte_len = wqe->length;
			wc.qp = &qp->ibqp;
			hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
		}
		if (++qp->s_last >= qp->s_size)
			qp->s_last = 0;
	}
	/*
	 * If we were waiting for sends to complete before re-sending,
	 * and they are now complete, restart sending.
	 */
	trace_hfi1_rc_sendcomplete(qp, psn);
	if (qp->s_flags & HFI1_S_WAIT_PSN &&
	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		qp->s_flags &= ~HFI1_S_WAIT_PSN;
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
		hfi1_schedule_send(qp);
	}
}

static inline void update_last_psn(struct hfi1_qp *qp, u32 psn)
{
	qp->s_last_psn = psn;
}

/*
 * Generate a SWQE completion.
 * This is similar to hfi1_send_complete but has to check to be sure
 * that the SGEs are not being referenced if the SWQE is being resent.
 */
static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp,
					  struct hfi1_swqe *wqe,
					  struct hfi1_ibport *ibp)
{
	struct ib_wc wc;
	unsigned i;

	/*
	 * Don't decrement refcount and don't generate a
	 * completion if the SWQE is being resent until the send
	 * is finished.
	 */
	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		for (i = 0; i < wqe->wr.num_sge; i++) {
			struct hfi1_sge *sge = &wqe->sg_list[i];

			hfi1_put_mr(sge->mr);
		}
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			memset(&wc, 0, sizeof(wc));
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
			wc.byte_len = wqe->length;
			wc.qp = &qp->ibqp;
			hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
		}
		if (++qp->s_last >= qp->s_size)
			qp->s_last = 0;
	} else {
		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);

		this_cpu_inc(*ibp->rc_delayed_comp);
		/*
		 * If send progress not running attempt to progress
		 * SDMA queue.
		 */
		if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
			struct sdma_engine *engine;
			u8 sc5;

			/* For now use sc to find engine */
			sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
			engine = qp_to_sdma_engine(qp, sc5);
			sdma_engine_progress_schedule(engine);
		}
	}

	qp->s_retry = qp->s_retry_cnt;
	update_last_psn(qp, wqe->lpsn);

	/*
	 * If we are completing a request which is in the process of
	 * being resent, we can stop re-sending it since we know the
	 * responder has already seen it.
	 */
	if (qp->s_acked == qp->s_cur) {
		if (++qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		qp->s_acked = qp->s_cur;
		wqe = get_swqe_ptr(qp, qp->s_cur);
		if (qp->s_acked != qp->s_tail) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		}
	} else {
		if (++qp->s_acked >= qp->s_size)
			qp->s_acked = 0;
		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
			qp->s_cur++;
		wqe = get_swqe_ptr(qp, qp->s_acked);
	}
	return wqe;
}

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * May be called at interrupt level, with the QP s_lock held.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode,
		     u64 val, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp;
	enum ib_wc_status status;
	struct hfi1_swqe *wqe;
	int ret = 0;
	u32 ack_psn;
	int diff;

	/* Remove QP from retry timer */
	if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
		qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
		del_timer(&qp->s_timer);
	}

	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request. The MSN won't include the NAK'ed
	 * request but will include an ACK'ed request(s).
	 */
	ack_psn = psn;
	if (aeth >> 29)
		ack_psn--;
	wqe = get_swqe_ptr(qp, qp->s_acked);
	ibp = to_iport(qp->ibqp.device, qp->port_num);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
		/*
		 * RDMA_READ_RESPONSE_ONLY is a special case since
		 * we want to generate completion events for everything
		 * before the RDMA read, copy the data, then generate
		 * the completion for the read.
		 */
		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
		    diff == 0) {
			ret = 1;
			goto bail;
		}
		/*
		 * If this request is a RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic. In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK a RDMA read and likewise for atomic ops. Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
			/* Retry this request. */
			if (!(qp->r_flags & HFI1_R_RDMAR_SEQ)) {
				qp->r_flags |= HFI1_R_RDMAR_SEQ;
				restart_rc(qp, qp->s_last_psn + 1, 0);
				if (list_empty(&qp->rspwait)) {
					qp->r_flags |= HFI1_R_RSP_SEND;
					atomic_inc(&qp->refcount);
					list_add_tail(&qp->rspwait,
						      &rcd->qp_wait_list);
				}
			}
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail;
		}
		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
			u64 *vaddr = wqe->sg_list[0].vaddr;
			*vaddr = val;
		}
		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & HFI1_S_WAIT_FENCE) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~(HFI1_S_WAIT_FENCE |
						 HFI1_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			} else if (qp->s_flags & HFI1_S_WAIT_RDMAR) {
				qp->s_flags &= ~(HFI1_S_WAIT_RDMAR |
						 HFI1_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			}
		}
		wqe = do_rc_completion(qp, wqe, ibp);
		if (qp->s_acked == qp->s_tail)
			break;
	}

	switch (aeth >> 29) {
	case 0:		/* ACK */
		this_cpu_inc(*ibp->rc_acks);
		if (qp->s_acked != qp->s_tail) {
			/*
			 * We are expecting more ACKs so
			 * reset the re-transmit timer.
			 */
			start_timer(qp);
			/*
			 * We can stop re-sending the earlier packets and
			 * continue with the next packet the receiver wants.
			 */
			if (cmp_psn(qp->s_psn, psn) <= 0)
				reset_psn(qp, psn + 1);
		} else if (cmp_psn(qp->s_psn, psn) <= 0) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = psn + 1;
		}
		if (qp->s_flags & HFI1_S_WAIT_ACK) {
			qp->s_flags &= ~HFI1_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}
		hfi1_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		update_last_psn(qp, psn);
		ret = 1;
		goto bail;

	case 1:		/* RNR NAK */
		ibp->n_rnr_naks++;
		if (qp->s_acked == qp->s_tail)
			goto bail;
		if (qp->s_flags & HFI1_S_WAIT_RNR)
			goto bail;
		if (qp->s_rnr_retry == 0) {
			status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
			qp->s_rnr_retry--;

		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);

		ibp->n_rc_resends += delta_psn(qp->s_psn, psn);

		reset_psn(qp, psn);

		qp->s_flags &= ~(HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_ACK);
		qp->s_flags |= HFI1_S_WAIT_RNR;
		qp->s_timer.function = hfi1_rc_rnr_retry;
		qp->s_timer.expires = jiffies + usecs_to_jiffies(
			ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
					  HFI1_AETH_CREDIT_MASK]);
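		/*
		 * The RNR timer field extracted from the AETH syndrome
		 * indexes ib_hfi1_rnr_table[], which gives the delay in
		 * usec before this request may be resent.
		 */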
		add_timer(&qp->s_timer);
		goto bail;

	case 3:		/* NAK */
		if (qp->s_acked == qp->s_tail)
			goto bail;
		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);
		switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
			HFI1_AETH_CREDIT_MASK) {
		case 0:	/* PSN sequence error */
			ibp->n_seq_naks++;
			/*
			 * Back up to the responder's expected PSN.
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			restart_rc(qp, psn, 0);
			hfi1_schedule_send(qp);
			break;

		case 1:	/* Invalid Request */
			status = IB_WC_REM_INV_REQ_ERR;
			ibp->n_other_naks++;
			goto class_b;

		case 2:	/* Remote Access Error */
			status = IB_WC_REM_ACCESS_ERR;
			ibp->n_other_naks++;
			goto class_b;

		case 3:	/* Remote Operation Error */
			status = IB_WC_REM_OP_ERR;
			ibp->n_other_naks++;
class_b:
			if (qp->s_last == qp->s_acked) {
				hfi1_send_complete(qp, wqe, status);
				hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			}
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_retry = qp->s_retry_cnt;
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail;

	default:		/* 2: reserved */
reserved:
		/* Ignore reserved NAK codes. */
		goto bail;
	}

bail:
	return ret;
}

/*
 * We have seen an out of sequence RDMA read middle or last packet.
 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
 */
static void rdma_seq_err(struct hfi1_qp *qp, struct hfi1_ibport *ibp, u32 psn,
			 struct hfi1_ctxtdata *rcd)
{
	struct hfi1_swqe *wqe;

	/* Remove QP from retry timer */
	if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
		qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
		del_timer(&qp->s_timer);
	}

	wqe = get_swqe_ptr(qp, qp->s_acked);

	while (cmp_psn(psn, wqe->lpsn) > 0) {
		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
			break;
		wqe = do_rc_completion(qp, wqe, ibp);
	}

	qp->r_flags |= HFI1_R_RDMAR_SEQ;
	restart_rc(qp, qp->s_last_psn + 1, 0);
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= HFI1_R_RSP_SEND;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
}

/**
 * rc_rcv_resp - process an incoming RC response packet
 * @ibp: the port this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 * @rcd: the context pointer
 *
 * This is called from hfi1_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static void rc_rcv_resp(struct hfi1_ibport *ibp,
			struct hfi1_other_headers *ohdr,
			void *data, u32 tlen, struct hfi1_qp *qp,
			u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
			struct hfi1_ctxtdata *rcd)
{
	struct hfi1_swqe *wqe;
	enum ib_wc_status status;
	unsigned long flags;
	int diff;
	u32 pad;
	u32 aeth;
	u64 val;

	spin_lock_irqsave(&qp->s_lock, flags);

	trace_hfi1_rc_ack(qp, psn);

	/* Ignore invalid responses. */
	if (cmp_psn(psn, qp->s_next_psn) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = cmp_psn(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			aeth = be32_to_cpu(ohdr->u.aeth);
			if ((aeth >> 29) == 0)
				hfi1_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	/*
	 * Skip everything other than the PSN we expect, if we are waiting
	 * for a reply to a restarted RDMA read or atomic op.
	 */
	if (qp->r_flags & HFI1_R_RDMAR_SEQ) {
		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
			goto ack_done;
		qp->r_flags &= ~HFI1_R_RDMAR_SEQ;
	}

	if (unlikely(qp->s_acked == qp->s_tail))
		goto ack_done;
	wqe = get_swqe_ptr(qp, qp->s_acked);
	status = IB_WC_SUCCESS;

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
			__be32 *p = ohdr->u.at.atomic_ack_eth;

			val = ((u64) be32_to_cpu(p[0]) << 32) |
				be32_to_cpu(p[1]);
		} else
			val = 0;
		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		wqe = get_swqe_ptr(qp, qp->s_acked);
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
read_middle:
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto ack_len_err;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_len_err;

		/*
		 * We got a response so update the timeout.
		 * 4.096 usec. * (1 << qp->timeout)
		 */
		qp->s_flags |= HFI1_S_TIMER;
		mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
		if (qp->s_flags & HFI1_S_WAIT_ACK) {
			qp->s_flags &= ~HFI1_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}

		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
			qp->s_retry = qp->s_retry_cnt;

		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
			goto ack_done;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 0 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto ack_len_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		wqe = get_swqe_ptr(qp, qp->s_acked);
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + pad + 4)))
			goto ack_len_err;
read_last:
		tlen -= hdrsize + pad + 4;
		if (unlikely(tlen != qp->s_rdma_read_len))
			goto ack_len_err;
		aeth = be32_to_cpu(ohdr->u.aeth);
		hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
		WARN_ON(qp->s_rdma_read_sge.num_sge);
		(void) do_rc_ack(qp, aeth, psn,
				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
		goto ack_done;
	}

ack_op_err:
	status = IB_WC_LOC_QP_OP_ERR;
	goto ack_err;

ack_seq_err:
	rdma_seq_err(qp, ibp, psn, rcd);
	goto ack_done;

ack_len_err:
	status = IB_WC_LOC_LEN_ERR;
ack_err:
	if (qp->s_last == qp->s_acked) {
		hfi1_send_complete(qp, wqe, status);
		hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
	}
ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}

/**
 * rc_rcv_error - process an incoming duplicate or error RC packet
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @rcd: the context pointer
 *
 * This is called from hfi1_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
				 struct hfi1_qp *qp, u32 opcode, u32 psn,
				 int diff, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct hfi1_ack_entry *e;
	unsigned long flags;
	u8 i, prev;
	int old_req;

	trace_hfi1_rc_rcv_error(qp, psn);
	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			ibp->n_rc_seqnak++;
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			/*
			 * Wait to send the sequence NAK until all packets
			 * in the receive queue have been processed.
			 * Otherwise, we end up propagating congestion.
			 */
			if (list_empty(&qp->rspwait)) {
				qp->r_flags |= HFI1_R_RSP_NAK;
				atomic_inc(&qp->refcount);
				list_add_tail(&qp->rspwait,
					      &rcd->qp_wait_list);
			}
		}
		goto done;
	}

	/*
	 * Handle a duplicate request. Don't re-execute SEND, RDMA
	 * write or atomic op. Don't NAK errors, just silently drop
	 * the duplicate request. Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK. We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 * old_req is true if there is an older response that is scheduled
	 * to be sent before sending this one.
	 */
	e = NULL;
	old_req = 1;
	ibp->n_rc_dupreq++;

	spin_lock_irqsave(&qp->s_lock, flags);

	for (i = qp->r_head_ack_queue; ; i = prev) {
		if (i == qp->s_tail_ack_queue)
			old_req = 0;
		if (i)
			prev = i - 1;
		else
			prev = HFI1_MAX_RDMA_ATOMIC;
		if (prev == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[prev];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (cmp_psn(psn, e->psn) >= 0) {
			if (prev == qp->s_tail_ack_queue &&
			    cmp_psn(psn, e->lpsn) <= 0)
				old_req = 0;
			break;
		}
	}
	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
			goto unlock_done;
		/* RETH comes after BTH */
		reth = &ohdr->u.rc.reth;
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = delta_psn(psn, e->psn) * qp->pmtu;
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len != e->rdma_sge.sge_length))
			goto unlock_done;
		if (e->rdma_sge.mr) {
			hfi1_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
					  IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
		e->psn = psn;
		if (old_req)
			goto unlock_done;
		qp->s_tail_ack_queue = prev;
		break;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8) opcode || old_req)
			goto unlock_done;
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		/*
		 * Ignore this operation if it doesn't request an ACK
		 * or an earlier RDMA read or atomic is going to be resent.
		 */
		if (!(psn & IB_BTH_REQ_ACK) || old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (i == qp->r_head_ack_queue) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}

		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
		 */
		qp->s_tail_ack_queue = i;
		break;
	}
	qp->s_ack_state = OP(ACKNOWLEDGE);
	qp->s_flags |= HFI1_S_RESP_PENDING;
	qp->r_nak_state = 0;
	hfi1_schedule_send(qp);

unlock_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
done:
	return 1;

send_ack:
	return 0;
}

void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err)
{
	unsigned long flags;
	int lastwqe;

	spin_lock_irqsave(&qp->s_lock, flags);
	lastwqe = hfi1_error_qp(qp, err);
	spin_unlock_irqrestore(&qp->s_lock, flags);

	if (lastwqe) {
		struct ib_event ev;

		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
}

static inline void update_ack_queue(struct hfi1_qp *qp, unsigned n)
{
	unsigned next;

	next = n + 1;
	if (next > HFI1_MAX_RDMA_ATOMIC)
		next = 0;
	qp->s_tail_ack_queue = next;
	qp->s_ack_state = OP(ACKNOWLEDGE);
}

static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
			  u32 lqpn, u32 rqpn, u8 svc_type)
{
	struct opa_hfi1_cong_log_event_internal *cc_event;
	unsigned long flags;

	if (sl >= OPA_MAX_SLS)
		return;

	spin_lock_irqsave(&ppd->cc_log_lock, flags);

	ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
	ppd->threshold_event_counter++;

	cc_event = &ppd->cc_events[ppd->cc_log_idx++];
	if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
		ppd->cc_log_idx = 0;
	cc_event->lqpn = lqpn & HFI1_QPN_MASK;
	cc_event->rqpn = rqpn & HFI1_QPN_MASK;
	cc_event->svc_type = svc_type;
	cc_event->rlid = rlid;
	/* keep timestamp in units of 1.024 usec */
	cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
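	/* 1.024 usec == 2^10 ns, hence the divide by 1024. */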

	spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
}

void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
		  u32 rqpn, u8 svc_type)
{
	struct cca_timer *cca_timer;
	u16 ccti, ccti_incr, ccti_timer, ccti_limit;
	u8 trigger_threshold;
	struct cc_state *cc_state;
	unsigned long flags;

	if (sl >= OPA_MAX_SLS)
		return;

	cca_timer = &ppd->cca_timer[sl];

	cc_state = get_cc_state(ppd);

	if (cc_state == NULL)
		return;

	/*
	 * 1) increase CCTI (for this SL)
	 * 2) select IPG (i.e., call set_link_ipg())
	 * 3) start timer
	 */
	ccti_limit = cc_state->cct.ccti_limit;
	ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
	trigger_threshold =
		cc_state->cong_setting.entries[sl].trigger_threshold;

	spin_lock_irqsave(&ppd->cca_timer_lock, flags);

	if (cca_timer->ccti < ccti_limit) {
		if (cca_timer->ccti + ccti_incr <= ccti_limit)
			cca_timer->ccti += ccti_incr;
		else
			cca_timer->ccti = ccti_limit;
		set_link_ipg(ppd);
	}

	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);

	ccti = cca_timer->ccti;

	if (!hrtimer_active(&cca_timer->hrtimer)) {
		/* ccti_timer is in units of 1.024 usec */
		unsigned long nsec = 1024 * ccti_timer;

		hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
			      HRTIMER_MODE_REL);
	}

	if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
		log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
}

/**
 * hfi1_rc_rcv - process an incoming RC packet
 * @rcd: the context pointer
 * @hdr: the header of this packet
 * @rcv_flags: flags relevant to rcv processing
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 *
 * This is called from qp_rcv() to process an incoming RC packet
 * for the given QP.
 * May be called at interrupt level.
 */
void hfi1_rc_rcv(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	struct hfi1_ib_header *hdr = packet->hdr;
	u32 rcv_flags = packet->rcv_flags;
	void *data = packet->ebuf;
	u32 tlen = packet->tlen;
	struct hfi1_qp *qp = packet->qp;
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	struct hfi1_other_headers *ohdr = packet->ohdr;
	u32 bth0, opcode;
	u32 hdrsize = packet->hlen;
	u32 psn;
	u32 pad;
	struct ib_wc wc;
	u32 pmtu = qp->pmtu;
	int diff;
	struct ib_reth *reth;
	unsigned long flags;
	u32 bth1;
	int ret, is_fecn = 0;

	bth0 = be32_to_cpu(ohdr->bth[0]);
	if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
		return;

	bth1 = be32_to_cpu(ohdr->bth[1]);
	if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
		if (bth1 & HFI1_BECN_SMASK) {
			u16 rlid = qp->remote_ah_attr.dlid;
			u32 lqpn, rqpn;

			lqpn = qp->ibqp.qp_num;
			rqpn = qp->remote_qpn;
			process_becn(
				ppd,
				qp->remote_ah_attr.sl,
				rlid, lqpn, rqpn,
				IB_CC_SVCTYPE_RC);
		}
		is_fecn = bth1 & HFI1_FECN_SMASK;
	}

	psn = be32_to_cpu(ohdr->bth[2]);
	opcode = (bth0 >> 24) & 0xff;

	/*
	 * Process responses (ACKs) before anything else. Note that the
	 * packet sequence number will be for something in the send work
	 * queue rather than the expected receive packet sequence number.
	 * In other words, this QP is the requester.
	 */
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
			    hdrsize, pmtu, rcd);
		if (is_fecn)
			goto send_ack;
		return;
	}

	/* Compute 24 bits worth of difference. */
	diff = delta_psn(psn, qp->r_psn);
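	/*
	 * delta_psn() sign-extends the 24-bit circular difference:
	 * diff < 0 means a duplicate, diff > 0 means a missing packet.
	 */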
	if (unlikely(diff)) {
		if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
			return;
		goto send_ack;
	}

	/* Check for opcode sequence errors. */
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	default:
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			goto nack_inv;
		/*
		 * Note that it is up to the requester to not send a new
		 * RDMA read or atomic operation before receiving an ACK
		 * for the previous operation.
		 */
		break;
	}

	if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST))
		qp_comm_est(qp);

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
		ret = hfi1_get_rwqe(qp, 0);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		qp->r_rcv_len = 0;
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
	case OP(RDMA_WRITE_MIDDLE):
send_middle:
		/* Check for invalid length PMTU or posted rwqe len. */
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto nack_inv;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto nack_inv;
		hfi1_copy_sge(&qp->r_sge, data, pmtu, 1);
		break;

	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		/* consume RWQE */
		ret = hfi1_get_rwqe(qp, 1);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		goto send_last_imm;

	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
		ret = hfi1_get_rwqe(qp, 0);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto no_immediate_data;
		/* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
	case OP(SEND_LAST_WITH_IMMEDIATE):
send_last_imm:
		wc.ex.imm_data = ohdr->u.imm_data;
		wc.wc_flags = IB_WC_WITH_IMM;
		goto send_last;

	case OP(RDMA_WRITE_LAST):
	case OP(SEND_LAST):
no_immediate_data:
		wc.wc_flags = 0;
		wc.ex.imm_data = 0;
send_last:
		/* Get the number of bytes the message was padded by. */
		pad = (bth0 >> 20) & 3;
		/* Check for invalid length. */
		/* LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto nack_inv;
		/* Don't count the CRC. */
		tlen -= (hdrsize + pad + 4);
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto nack_inv;
		hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
		hfi1_put_ss(&qp->r_sge);
		qp->r_msn++;
		if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
			break;
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
		else
			wc.opcode = IB_WC_RECV;
		wc.qp = &qp->ibqp;
		wc.src_qp = qp->remote_qpn;
		wc.slid = qp->remote_ah_attr.dlid;
		/*
		 * It seems that IB mandates the presence of an SL in a
		 * work completion only for the UD transport (see section
		 * 11.4.2 of IBTA Vol. 1).
		 *
		 * However, the way the SL is chosen below is consistent
		 * with the way that IB/qib works and is trying to avoid
		 * introducing incompatibilities.
		 *
		 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
		 */
		wc.sl = qp->remote_ah_attr.sl;
		/* zero fields that are N/A */
		wc.vendor_err = 0;
		wc.pkey_index = 0;
		wc.dlid_path_bits = 0;
		wc.port_num = 0;
		/* Signal completion event if the solicited bit is set. */
		hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
			      (bth0 & IB_BTH_SOLICITED) != 0);
		break;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
			goto nack_inv;
		/* consume RWQE */
		reth = &ohdr->u.rc.reth;
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		qp->r_sge.sg_list = NULL;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
					  rkey, IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok))
				goto nack_acc;
			qp->r_sge.num_sge = 1;
		} else {
			qp->r_sge.num_sge = 0;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (opcode == OP(RDMA_WRITE_FIRST))
			goto send_middle;
		else if (opcode == OP(RDMA_WRITE_ONLY))
			goto no_immediate_data;
		ret = hfi1_get_rwqe(qp, 1);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		wc.ex.imm_data = ohdr->u.rc.imm_data;
		wc.wc_flags = IB_WC_WITH_IMM;
		goto send_last;

	case OP(RDMA_READ_REQUEST): {
		struct hfi1_ack_entry *e;
		u32 len;
		u8 next;

		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
			goto nack_inv;
		next = qp->r_head_ack_queue + 1;
		/* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
		if (next > HFI1_MAX_RDMA_ATOMIC)
			next = 0;
		spin_lock_irqsave(&qp->s_lock, flags);
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv_unlck;
			update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
			hfi1_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		reth = &ohdr->u.rc.reth;
		len = be32_to_cpu(reth->length);
		if (len) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr,
					  rkey, IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto nack_acc_unlck;
			/*
			 * Update the next expected PSN. We add 1 later
			 * below, so only add the remainder here.
			 */
			if (len > pmtu)
				qp->r_psn += (len - 1) / pmtu;
		} else {
			e->rdma_sge.mr = NULL;
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
		e->opcode = opcode;
		e->sent = 0;
		e->psn = psn;
		e->lpsn = qp->r_psn;
		/*
		 * We need to increment the MSN here instead of when we
		 * finish sending the result since a duplicate request would
		 * increment it more than once.
		 */
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Schedule the send tasklet. */
		qp->s_flags |= HFI1_S_RESP_PENDING;
		hfi1_schedule_send(qp);

		spin_unlock_irqrestore(&qp->s_lock, flags);
		if (is_fecn)
			goto send_ack;
		return;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		struct ib_atomic_eth *ateth;
		struct hfi1_ack_entry *e;
		u64 vaddr;
		atomic64_t *maddr;
		u64 sdata;
		u32 rkey;
		u8 next;

		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_inv;
		next = qp->r_head_ack_queue + 1;
		if (next > HFI1_MAX_RDMA_ATOMIC)
			next = 0;
		spin_lock_irqsave(&qp->s_lock, flags);
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv_unlck;
			update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
			hfi1_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		ateth = &ohdr->u.atomic_eth;
		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
			be32_to_cpu(ateth->vaddr[1]);
		if (unlikely(vaddr & (sizeof(u64) - 1)))
			goto nack_inv_unlck;
		rkey = be32_to_cpu(ateth->rkey);
		/* Check rkey & NAK */
		if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
					   vaddr, rkey,
					   IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc_unlck;
		/* Perform atomic OP and save result. */
		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
		sdata = be64_to_cpu(ateth->swap_data);
		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
			(u64) atomic64_add_return(sdata, maddr) - sdata :
			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
				      be64_to_cpu(ateth->compare_data),
				      sdata);
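		/*
		 * Either way, atomic_data holds the value the memory held
		 * *before* the operation: atomic64_add_return() minus the
		 * addend for FETCH_ADD, and cmpxchg()'s return value for
		 * COMPARE_SWAP. That prior value is returned in the ACK.
		 */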
		hfi1_put_mr(qp->r_sge.sge.mr);
		qp->r_sge.num_sge = 0;
		e->opcode = (u8) opcode;
		e->sent = 0;
		e->psn = psn;
		e->lpsn = psn;
		qp->r_msn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Schedule the send tasklet. */
		qp->s_flags |= HFI1_S_RESP_PENDING;
		hfi1_schedule_send(qp);

		spin_unlock_irqrestore(&qp->s_lock, flags);
		if (is_fecn)
			goto send_ack;
		return;
	}

	default:
		/* NAK unknown opcodes. */
		goto nack_inv;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	qp->r_ack_psn = psn;
	qp->r_nak_state = 0;
	/* Send an ACK if requested or required. */
	if (psn & (1 << 31))
		goto send_ack;
	return;

rnr_nak:
	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
	qp->r_ack_psn = qp->r_psn;
	/* Queue RNR NAK for later */
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= HFI1_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
	return;

nack_op_err:
	hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= HFI1_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
	return;

nack_inv_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_inv:
	hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= HFI1_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
	return;

nack_acc_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_acc:
	hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
	qp->r_ack_psn = qp->r_psn;
send_ack:
	hfi1_send_rc_ack(rcd, qp, is_fecn);
}

void hfi1_rc_hdrerr(
	struct hfi1_ctxtdata *rcd,
	struct hfi1_ib_header *hdr,
	u32 rcv_flags,
	struct hfi1_qp *qp)
{
	int has_grh = rcv_flags & HFI1_HAS_GRH;
	struct hfi1_other_headers *ohdr;
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	int diff;
	u32 opcode;
	u32 psn, bth0;

	/* Check for GRH */
	if (!has_grh)
		ohdr = &hdr->u.oth;
	else
		ohdr = &hdr->u.l.oth;

	bth0 = be32_to_cpu(ohdr->bth[0]);
	if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
		return;

	psn = be32_to_cpu(ohdr->bth[2]);
	opcode = (bth0 >> 24) & 0xff;

	/* Only deal with RDMA Writes for now */
	if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
		diff = delta_psn(psn, qp->r_psn);
		if (!qp->r_nak_state && diff >= 0) {
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			/*
			 * Wait to send the sequence
			 * NAK until all packets
			 * in the receive queue have
			 * been processed.
			 * Otherwise, we end up
			 * propagating congestion.
			 */
			if (list_empty(&qp->rspwait)) {
				qp->r_flags |= HFI1_R_RSP_NAK;
				atomic_inc(&qp->refcount);
				list_add_tail(
					&qp->rspwait,
					&rcd->qp_wait_list);
			}
		} /* Out of sequence NAK */
	} /* QP Request NAKs */
}