/*
 * Copyright(c) 2016, 2017 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 * - Neither the name of Intel Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_hdrs.h>
#include "qp.h"
#include "vt.h"
#include "trace.h"
static void rvt_rc_timeout(unsigned long arg);
/*
 * Convert the AETH RNR timeout code into the number of microseconds.
 */
static const u32 ib_rvt_rnr_table[32] = {
	655360, /* 00: 655.36 */
	10,     /* 01:    .01 */
	20,     /* 02:    .02 */
	30,     /* 03:    .03 */
	40,     /* 04:    .04 */
	60,     /* 05:    .06 */
	80,     /* 06:    .08 */
	120,    /* 07:    .12 */
	160,    /* 08:    .16 */
	240,    /* 09:    .24 */
	320,    /* 0A:    .32 */
	480,    /* 0B:    .48 */
	640,    /* 0C:    .64 */
	960,    /* 0D:    .96 */
	1280,   /* 0E:   1.28 */
	1920,   /* 0F:   1.92 */
	2560,   /* 10:   2.56 */
	3840,   /* 11:   3.84 */
	5120,   /* 12:   5.12 */
	7680,   /* 13:   7.68 */
	10240,  /* 14:  10.24 */
	15360,  /* 15:  15.36 */
	20480,  /* 16:  20.48 */
	30720,  /* 17:  30.72 */
	40960,  /* 18:  40.96 */
	61440,  /* 19:  61.44 */
	81920,  /* 1A:  81.92 */
	122880, /* 1B: 122.88 */
	163840, /* 1C: 163.84 */
	245760, /* 1D: 245.76 */
	327680, /* 1E: 327.68 */
	491520  /* 1F: 491.52 */
};
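
/*
 * Minimal illustrative helper (hypothetical, not part of the rdmavt
 * API): the RNR NAK timeout code is a 5-bit value, so masking with 0x1f
 * and indexing the table yields the wait in microseconds.  The exported
 * conversion drivers actually use is rvt_rnr_tbl_to_usec() further down
 * in this file.
 */
static inline u32 rvt_rnr_code_to_usec_example(u32 code)
{
	/* e.g. code 0x14 maps to 10240 usec (10.24 ms) */
	return ib_rvt_rnr_table[code & 0x1f];
}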
/*
 * Note that it is OK to post send work requests in the SQE and ERR
 * states; rvt_do_send() will process them and generate error
 * completions as per IB 1.2 C10-96.
 */
const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
	[IB_QPS_RESET] = 0,
	[IB_QPS_INIT] = RVT_POST_RECV_OK,
	[IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK,
	[IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK |
	    RVT_PROCESS_NEXT_SEND_OK,
	[IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK,
	[IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
	[IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV |
	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
};
EXPORT_SYMBOL(ib_rvt_state_ops);
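
/*
 * Illustrative note: consumers AND the table entry for the current
 * state with the operation they want to perform, e.g. rvt_post_recv()
 * below effectively does
 *
 *	if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK))
 *		return -EINVAL;
 *
 * so a QP in IB_QPS_RESET (all flags clear) rejects all work.
 */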
static void get_map_page(struct rvt_qpn_table *qpt,
			 struct rvt_qpn_map *map,
			 gfp_t gfp)
{
	unsigned long page = get_zeroed_page(gfp);

	/*
	 * Free the page if someone raced with us installing it.
	 */

	spin_lock(&qpt->lock);
	if (map->page)
		free_page(page);
	else
		map->page = (void *)page;
	spin_unlock(&qpt->lock);
}
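
/*
 * Design note (descriptive, added for clarity): the page is allocated
 * outside qpt->lock, so two contexts can race to install one; the loser
 * frees its copy and reuses the winner's, keeping map->page stable once
 * it is non-NULL.
 */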
/**
 * init_qpn_table - initialize the QP number table for a device
 * @qpt: the QPN table
 */
static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt)
{
	u32 offset, i;
	struct rvt_qpn_map *map;
	int ret = 0;

	if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start))
		return -EINVAL;

	spin_lock_init(&qpt->lock);

	qpt->last = rdi->dparms.qpn_start;
	qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift;

	/*
	 * Drivers may want some QPs beyond what we need for verbs; let them
	 * use our qpn table.  No need for two.  Let's go ahead and mark the
	 * bitmaps for those.  The reserved range must be *after* the range
	 * which verbs will pick from.
	 */

	/* Figure out number of bit maps needed before reserved range */
	qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE;

	/* This should always be zero */
	offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK;

	/* Starting with the first reserved bit map */
	map = &qpt->map[qpt->nmaps];

	rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n",
		    rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end);
	for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) {
		if (!map->page) {
			get_map_page(qpt, map, GFP_KERNEL);
			if (!map->page) {
				ret = -ENOMEM;
				break;
			}
		}
		set_bit(offset, map->page);
		offset++;
		if (offset == RVT_BITS_PER_PAGE) {
			/* next page */
			qpt->nmaps++;
			map++;
			offset = 0;
		}
	}
	return ret;
}
/**
 * free_qpn_table - free the QP number table for a device
 * @qpt: the QPN table
 */
static void free_qpn_table(struct rvt_qpn_table *qpt)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
		free_page((unsigned long)qpt->map[i].page);
}
/**
 * rvt_driver_qp_init - Init driver qp resources
 * @rdi: rvt dev structure
 *
 * Return: 0 on success
 */
int rvt_driver_qp_init(struct rvt_dev_info *rdi)
{
	int i;
	int ret = -ENOMEM;

	if (!rdi->dparms.qp_table_size)
		return -EINVAL;

	/*
	 * If driver is not doing any QP allocation then make sure it is
	 * providing the necessary QP functions.
	 */
	if (!rdi->driver_f.free_all_qps ||
	    !rdi->driver_f.qp_priv_alloc ||
	    !rdi->driver_f.qp_priv_free ||
	    !rdi->driver_f.notify_qp_reset ||
	    !rdi->driver_f.notify_restart_rc)
		return -EINVAL;

	/* allocate parent object */
	rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL,
				   rdi->dparms.node);
	if (!rdi->qp_dev)
		return -ENOMEM;

	/* allocate hash table */
	rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
	rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
	rdi->qp_dev->qp_table =
		kmalloc_node(rdi->qp_dev->qp_table_size *
			     sizeof(*rdi->qp_dev->qp_table),
			     GFP_KERNEL, rdi->dparms.node);
	if (!rdi->qp_dev->qp_table)
		goto no_qp_table;

	for (i = 0; i < rdi->qp_dev->qp_table_size; i++)
		RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL);

	spin_lock_init(&rdi->qp_dev->qpt_lock);

	/* initialize qpn map */
	if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table))
		goto fail_table;

	spin_lock_init(&rdi->n_qps_lock);

	return 0;

fail_table:
	kfree(rdi->qp_dev->qp_table);
	free_qpn_table(&rdi->qp_dev->qpn_table);

no_qp_table:
	kfree(rdi->qp_dev);

	return ret;
}
/**
 * rvt_free_all_qps - check for QPs still in use
 * @rdi: rvt device info structure
 *
 * There should not be any QPs still in use.
 * Free memory for table.
 */
static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
{
	unsigned long flags;
	struct rvt_qp *qp;
	unsigned n, qp_inuse = 0;
	spinlock_t *ql; /* work around too long line below */

	if (rdi->driver_f.free_all_qps)
		qp_inuse = rdi->driver_f.free_all_qps(rdi);

	qp_inuse += rvt_mcast_tree_empty(rdi);

	if (!rdi->qp_dev)
		return qp_inuse;

	ql = &rdi->qp_dev->qpt_lock;
	spin_lock_irqsave(ql, flags);
	for (n = 0; n < rdi->qp_dev->qp_table_size; n++) {
		qp = rcu_dereference_protected(rdi->qp_dev->qp_table[n],
					       lockdep_is_held(ql));
		RCU_INIT_POINTER(rdi->qp_dev->qp_table[n], NULL);

		for (; qp; qp = rcu_dereference_protected(qp->next,
							  lockdep_is_held(ql)))
			qp_inuse++;
	}
	spin_unlock_irqrestore(ql, flags);
	synchronize_rcu();
	return qp_inuse;
}
/**
 * rvt_qp_exit - clean up qps on device exit
 * @rdi: rvt dev structure
 *
 * Check for qp leaks and free resources.
 */
void rvt_qp_exit(struct rvt_dev_info *rdi)
{
	u32 qps_inuse = rvt_free_all_qps(rdi);

	if (qps_inuse)
		rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
			   qps_inuse);
	if (!rdi->qp_dev)
		return;

	kfree(rdi->qp_dev->qp_table);
	free_qpn_table(&rdi->qp_dev->qpn_table);
	kfree(rdi->qp_dev);
}
static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
			      struct rvt_qpn_map *map, unsigned off)
{
	return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
}
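
/*
 * Worked example (illustrative): each rvt_qpn_map page tracks
 * RVT_BITS_PER_PAGE QPN bits, so bit 5 of the third page (map index 2)
 * corresponds to qpn = 2 * RVT_BITS_PER_PAGE + 5.  alloc_qpn() below
 * inverts this with qpn / RVT_BITS_PER_PAGE for the page and
 * qpn & RVT_BITS_PER_PAGE_MASK for the bit offset.
 */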
/**
 * alloc_qpn - Allocate the next available qpn or zero/one for QP type
 *	       IB_QPT_SMI/IB_QPT_GSI
 * @rdi: rvt device info structure
 * @qpt: queue pair number table pointer
 * @port_num: IB port number, 1 based, comes from core
 *
 * Return: The queue pair number
 */
static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
		     enum ib_qp_type type, u8 port_num, gfp_t gfp)
{
	u32 i, offset, max_scan, qpn;
	struct rvt_qpn_map *map;
	u32 ret;

	if (rdi->driver_f.alloc_qpn)
		return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num, gfp);

	if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
		unsigned n;

		ret = type == IB_QPT_GSI;
		n = 1 << (ret + 2 * (port_num - 1));
		spin_lock(&qpt->lock);
		if (qpt->flags & n)
			ret = -EINVAL;
		else
			qpt->flags |= n;
		spin_unlock(&qpt->lock);
		goto bail;
	}

	qpn = qpt->last + qpt->incr;
	if (qpn >= RVT_QPN_MAX)
		qpn = qpt->incr | ((qpt->last & 1) ^ 1);
	/* offset carries bit 0 */
	offset = qpn & RVT_BITS_PER_PAGE_MASK;
	map = &qpt->map[qpn / RVT_BITS_PER_PAGE];
	max_scan = qpt->nmaps - !offset;
	for (i = 0;;) {
		if (unlikely(!map->page)) {
			get_map_page(qpt, map, gfp);
			if (unlikely(!map->page))
				break;
		}
		do {
			if (!test_and_set_bit(offset, map->page)) {
				qpt->last = qpn;
				ret = qpn;
				goto bail;
			}
			offset += qpt->incr;
			/*
			 * This qpn might be bogus if offset >= BITS_PER_PAGE.
			 * That is OK.  It gets re-assigned below
			 */
			qpn = mk_qpn(qpt, map, offset);
		} while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX);
		/*
		 * In order to keep the number of pages allocated to a
		 * minimum, we scan all existing pages before increasing
		 * the size of the bitmap table.
		 */
		if (++i > max_scan) {
			if (qpt->nmaps == RVT_QPNMAP_ENTRIES)
				break;
			map = &qpt->map[qpt->nmaps++];
			/* start at incr with current bit 0 */
			offset = qpt->incr | (offset & 1);
		} else if (map < &qpt->map[qpt->nmaps]) {
			++map;
			/* start at incr with current bit 0 */
			offset = qpt->incr | (offset & 1);
		} else {
			map = &qpt->map[0];
			/* wrap to first map page, invert bit 0 */
			offset = qpt->incr | ((offset & 1) ^ 1);
		}
		/* there can be no set bits in low-order QoS bits */
		WARN_ON(offset & (BIT(rdi->dparms.qos_shift) - 1));
		qpn = mk_qpn(qpt, map, offset);
	}

	ret = -ENOMEM;

bail:
	return ret;
}
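
/*
 * Illustrative QoS example (assumed parameter values): with
 * dparms.qpn_inc = 1 and dparms.qos_shift = 2, qpt->incr is 1 << 2 = 4,
 * so the allocator walks the bitmap in steps of 4 and leaves the
 * low-order QoS bits of each candidate QPN for the driver to fill in.
 */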
static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
{
	struct rvt_qpn_map *map;

	map = qpt->map + qpn / RVT_BITS_PER_PAGE;
	if (map->page)
		clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
}
/**
 * rvt_clear_mr_refs - Drop held mr refs
 * @qp: rvt qp data structure
 * @clr_sends: If should clear send side or not
 */
static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
{
	unsigned n;
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);

	if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
		rvt_put_ss(&qp->s_rdma_read_sge);

	rvt_put_ss(&qp->r_sge);

	if (clr_sends) {
		while (qp->s_last != qp->s_head) {
			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
			unsigned i;

			for (i = 0; i < wqe->wr.num_sge; i++) {
				struct rvt_sge *sge = &wqe->sg_list[i];

				rvt_put_mr(sge->mr);
			}
			if (qp->ibqp.qp_type == IB_QPT_UD ||
			    qp->ibqp.qp_type == IB_QPT_SMI ||
			    qp->ibqp.qp_type == IB_QPT_GSI)
				atomic_dec(&ibah_to_rvtah(
						wqe->ud_wr.ah)->refcount);
			if (++qp->s_last >= qp->s_size)
				qp->s_last = 0;
			smp_wmb(); /* see qp_set_savail */
		}
		if (qp->s_rdma_mr) {
			rvt_put_mr(qp->s_rdma_mr);
			qp->s_rdma_mr = NULL;
		}
	}

	if (qp->ibqp.qp_type != IB_QPT_RC)
		return;

	for (n = 0; n < rvt_max_atomic(rdi); n++) {
		struct rvt_ack_entry *e = &qp->s_ack_queue[n];

		if (e->rdma_sge.mr) {
			rvt_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
	}
}
/**
 * rvt_remove_qp - remove qp from table
 * @rdi: rvt dev struct
 * @qp: qp to remove
 *
 * Remove the QP from the table so it can't be found asynchronously by
 * the receive routine.
 */
static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
	u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
	unsigned long flags;
	int removed = 1;

	spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);

	if (rcu_dereference_protected(rvp->qp[0],
			lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
		RCU_INIT_POINTER(rvp->qp[0], NULL);
	} else if (rcu_dereference_protected(rvp->qp[1],
			lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
		RCU_INIT_POINTER(rvp->qp[1], NULL);
	} else {
		struct rvt_qp *q;
		struct rvt_qp __rcu **qpp;

		removed = 0;
		qpp = &rdi->qp_dev->qp_table[n];
		for (; (q = rcu_dereference_protected(*qpp,
			lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL;
			qpp = &q->next) {
			if (q == qp) {
				RCU_INIT_POINTER(*qpp,
				     rcu_dereference_protected(qp->next,
				     lockdep_is_held(&rdi->qp_dev->qpt_lock)));
				removed = 1;
				trace_rvt_qpremove(qp, n);
				break;
			}
		}
	}

	spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
	if (removed) {
		synchronize_rcu();
		rvt_put_qp(qp);
	}
}
/**
 * rvt_init_qp - initialize the QP state to the reset state
 * @qp: the QP to init or reinit
 * @type: the QP type
 *
 * This function is called from both rvt_create_qp() and
 * rvt_reset_qp().  The difference is that the reset path holds
 * the necessary locks to protect against concurrent access.
 */
static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
			enum ib_qp_type type)
{
	qp->remote_qpn = 0;
	qp->qkey = 0;
	qp->qp_access_flags = 0;
	qp->s_flags &= RVT_S_SIGNAL_REQ_WR;
	qp->s_draining = 0;
	qp->s_next_psn = 0;
	qp->s_last_psn = 0;
	qp->s_sending_psn = 0;
	qp->s_sending_hpsn = 0;
	qp->s_psn = 0;
	qp->r_psn = 0;
	qp->r_msn = 0;
	if (type == IB_QPT_RC) {
		qp->s_state = IB_OPCODE_RC_SEND_LAST;
		qp->r_state = IB_OPCODE_RC_SEND_LAST;
	} else {
		qp->s_state = IB_OPCODE_UC_SEND_LAST;
		qp->r_state = IB_OPCODE_UC_SEND_LAST;
	}
	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
	qp->r_nak_state = 0;
	qp->r_aflags = 0;
	qp->r_flags = 0;
	qp->s_head = 0;
	qp->s_tail = 0;
	qp->s_cur = 0;
	qp->s_acked = 0;
	qp->s_last = 0;
	qp->s_ssn = 1;
	qp->s_lsn = 0;
	qp->s_mig_state = IB_MIG_MIGRATED;
	qp->r_head_ack_queue = 0;
	qp->s_tail_ack_queue = 0;
	qp->s_num_rd_atomic = 0;
	if (qp->r_rq.wq) {
		qp->r_rq.wq->head = 0;
		qp->r_rq.wq->tail = 0;
	}
	qp->r_sge.num_sge = 0;
	atomic_set(&qp->s_reserved_used, 0);
}
/**
 * rvt_reset_qp - initialize the QP state to the reset state
 * @qp: the QP to reset
 * @type: the QP type
 *
 * r_lock, s_hlock, and s_lock are required to be held by the caller
 */
static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
			 enum ib_qp_type type)
	__must_hold(&qp->s_lock)
	__must_hold(&qp->s_hlock)
	__must_hold(&qp->r_lock)
{
	lockdep_assert_held(&qp->r_lock);
	lockdep_assert_held(&qp->s_hlock);
	lockdep_assert_held(&qp->s_lock);
	if (qp->state != IB_QPS_RESET) {
		qp->state = IB_QPS_RESET;

		/* Let drivers flush their waitlist */
		rdi->driver_f.flush_qp_waiters(qp);
		rvt_stop_rc_timers(qp);
		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT);
		spin_unlock(&qp->s_lock);
		spin_unlock(&qp->s_hlock);
		spin_unlock_irq(&qp->r_lock);

		/* Stop the send queue and the retry timer */
		rdi->driver_f.stop_send_queue(qp);
		rvt_del_timers_sync(qp);
		/* Wait for things to stop */
		rdi->driver_f.quiesce_qp(qp);

		/* take qp out the hash and wait for it to be unused */
		rvt_remove_qp(rdi, qp);
		wait_event(qp->wait, !atomic_read(&qp->refcount));

		/* grab the lock b/c it was locked at call time */
		spin_lock_irq(&qp->r_lock);
		spin_lock(&qp->s_hlock);
		spin_lock(&qp->s_lock);

		rvt_clear_mr_refs(qp, 1);

		/*
		 * Let the driver do any tear down or re-init it needs to for
		 * a qp that has been reset
		 */
		rdi->driver_f.notify_qp_reset(qp);
	}
	rvt_init_qp(rdi, qp, type);
	lockdep_assert_held(&qp->r_lock);
	lockdep_assert_held(&qp->s_hlock);
	lockdep_assert_held(&qp->s_lock);
}
/**
 * rvt_create_qp - create a queue pair for a device
 * @ibpd: the protection domain whose device we create the queue pair for
 * @init_attr: the attributes of the queue pair
 * @udata: user data for libibverbs.so
 *
 * Queue pair creation is mostly an rvt issue. However, drivers have their own
 * unique idea of what queue pair numbers mean. For instance there is a reserved
 * range for PSM.
 *
 * Return: the queue pair on success, otherwise returns an errno.
 *
 * Called by the ib_create_qp() core verbs function.
 */
struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
			    struct ib_qp_init_attr *init_attr,
			    struct ib_udata *udata)
{
	struct rvt_qp *qp;
	int err;
	struct rvt_swqe *swq = NULL;
	size_t sz;
	size_t sg_list_sz;
	struct ib_qp *ret = ERR_PTR(-ENOMEM);
	struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
	void *priv = NULL;
	gfp_t gfp;
	size_t sqsize;

	if (!rdi)
		return ERR_PTR(-EINVAL);

	if (init_attr->cap.max_send_sge > rdi->dparms.props.max_sge ||
	    init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr ||
	    init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO))
		return ERR_PTR(-EINVAL);

	/* GFP_NOIO is applicable to RC QP's only */

	if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO &&
	    init_attr->qp_type != IB_QPT_RC)
		return ERR_PTR(-EINVAL);

	gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ?
						GFP_NOIO : GFP_KERNEL;

	/* Check receive queue parameters if no SRQ is specified. */
	if (!init_attr->srq) {
		if (init_attr->cap.max_recv_sge > rdi->dparms.props.max_sge ||
		    init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr)
			return ERR_PTR(-EINVAL);

		if (init_attr->cap.max_send_sge +
		    init_attr->cap.max_send_wr +
		    init_attr->cap.max_recv_sge +
		    init_attr->cap.max_recv_wr == 0)
			return ERR_PTR(-EINVAL);
	}
	sqsize =
		init_attr->cap.max_send_wr + 1 +
		rdi->dparms.reserved_operations;
	switch (init_attr->qp_type) {
	case IB_QPT_SMI:
	case IB_QPT_GSI:
		if (init_attr->port_num == 0 ||
		    init_attr->port_num > ibpd->device->phys_port_cnt)
			return ERR_PTR(-EINVAL);
	case IB_QPT_UC:
	case IB_QPT_RC:
	case IB_QPT_UD:
		sz = sizeof(struct rvt_sge) *
			init_attr->cap.max_send_sge +
			sizeof(struct rvt_swqe);
		if (gfp == GFP_NOIO)
			swq = __vmalloc(
				sqsize * sz,
				gfp | __GFP_ZERO, PAGE_KERNEL);
		else
			swq = vzalloc_node(
				sqsize * sz,
				rdi->dparms.node);
		if (!swq)
			return ERR_PTR(-ENOMEM);

		sz = sizeof(*qp);
		sg_list_sz = 0;
		if (init_attr->srq) {
			struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);

			if (srq->rq.max_sge > 1)
				sg_list_sz = sizeof(*qp->r_sg_list) *
					(srq->rq.max_sge - 1);
		} else if (init_attr->cap.max_recv_sge > 1)
			sg_list_sz = sizeof(*qp->r_sg_list) *
				(init_attr->cap.max_recv_sge - 1);
		qp = kzalloc_node(sz + sg_list_sz, gfp, rdi->dparms.node);
		if (!qp)
			goto bail_swq;

		RCU_INIT_POINTER(qp->next, NULL);
		if (init_attr->qp_type == IB_QPT_RC) {
			qp->s_ack_queue =
				kzalloc_node(
					sizeof(*qp->s_ack_queue) *
					 rvt_max_atomic(rdi),
					gfp,
					rdi->dparms.node);
			if (!qp->s_ack_queue)
				goto bail_qp;
		}
		/* initialize timers needed for rc qp */
		setup_timer(&qp->s_timer, rvt_rc_timeout, (unsigned long)qp);
		hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC,
			     HRTIMER_MODE_REL);
		qp->s_rnr_timer.function = rvt_rc_rnr_retry;

		/*
		 * Driver needs to set up its private QP structure and do any
		 * initialization that is needed.
		 */
		priv = rdi->driver_f.qp_priv_alloc(rdi, qp, gfp);
		if (IS_ERR(priv)) {
			ret = priv;
			goto bail_qp;
		}
		qp->priv = priv;
		qp->timeout_jiffies =
			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
				1000UL);
		if (init_attr->srq) {
			sz = 0;
		} else {
			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
				sizeof(struct rvt_rwqe);
			if (udata)
				qp->r_rq.wq = vmalloc_user(
						sizeof(struct rvt_rwq) +
						qp->r_rq.size * sz);
			else if (gfp == GFP_NOIO)
				qp->r_rq.wq = __vmalloc(
						sizeof(struct rvt_rwq) +
						qp->r_rq.size * sz,
						gfp | __GFP_ZERO, PAGE_KERNEL);
			else
				qp->r_rq.wq = vzalloc_node(
						sizeof(struct rvt_rwq) +
						qp->r_rq.size * sz,
						rdi->dparms.node);
			if (!qp->r_rq.wq)
				goto bail_driver_priv;
		}

		/*
		 * ib_create_qp() will initialize qp->ibqp
		 * except for qp->ibqp.qp_num.
		 */
		spin_lock_init(&qp->r_lock);
		spin_lock_init(&qp->s_hlock);
		spin_lock_init(&qp->s_lock);
		spin_lock_init(&qp->r_rq.lock);
		atomic_set(&qp->refcount, 0);
		atomic_set(&qp->local_ops_pending, 0);
		init_waitqueue_head(&qp->wait);
		init_timer(&qp->s_timer);
		qp->s_timer.data = (unsigned long)qp;
		INIT_LIST_HEAD(&qp->rspwait);
		qp->state = IB_QPS_RESET;
		qp->s_wq = swq;
		qp->s_size = sqsize;
		qp->s_avail = init_attr->cap.max_send_wr;
		qp->s_max_sge = init_attr->cap.max_send_sge;
		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
			qp->s_flags = RVT_S_SIGNAL_REQ_WR;

		err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
				init_attr->qp_type,
				init_attr->port_num, gfp);
		if (err < 0) {
			ret = ERR_PTR(err);
			goto bail_rq_wq;
		}
		qp->ibqp.qp_num = err;
		qp->port_num = init_attr->port_num;
		rvt_init_qp(rdi, qp, init_attr->qp_type);
		break;

	default:
		/* Don't support raw QPs */
		return ERR_PTR(-EINVAL);
	}

	init_attr->cap.max_inline_data = 0;

	/*
	 * Return the address of the RWQ as the offset to mmap.
	 * See rvt_mmap() for details.
	 */
	if (udata && udata->outlen >= sizeof(__u64)) {
		if (!qp->r_rq.wq) {
			__u64 offset = 0;

			err = ib_copy_to_udata(udata, &offset,
					       sizeof(offset));
			if (err) {
				ret = ERR_PTR(err);
				goto bail_qpn;
			}
		} else {
			u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;

			qp->ip = rvt_create_mmap_info(rdi, s,
						      ibpd->uobject->context,
						      qp->r_rq.wq);
			if (!qp->ip) {
				ret = ERR_PTR(-ENOMEM);
				goto bail_qpn;
			}

			err = ib_copy_to_udata(udata, &qp->ip->offset,
					       sizeof(qp->ip->offset));
			if (err) {
				ret = ERR_PTR(err);
				goto bail_ip;
			}
		}
		qp->pid = current->pid;
	}

	spin_lock(&rdi->n_qps_lock);
	if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
		spin_unlock(&rdi->n_qps_lock);
		ret = ERR_PTR(-ENOMEM);
		goto bail_ip;
	}

	rdi->n_qps_allocated++;
	/*
	 * Maintain a busy_jiffies variable that will be added to the timeout
	 * period in mod_retry_timer and add_retry_timer. This busy jiffies
	 * is scaled by the number of rc qps created for the device to reduce
	 * the number of timeouts occurring when there is a large number of
	 * qps. busy_jiffies is incremented every rc qp scaling interval.
	 * The scaling interval is selected based on extensive performance
	 * evaluation of targeted workloads.
	 */
	if (init_attr->qp_type == IB_QPT_RC) {
		rdi->n_rc_qps++;
		rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
	}
	spin_unlock(&rdi->n_qps_lock);

	if (qp->ip) {
		spin_lock_irq(&rdi->pending_lock);
		list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps);
		spin_unlock_irq(&rdi->pending_lock);
	}

	ret = &qp->ibqp;

	/*
	 * We have our QP and it's good, now keep track of what types of opcodes
	 * can be processed on this QP. We do this by keeping track of what the
	 * 3 high order bits of the opcode are.
	 */
	switch (init_attr->qp_type) {
	case IB_QPT_SMI:
	case IB_QPT_GSI:
	case IB_QPT_UD:
		qp->allowed_ops = IB_OPCODE_UD;
		break;
	case IB_QPT_RC:
		qp->allowed_ops = IB_OPCODE_RC;
		break;
	case IB_QPT_UC:
		qp->allowed_ops = IB_OPCODE_UC;
		break;
	default:
		ret = ERR_PTR(-EINVAL);
		goto bail_ip;
	}

	return ret;

bail_ip:
	if (qp->ip)
		kref_put(&qp->ip->ref, rvt_release_mmap_info);

bail_qpn:
	free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);

bail_rq_wq:
	if (!qp->ip)
		vfree(qp->r_rq.wq);

bail_driver_priv:
	rdi->driver_f.qp_priv_free(rdi, qp);

bail_qp:
	kfree(qp->s_ack_queue);
	kfree(qp);

bail_swq:
	vfree(swq);

	return ret;
}
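
/*
 * Illustrative note on the RC timeout scaling above: busy_jiffies is
 * n_rc_qps / RC_QP_SCALING_INTERVAL, so a device with, say,
 * 3 * RC_QP_SCALING_INTERVAL active RC QPs adds 3 jiffies to every
 * retry timeout armed in rvt_add_retry_timer() later in this file.
 */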
/**
 * rvt_error_qp - put a QP into the error state
 * @qp: the QP to put into the error state
 * @err: the receive completion error to signal if a RWQE is active
 *
 * Flushes both send and receive work queues.
 *
 * Return: true if last WQE event should be generated.
 * The QP r_lock and s_lock should be held and interrupts disabled.
 * If we are already in error state, just return.
 */
int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
{
	struct ib_wc wc;
	int ret = 0;
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);

	lockdep_assert_held(&qp->r_lock);
	lockdep_assert_held(&qp->s_lock);
	if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
		goto bail;

	qp->state = IB_QPS_ERR;

	if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
		del_timer(&qp->s_timer);
	}

	if (qp->s_flags & RVT_S_ANY_WAIT_SEND)
		qp->s_flags &= ~RVT_S_ANY_WAIT_SEND;

	rdi->driver_f.notify_error_qp(qp);

	/* Schedule the sending tasklet to drain the send work queue. */
	if (ACCESS_ONCE(qp->s_last) != qp->s_head)
		rdi->driver_f.schedule_send(qp);

	rvt_clear_mr_refs(qp, 0);

	memset(&wc, 0, sizeof(wc));
	wc.qp = &qp->ibqp;
	wc.opcode = IB_WC_RECV;

	if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) {
		wc.wr_id = qp->r_wr_id;
		wc.status = err;
		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
	}
	wc.status = IB_WC_WR_FLUSH_ERR;

	if (qp->r_rq.wq) {
		struct rvt_rwq *wq;
		u32 head;
		u32 tail;

		spin_lock(&qp->r_rq.lock);

		/* sanity check pointers before trusting them */
		wq = qp->r_rq.wq;
		head = wq->head;
		if (head >= qp->r_rq.size)
			head = 0;
		tail = wq->tail;
		if (tail >= qp->r_rq.size)
			tail = 0;
		while (tail != head) {
			wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
			if (++tail >= qp->r_rq.size)
				tail = 0;
			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
		}
		wq->tail = tail;

		spin_unlock(&qp->r_rq.lock);
	} else if (qp->ibqp.event_handler) {
		ret = 1;
	}

bail:
	return ret;
}
EXPORT_SYMBOL(rvt_error_qp);
/*
 * Put the QP into the hash table.
 * The hash table holds a reference to the QP.
 */
static void rvt_insert_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
	unsigned long flags;

	rvt_get_qp(qp);
	spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);

	if (qp->ibqp.qp_num <= 1) {
		rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp);
	} else {
		u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);

		qp->next = rdi->qp_dev->qp_table[n];
		rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp);
		trace_rvt_qpinsert(qp, n);
	}

	spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
}
/**
 * rvt_modify_qp - modify the attributes of a queue pair
 * @ibqp: the queue pair whose attributes we're modifying
 * @attr: the new attributes
 * @attr_mask: the mask of attributes to modify
 * @udata: user data for libibverbs.so
 *
 * Return: 0 on success, otherwise returns an errno.
 */
int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
		  int attr_mask, struct ib_udata *udata)
{
	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
	enum ib_qp_state cur_state, new_state;
	struct ib_event ev;
	int lastwqe = 0;
	int mig = 0;
	int pmtu = 0; /* for gcc warning only */
	enum rdma_link_layer link;

	link = rdma_port_get_link_layer(ibqp->device, qp->port_num);

	spin_lock_irq(&qp->r_lock);
	spin_lock(&qp->s_hlock);
	spin_lock(&qp->s_lock);

	cur_state = attr_mask & IB_QP_CUR_STATE ?
		attr->cur_qp_state : qp->state;
	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;

	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
				attr_mask, link))
		goto inval;

	if (rdi->driver_f.check_modify_qp &&
	    rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata))
		goto inval;

	if (attr_mask & IB_QP_AV) {
		if (rdma_ah_get_dlid(&attr->ah_attr) >=
		    be16_to_cpu(IB_MULTICAST_LID_BASE))
			goto inval;
		if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr))
			goto inval;
	}

	if (attr_mask & IB_QP_ALT_PATH) {
		if (rdma_ah_get_dlid(&attr->alt_ah_attr) >=
		    be16_to_cpu(IB_MULTICAST_LID_BASE))
			goto inval;
		if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
			goto inval;
		if (attr->alt_pkey_index >= rvt_get_npkeys(rdi))
			goto inval;
	}

	if (attr_mask & IB_QP_PKEY_INDEX)
		if (attr->pkey_index >= rvt_get_npkeys(rdi))
			goto inval;

	if (attr_mask & IB_QP_MIN_RNR_TIMER)
		if (attr->min_rnr_timer > 31)
			goto inval;

	if (attr_mask & IB_QP_PORT)
		if (qp->ibqp.qp_type == IB_QPT_SMI ||
		    qp->ibqp.qp_type == IB_QPT_GSI ||
		    attr->port_num == 0 ||
		    attr->port_num > ibqp->device->phys_port_cnt)
			goto inval;

	if (attr_mask & IB_QP_DEST_QPN)
		if (attr->dest_qp_num > RVT_QPN_MASK)
			goto inval;

	if (attr_mask & IB_QP_RETRY_CNT)
		if (attr->retry_cnt > 7)
			goto inval;

	if (attr_mask & IB_QP_RNR_RETRY)
		if (attr->rnr_retry > 7)
			goto inval;

	/*
	 * Don't allow invalid path_mtu values. OK to set greater
	 * than the active mtu (or even the max_cap, if we have tuned
	 * that to a small mtu). We'll set qp->path_mtu
	 * to the lesser of requested attribute mtu and active,
	 * for packetizing messages.
	 * Note that the QP port has to be set in INIT and MTU in RTR.
	 */
	if (attr_mask & IB_QP_PATH_MTU) {
		pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr);
		if (pmtu < 0)
			goto inval;
	}

	if (attr_mask & IB_QP_PATH_MIG_STATE) {
		if (attr->path_mig_state == IB_MIG_REARM) {
			if (qp->s_mig_state == IB_MIG_ARMED)
				goto inval;
			if (new_state != IB_QPS_RTS)
				goto inval;
		} else if (attr->path_mig_state == IB_MIG_MIGRATED) {
			if (qp->s_mig_state == IB_MIG_REARM)
				goto inval;
			if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
				goto inval;
			if (qp->s_mig_state == IB_MIG_ARMED)
				mig = 1;
		} else {
			goto inval;
		}
	}

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
		if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic)
			goto inval;

	switch (new_state) {
	case IB_QPS_RESET:
		if (qp->state != IB_QPS_RESET)
			rvt_reset_qp(rdi, qp, ibqp->qp_type);
		break;

	case IB_QPS_RTR:
		/* Allow event to re-trigger if QP set to RTR more than once */
		qp->r_flags &= ~RVT_R_COMM_EST;
		qp->state = new_state;
		break;

	case IB_QPS_SQD:
		qp->s_draining = qp->s_last != qp->s_cur;
		qp->state = new_state;
		break;

	case IB_QPS_SQE:
		if (qp->ibqp.qp_type == IB_QPT_RC)
			goto inval;
		qp->state = new_state;
		break;

	case IB_QPS_ERR:
		lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
		break;

	default:
		qp->state = new_state;
		break;
	}

	if (attr_mask & IB_QP_PKEY_INDEX)
		qp->s_pkey_index = attr->pkey_index;

	if (attr_mask & IB_QP_PORT)
		qp->port_num = attr->port_num;

	if (attr_mask & IB_QP_DEST_QPN)
		qp->remote_qpn = attr->dest_qp_num;

	if (attr_mask & IB_QP_SQ_PSN) {
		qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask;
		qp->s_psn = qp->s_next_psn;
		qp->s_sending_psn = qp->s_next_psn;
		qp->s_last_psn = qp->s_next_psn - 1;
		qp->s_sending_hpsn = qp->s_last_psn;
	}

	if (attr_mask & IB_QP_RQ_PSN)
		qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask;

	if (attr_mask & IB_QP_ACCESS_FLAGS)
		qp->qp_access_flags = attr->qp_access_flags;

	if (attr_mask & IB_QP_AV) {
		qp->remote_ah_attr = attr->ah_attr;
		qp->s_srate = rdma_ah_get_static_rate(&attr->ah_attr);
		qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
	}

	if (attr_mask & IB_QP_ALT_PATH) {
		qp->alt_ah_attr = attr->alt_ah_attr;
		qp->s_alt_pkey_index = attr->alt_pkey_index;
	}

	if (attr_mask & IB_QP_PATH_MIG_STATE) {
		qp->s_mig_state = attr->path_mig_state;
		if (mig) {
			qp->remote_ah_attr = qp->alt_ah_attr;
			qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr);
			qp->s_pkey_index = qp->s_alt_pkey_index;
		}
	}

	if (attr_mask & IB_QP_PATH_MTU) {
		qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu);
		qp->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu);
		qp->log_pmtu = ilog2(qp->pmtu);
	}

	if (attr_mask & IB_QP_RETRY_CNT) {
		qp->s_retry_cnt = attr->retry_cnt;
		qp->s_retry = attr->retry_cnt;
	}

	if (attr_mask & IB_QP_RNR_RETRY) {
		qp->s_rnr_retry_cnt = attr->rnr_retry;
		qp->s_rnr_retry = attr->rnr_retry;
	}

	if (attr_mask & IB_QP_MIN_RNR_TIMER)
		qp->r_min_rnr_timer = attr->min_rnr_timer;

	if (attr_mask & IB_QP_TIMEOUT) {
		qp->timeout = attr->timeout;
		qp->timeout_jiffies =
			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
				1000UL);
	}

	if (attr_mask & IB_QP_QKEY)
		qp->qkey = attr->qkey;

	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
		qp->r_max_rd_atomic = attr->max_dest_rd_atomic;

	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
		qp->s_max_rd_atomic = attr->max_rd_atomic;

	if (rdi->driver_f.modify_qp)
		rdi->driver_f.modify_qp(qp, attr, attr_mask, udata);

	spin_unlock(&qp->s_lock);
	spin_unlock(&qp->s_hlock);
	spin_unlock_irq(&qp->r_lock);

	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
		rvt_insert_qp(rdi, qp);

	if (lastwqe) {
		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
	if (mig) {
		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_PATH_MIG;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
	return 0;

inval:
	spin_unlock(&qp->s_lock);
	spin_unlock(&qp->s_hlock);
	spin_unlock_irq(&qp->r_lock);
	return -EINVAL;
}
/**
 * rvt_free_qpn - Free a qpn from the bit map
 * @qpt: QP table
 * @qpn: queue pair number to free
 */
static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
{
	struct rvt_qpn_map *map;

	map = qpt->map + qpn / RVT_BITS_PER_PAGE;
	if (map->page)
		clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
}
/**
 * rvt_destroy_qp - destroy a queue pair
 * @ibqp: the queue pair to destroy
 *
 * Note that this can be called while the QP is actively sending or
 * receiving!
 *
 * Return: 0 on success.
 */
int rvt_destroy_qp(struct ib_qp *ibqp)
{
	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);

	spin_lock_irq(&qp->r_lock);
	spin_lock(&qp->s_hlock);
	spin_lock(&qp->s_lock);
	rvt_reset_qp(rdi, qp, ibqp->qp_type);
	spin_unlock(&qp->s_lock);
	spin_unlock(&qp->s_hlock);
	spin_unlock_irq(&qp->r_lock);

	/* qpn is now available for use again */
	rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);

	spin_lock(&rdi->n_qps_lock);
	rdi->n_qps_allocated--;
	if (qp->ibqp.qp_type == IB_QPT_RC) {
		rdi->n_rc_qps--;
		rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
	}
	spin_unlock(&rdi->n_qps_lock);

	if (qp->ip)
		kref_put(&qp->ip->ref, rvt_release_mmap_info);
	else
		vfree(qp->r_rq.wq);
	vfree(qp->s_wq);
	rdi->driver_f.qp_priv_free(rdi, qp);
	kfree(qp->s_ack_queue);
	kfree(qp);
	return 0;
}
/**
 * rvt_query_qp - query an ibqp
 * @ibqp: IB qp to query
 * @attr: attr struct to fill in
 * @attr_mask: attr mask ignored
 * @init_attr: struct to fill in
 *
 * Return: always 0
 */
int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
		 int attr_mask, struct ib_qp_init_attr *init_attr)
{
	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);

	attr->qp_state = qp->state;
	attr->cur_qp_state = attr->qp_state;
	attr->path_mtu = qp->path_mtu;
	attr->path_mig_state = qp->s_mig_state;
	attr->qkey = qp->qkey;
	attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask;
	attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
	attr->dest_qp_num = qp->remote_qpn;
	attr->qp_access_flags = qp->qp_access_flags;
	attr->cap.max_send_wr = qp->s_size - 1 -
		rdi->dparms.reserved_operations;
	attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
	attr->cap.max_send_sge = qp->s_max_sge;
	attr->cap.max_recv_sge = qp->r_rq.max_sge;
	attr->cap.max_inline_data = 0;
	attr->ah_attr = qp->remote_ah_attr;
	attr->alt_ah_attr = qp->alt_ah_attr;
	attr->pkey_index = qp->s_pkey_index;
	attr->alt_pkey_index = qp->s_alt_pkey_index;
	attr->en_sqd_async_notify = 0;
	attr->sq_draining = qp->s_draining;
	attr->max_rd_atomic = qp->s_max_rd_atomic;
	attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
	attr->min_rnr_timer = qp->r_min_rnr_timer;
	attr->port_num = qp->port_num;
	attr->timeout = qp->timeout;
	attr->retry_cnt = qp->s_retry_cnt;
	attr->rnr_retry = qp->s_rnr_retry_cnt;
	attr->alt_port_num =
		rdma_ah_get_port_num(&qp->alt_ah_attr);
	attr->alt_timeout = qp->alt_timeout;

	init_attr->event_handler = qp->ibqp.event_handler;
	init_attr->qp_context = qp->ibqp.qp_context;
	init_attr->send_cq = qp->ibqp.send_cq;
	init_attr->recv_cq = qp->ibqp.recv_cq;
	init_attr->srq = qp->ibqp.srq;
	init_attr->cap = attr->cap;
	if (qp->s_flags & RVT_S_SIGNAL_REQ_WR)
		init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
	else
		init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
	init_attr->qp_type = qp->ibqp.qp_type;
	init_attr->port_num = qp->port_num;
	return 0;
}
/**
 * rvt_post_recv - post a receive on a QP
 * @ibqp: the QP to post the receive on
 * @wr: the WR to post
 * @bad_wr: the first bad WR is put here
 *
 * This may be called from interrupt context.
 *
 * Return: 0 on success otherwise errno
 */
int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
		  struct ib_recv_wr **bad_wr)
{
	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
	struct rvt_rwq *wq = qp->r_rq.wq;
	unsigned long flags;
	int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
				!qp->ibqp.srq;

	/* Check that state is OK to post receive. */
	if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) {
		*bad_wr = wr;
		return -EINVAL;
	}

	for (; wr; wr = wr->next) {
		struct rvt_rwqe *wqe;
		u32 next;
		int i;

		if ((unsigned)wr->num_sge > qp->r_rq.max_sge) {
			*bad_wr = wr;
			return -EINVAL;
		}

		spin_lock_irqsave(&qp->r_rq.lock, flags);
		next = wq->head + 1;
		if (next >= qp->r_rq.size)
			next = 0;
		if (next == wq->tail) {
			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
			*bad_wr = wr;
			return -ENOMEM;
		}
		if (unlikely(qp_err_flush)) {
			struct ib_wc wc;

			memset(&wc, 0, sizeof(wc));
			wc.qp = &qp->ibqp;
			wc.opcode = IB_WC_RECV;
			wc.wr_id = wr->wr_id;
			wc.status = IB_WC_WR_FLUSH_ERR;
			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
		} else {
			wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
			wqe->wr_id = wr->wr_id;
			wqe->num_sge = wr->num_sge;
			for (i = 0; i < wr->num_sge; i++)
				wqe->sg_list[i] = wr->sg_list[i];
			/*
			 * Make sure queue entry is written
			 * before the head index.
			 */
			smp_wmb();
			wq->head = next;
		}
		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
	}
	return 0;
}
/**
 * rvt_qp_valid_operation - validate post send wr request
 * @qp - the qp
 * @post_parms - the post send table for the driver
 * @wr - the work request
 *
 * The routine validates the operation based on the
 * validation table and returns the length of the operation
 * which can extend beyond the ib_send_wr.  Operation
 * dependent flags key atomic operation validation.
 *
 * There is an exception for UD qps that validates the pd and
 * overrides the length to include the additional UD specific
 * length.
 *
 * Returns a negative error or the length of the work request
 * for building the swqe.
 */
static inline int rvt_qp_valid_operation(
	struct rvt_qp *qp,
	const struct rvt_operation_params *post_parms,
	struct ib_send_wr *wr)
{
	int len;

	if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length)
		return -EINVAL;
	if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
		return -EINVAL;
	if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
	    ibpd_to_rvtpd(qp->ibqp.pd)->user)
		return -EINVAL;
	if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
	    (wr->num_sge == 0 ||
	     wr->sg_list[0].length < sizeof(u64) ||
	     wr->sg_list[0].addr & (sizeof(u64) - 1)))
		return -EINVAL;
	if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
	    !qp->s_max_rd_atomic)
		return -EINVAL;
	len = post_parms[wr->opcode].length;
	/* UD */
	if (qp->ibqp.qp_type != IB_QPT_UC &&
	    qp->ibqp.qp_type != IB_QPT_RC) {
		if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
			return -EINVAL;
		len = sizeof(struct ib_ud_wr);
	}
	return len;
}
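
/*
 * Illustrative usage: rvt_post_one_wr() below calls this first and uses
 * the returned length as the number of bytes of the work request to
 * copy into the swqe:
 *
 *	ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
 *	if (ret < 0)
 *		return ret;
 *	cplen = ret;
 *
 * For UD QPs the length is sizeof(struct ib_ud_wr), which is larger
 * than the bare struct ib_send_wr.
 */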
/**
 * rvt_qp_is_avail - determine queue capacity
 * @qp - the qp
 * @rdi - the rdmavt device
 * @reserved_op - is reserved operation
 *
 * This assumes the s_hlock is held but the s_last
 * qp variable is uncontrolled.
 *
 * For non reserved operations, the qp->s_avail
 * may be changed.
 *
 * The return value is zero or a -ENOMEM.
 */
static inline int rvt_qp_is_avail(
	struct rvt_qp *qp,
	struct rvt_dev_info *rdi,
	bool reserved_op)
{
	u32 slast;
	u32 avail;
	u32 reserved_used;

	/* see rvt_qp_wqe_unreserve() */
	smp_mb__before_atomic();
	reserved_used = atomic_read(&qp->s_reserved_used);
	if (unlikely(reserved_op)) {
		/* see rvt_qp_wqe_unreserve() */
		smp_mb__before_atomic();
		if (reserved_used >= rdi->dparms.reserved_operations)
			return -ENOMEM;
		return 0;
	}
	/* non-reserved operations */
	if (likely(qp->s_avail))
		return 0;
	smp_read_barrier_depends(); /* see rc.c */
	slast = ACCESS_ONCE(qp->s_last);
	if (qp->s_head >= slast)
		avail = qp->s_size - (qp->s_head - slast);
	else
		avail = slast - qp->s_head;

	/* see rvt_qp_wqe_unreserve() */
	smp_mb__before_atomic();
	reserved_used = atomic_read(&qp->s_reserved_used);
	avail = avail - 1 -
		(rdi->dparms.reserved_operations - reserved_used);
	/* ensure we don't assign a negative s_avail */
	if ((s32)avail <= 0)
		return -ENOMEM;
	qp->s_avail = avail;
	if (WARN_ON(qp->s_avail >
		    (qp->s_size - 1 - rdi->dparms.reserved_operations)))
		rvt_pr_err(rdi,
			   "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
			   qp->ibqp.qp_num, qp->s_size, qp->s_avail,
			   qp->s_head, qp->s_tail, qp->s_cur,
			   qp->s_acked, qp->s_last);
	return 0;
}
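
/*
 * Worked example (illustrative): with s_size = 16, s_head = 10 and
 * s_last = 4, the ring holds 16 - (10 - 4) = 10 free slots; one slot is
 * then kept back to distinguish full from empty, and any unused portion
 * of dparms.reserved_operations is subtracted before s_avail is
 * updated.
 */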
/**
 * rvt_post_one_wr - post one RC, UC, or UD send work request
 * @qp: the QP to post on
 * @wr: the work request to send
 */
static int rvt_post_one_wr(struct rvt_qp *qp,
			   struct ib_send_wr *wr,
			   int *call_send)
{
	struct rvt_swqe *wqe;
	u32 next;
	int i;
	int j;
	int acc;
	struct rvt_lkey_table *rkt;
	struct rvt_pd *pd;
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
	u8 log_pmtu;
	int ret;
	size_t cplen;
	bool reserved_op;
	int local_ops_delayed = 0;

	BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));

	/* IB spec says that num_sge == 0 is OK. */
	if (unlikely(wr->num_sge > qp->s_max_sge))
		return -EINVAL;

	ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
	if (ret < 0)
		return ret;
	cplen = ret;

	/*
	 * Local operations include fast register and local invalidate.
	 * Fast register needs to be processed immediately because the
	 * registered lkey may be used by following work requests and the
	 * lkey needs to be valid at the time those requests are posted.
	 * Local invalidate can be processed immediately if fencing is
	 * not required and no previous local invalidate ops are pending.
	 * Signaled local operations that have been processed immediately
	 * need to have requests with "completion only" flags set posted
	 * to the send queue in order to generate completions.
	 */
	if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
		switch (wr->opcode) {
		case IB_WR_REG_MR:
			ret = rvt_fast_reg_mr(qp,
					      reg_wr(wr)->mr,
					      reg_wr(wr)->key,
					      reg_wr(wr)->access);
			if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
				return ret;
			break;
		case IB_WR_LOCAL_INV:
			if ((wr->send_flags & IB_SEND_FENCE) ||
			    atomic_read(&qp->local_ops_pending)) {
				local_ops_delayed = 1;
			} else {
				ret = rvt_invalidate_rkey(
					qp, wr->ex.invalidate_rkey);
				if (ret || !(wr->send_flags &
					     IB_SEND_SIGNALED))
					return ret;
			}
			break;
		default:
			return -EINVAL;
		}
	}

	reserved_op = rdi->post_parms[wr->opcode].flags &
			RVT_OPERATION_USE_RESERVE;
	/* check for avail */
	ret = rvt_qp_is_avail(qp, rdi, reserved_op);
	if (ret)
		return ret;
	next = qp->s_head + 1;
	if (next >= qp->s_size)
		next = 0;

	rkt = &rdi->lkey_table;
	pd = ibpd_to_rvtpd(qp->ibqp.pd);
	wqe = rvt_get_swqe_ptr(qp, qp->s_head);

	/* cplen has length from above */
	memcpy(&wqe->wr, wr, cplen);

	wqe->length = 0;
	j = 0;
	if (wr->num_sge) {
		acc = wr->opcode >= IB_WR_RDMA_READ ?
			IB_ACCESS_LOCAL_WRITE : 0;
		for (i = 0; i < wr->num_sge; i++) {
			u32 length = wr->sg_list[i].length;
			int ok;

			if (length == 0)
				continue;
			ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j],
					 &wr->sg_list[i], acc);
			if (!ok) {
				ret = -EINVAL;
				goto bail_inval_free;
			}
			wqe->length += length;
			j++;
		}
		wqe->wr.num_sge = j;
	}

	/* general part of wqe valid - allow for driver checks */
	if (rdi->driver_f.check_send_wqe) {
		ret = rdi->driver_f.check_send_wqe(qp, wqe);
		if (ret < 0)
			goto bail_inval_free;
		if (ret)
			*call_send = ret;
	}

	log_pmtu = qp->log_pmtu;
	if (qp->ibqp.qp_type != IB_QPT_UC &&
	    qp->ibqp.qp_type != IB_QPT_RC) {
		struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);

		log_pmtu = ah->log_pmtu;
		atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
	}

	if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
		if (local_ops_delayed)
			atomic_inc(&qp->local_ops_pending);
		else
			wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
		wqe->ssn = 0;
		wqe->psn = 0;
		wqe->lpsn = 0;
	} else {
		wqe->ssn = qp->s_ssn++;
		wqe->psn = qp->s_next_psn;
		wqe->lpsn = wqe->psn +
				(wqe->length ?
					((wqe->length - 1) >> log_pmtu) :
					0);
		qp->s_next_psn = wqe->lpsn + 1;
	}

	if (unlikely(reserved_op)) {
		wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
		rvt_qp_wqe_reserve(qp, wqe);
	} else {
		wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED;
		qp->s_avail--;
	}
	trace_rvt_post_one_wr(qp, wqe);
	smp_wmb(); /* see request builders */
	qp->s_head = next;

	return 0;

bail_inval_free:
	/* release mr holds */
	while (j) {
		struct rvt_sge *sge = &wqe->sg_list[--j];

		rvt_put_mr(sge->mr);
	}
	return ret;
}
/**
 * rvt_post_send - post a send on a QP
 * @ibqp: the QP to post the send on
 * @wr: the list of work requests to post
 * @bad_wr: the first bad WR is put here
 *
 * This may be called from interrupt context.
 *
 * Return: 0 on success else errno
 */
int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
		  struct ib_send_wr **bad_wr)
{
	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
	unsigned long flags = 0;
	int call_send;
	unsigned nreq = 0;
	int err = 0;

	spin_lock_irqsave(&qp->s_hlock, flags);

	/*
	 * Ensure QP state is such that we can send. If not bail out early,
	 * there is no need to do this every time we post a send.
	 */
	if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
		spin_unlock_irqrestore(&qp->s_hlock, flags);
		*bad_wr = wr;
		return -EINVAL;
	}

	/*
	 * If the send queue is empty, and we only have a single WR then just go
	 * ahead and kick the send engine into gear. Otherwise we will always
	 * just schedule the send to happen later.
	 */
	call_send = qp->s_head == ACCESS_ONCE(qp->s_last) && !wr->next;

	for (; wr; wr = wr->next) {
		err = rvt_post_one_wr(qp, wr, &call_send);
		if (unlikely(err)) {
			*bad_wr = wr;
			goto bail;
		}
		nreq++;
	}
bail:
	spin_unlock_irqrestore(&qp->s_hlock, flags);
	if (nreq) {
		if (call_send)
			rdi->driver_f.do_send(qp);
		else
			rdi->driver_f.schedule_send_no_lock(qp);
	}
	return err;
}
/**
 * rvt_post_srq_recv - post a receive on a shared receive queue
 * @ibsrq: the SRQ to post the receive on
 * @wr: the list of work requests to post
 * @bad_wr: A pointer to the first WR to cause a problem is put here
 *
 * This may be called from interrupt context.
 *
 * Return: 0 on success else errno
 */
int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
		      struct ib_recv_wr **bad_wr)
{
	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
	struct rvt_rwq *wq;
	unsigned long flags;

	for (; wr; wr = wr->next) {
		struct rvt_rwqe *wqe;
		u32 next;
		int i;

		if ((unsigned)wr->num_sge > srq->rq.max_sge) {
			*bad_wr = wr;
			return -EINVAL;
		}

		spin_lock_irqsave(&srq->rq.lock, flags);
		wq = srq->rq.wq;
		next = wq->head + 1;
		if (next >= srq->rq.size)
			next = 0;
		if (next == wq->tail) {
			spin_unlock_irqrestore(&srq->rq.lock, flags);
			*bad_wr = wr;
			return -ENOMEM;
		}

		wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
		wqe->wr_id = wr->wr_id;
		wqe->num_sge = wr->num_sge;
		for (i = 0; i < wr->num_sge; i++)
			wqe->sg_list[i] = wr->sg_list[i];
		/* Make sure queue entry is written before the head index. */
		smp_wmb();
		wq->head = next;
		spin_unlock_irqrestore(&srq->rq.lock, flags);
	}
	return 0;
}
/**
 * rvt_comm_est - handle trap with QP established
 * @qp: the QP
 */
void rvt_comm_est(struct rvt_qp *qp)
{
	qp->r_flags |= RVT_R_COMM_EST;
	if (qp->ibqp.event_handler) {
		struct ib_event ev;

		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_COMM_EST;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
}
EXPORT_SYMBOL(rvt_comm_est);
void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
{
	unsigned long flags;
	int lastwqe;

	spin_lock_irqsave(&qp->s_lock, flags);
	lastwqe = rvt_error_qp(qp, err);
	spin_unlock_irqrestore(&qp->s_lock, flags);

	if (lastwqe) {
		struct ib_event ev;

		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
}
EXPORT_SYMBOL(rvt_rc_error);
/*
 * rvt_rnr_tbl_to_usec - return index into ib_rvt_rnr_table
 * @index - the index
 * return usec from an index into ib_rvt_rnr_table
 */
unsigned long rvt_rnr_tbl_to_usec(u32 index)
{
	return ib_rvt_rnr_table[(index & IB_AETH_CREDIT_MASK)];
}
EXPORT_SYMBOL(rvt_rnr_tbl_to_usec);
static inline unsigned long rvt_aeth_to_usec(u32 aeth)
{
	return ib_rvt_rnr_table[(aeth >> IB_AETH_CREDIT_SHIFT) &
				  IB_AETH_CREDIT_MASK];
}
/*
 * rvt_add_retry_timer - add/start a retry timer
 * @qp - the QP
 * add a retry timer on the QP
 */
void rvt_add_retry_timer(struct rvt_qp *qp)
{
	struct ib_qp *ibqp = &qp->ibqp;
	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);

	lockdep_assert_held(&qp->s_lock);
	qp->s_flags |= RVT_S_TIMER;
	/* 4.096 usec. * (1 << qp->timeout) */
	qp->s_timer.expires = jiffies + qp->timeout_jiffies +
			      rdi->busy_jiffies;
	add_timer(&qp->s_timer);
}
EXPORT_SYMBOL(rvt_add_retry_timer);
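
/*
 * Worked example (illustrative): the IB timeout field encodes
 * 4.096 usec * 2^timeout, so qp->timeout = 14 gives
 * usecs_to_jiffies(4096UL * (1UL << 14) / 1000UL), roughly 67 ms,
 * before busy_jiffies is added on top.
 */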
/**
 * rvt_add_rnr_timer - add/start an rnr timer
 * @qp - the QP
 * @aeth - aeth of RNR timeout, simulated aeth for loopback
 * add an rnr timer on the QP
 */
void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth)
{
	u32 to;

	lockdep_assert_held(&qp->s_lock);
	qp->s_flags |= RVT_S_WAIT_RNR;
	to = rvt_aeth_to_usec(aeth);
	hrtimer_start(&qp->s_rnr_timer,
		      ns_to_ktime(1000 * to), HRTIMER_MODE_REL);
}
EXPORT_SYMBOL(rvt_add_rnr_timer);
/**
 * rvt_stop_rc_timers - stop all timers
 * @qp - the QP
 * stop any pending timers
 */
void rvt_stop_rc_timers(struct rvt_qp *qp)
{
	lockdep_assert_held(&qp->s_lock);
	/* Remove QP from all timers */
	if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
		del_timer(&qp->s_timer);
		hrtimer_try_to_cancel(&qp->s_rnr_timer);
	}
}
EXPORT_SYMBOL(rvt_stop_rc_timers);
/**
 * rvt_stop_rnr_timer - stop an rnr timer
 * @qp - the QP
 *
 * stop an rnr timer and return if the timer
 * was pending.
 */
static int rvt_stop_rnr_timer(struct rvt_qp *qp)
{
	int rval = 0;

	lockdep_assert_held(&qp->s_lock);
	/* Remove QP from rnr timer */
	if (qp->s_flags & RVT_S_WAIT_RNR) {
		qp->s_flags &= ~RVT_S_WAIT_RNR;
		rval = hrtimer_try_to_cancel(&qp->s_rnr_timer);
	}
	return rval;
}
/**
 * rvt_del_timers_sync - wait for any timeout routines to exit
 * @qp - the QP
 */
void rvt_del_timers_sync(struct rvt_qp *qp)
{
	del_timer_sync(&qp->s_timer);
	hrtimer_cancel(&qp->s_rnr_timer);
}
EXPORT_SYMBOL(rvt_del_timers_sync);
/*
 * This is called from s_timer for missing responses.
 */
static void rvt_rc_timeout(unsigned long arg)
{
	struct rvt_qp *qp = (struct rvt_qp *)arg;
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
	unsigned long flags;

	spin_lock_irqsave(&qp->r_lock, flags);
	spin_lock(&qp->s_lock);
	if (qp->s_flags & RVT_S_TIMER) {
		struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];

		qp->s_flags &= ~RVT_S_TIMER;
		rvp->n_rc_timeouts++;
		del_timer(&qp->s_timer);
		trace_rvt_rc_timeout(qp, qp->s_last_psn + 1);
		if (rdi->driver_f.notify_restart_rc)
			rdi->driver_f.notify_restart_rc(qp,
							qp->s_last_psn + 1,
							1);
		rdi->driver_f.schedule_send(qp);
	}
	spin_unlock(&qp->s_lock);
	spin_unlock_irqrestore(&qp->r_lock, flags);
}
/*
 * This is called from s_timer for RNR timeouts.
 */
enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t)
{
	struct rvt_qp *qp = container_of(t, struct rvt_qp, s_rnr_timer);
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);
	rvt_stop_rnr_timer(qp);
	rdi->driver_f.schedule_send(qp);
	spin_unlock_irqrestore(&qp->s_lock, flags);
	return HRTIMER_NORESTART;
}
EXPORT_SYMBOL(rvt_rc_rnr_retry);