]> git.proxmox.com Git - mirror_ubuntu-eoan-kernel.git/blob - net/sunrpc/xprtrdma/verbs.c
Merge tag 'nfsd-4.18' of git://linux-nfs.org/~bfields/linux
[mirror_ubuntu-eoan-kernel.git] / net / sunrpc / xprtrdma / verbs.c
1 /*
2 * Copyright (c) 2014-2017 Oracle. All rights reserved.
3 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the BSD-type
9 * license below:
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 *
15 * Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 *
18 * Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials provided
21 * with the distribution.
22 *
23 * Neither the name of the Network Appliance, Inc. nor the names of
24 * its contributors may be used to endorse or promote products
25 * derived from this software without specific prior written
26 * permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
29 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
31 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
32 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
33 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
34 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
35 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
36 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
38 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 */
40
41 /*
42 * verbs.c
43 *
44 * Encapsulates the major functions managing:
45 * o adapters
46 * o endpoints
47 * o connections
48 * o buffer memory
49 */
50
51 #include <linux/interrupt.h>
52 #include <linux/slab.h>
53 #include <linux/sunrpc/addr.h>
54 #include <linux/sunrpc/svc_rdma.h>
55
56 #include <asm-generic/barrier.h>
57 #include <asm/bitops.h>
58
59 #include <rdma/ib_cm.h>
60
61 #include "xprt_rdma.h"
62 #include <trace/events/rpcrdma.h>
63
64 /*
65 * Globals/Macros
66 */
67
68 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
69 # define RPCDBG_FACILITY RPCDBG_TRANS
70 #endif
71
72 /*
73 * internal functions
74 */
75 static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
76 static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
77 static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
78
79 struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
80
81 int
82 rpcrdma_alloc_wq(void)
83 {
84 struct workqueue_struct *recv_wq;
85
86 recv_wq = alloc_workqueue("xprtrdma_receive",
87 WQ_MEM_RECLAIM | WQ_HIGHPRI,
88 0);
89 if (!recv_wq)
90 return -ENOMEM;
91
92 rpcrdma_receive_wq = recv_wq;
93 return 0;
94 }
95
96 void
97 rpcrdma_destroy_wq(void)
98 {
99 struct workqueue_struct *wq;
100
101 if (rpcrdma_receive_wq) {
102 wq = rpcrdma_receive_wq;
103 rpcrdma_receive_wq = NULL;
104 destroy_workqueue(wq);
105 }
106 }
107
108 static void
109 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
110 {
111 struct rpcrdma_ep *ep = context;
112 struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
113 rx_ep);
114
115 trace_xprtrdma_qp_error(r_xprt, event);
116 pr_err("rpcrdma: %s on device %s ep %p\n",
117 ib_event_msg(event->event), event->device->name, context);
118
119 if (ep->rep_connected == 1) {
120 ep->rep_connected = -EIO;
121 rpcrdma_conn_func(ep);
122 wake_up_all(&ep->rep_connect_wait);
123 }
124 }
125
126 /**
127 * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
128 * @cq: completion queue (ignored)
129 * @wc: completed WR
130 *
131 */
132 static void
133 rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
134 {
135 struct ib_cqe *cqe = wc->wr_cqe;
136 struct rpcrdma_sendctx *sc =
137 container_of(cqe, struct rpcrdma_sendctx, sc_cqe);
138
139 /* WARNING: Only wr_cqe and status are reliable at this point */
140 trace_xprtrdma_wc_send(sc, wc);
141 if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)
142 pr_err("rpcrdma: Send: %s (%u/0x%x)\n",
143 ib_wc_status_msg(wc->status),
144 wc->status, wc->vendor_err);
145
146 rpcrdma_sendctx_put_locked(sc);
147 }
148
149 /**
150 * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
151 * @cq: completion queue (ignored)
152 * @wc: completed WR
153 *
154 */
155 static void
156 rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
157 {
158 struct ib_cqe *cqe = wc->wr_cqe;
159 struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
160 rr_cqe);
161
162 /* WARNING: Only wr_id and status are reliable at this point */
163 trace_xprtrdma_wc_receive(rep, wc);
164 if (wc->status != IB_WC_SUCCESS)
165 goto out_fail;
166
167 /* status == SUCCESS means all fields in wc are trustworthy */
168 rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
169 rep->rr_wc_flags = wc->wc_flags;
170 rep->rr_inv_rkey = wc->ex.invalidate_rkey;
171
172 ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
173 rdmab_addr(rep->rr_rdmabuf),
174 wc->byte_len, DMA_FROM_DEVICE);
175
176 out_schedule:
177 rpcrdma_reply_handler(rep);
178 return;
179
180 out_fail:
181 if (wc->status != IB_WC_WR_FLUSH_ERR)
182 pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
183 ib_wc_status_msg(wc->status),
184 wc->status, wc->vendor_err);
185 rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0);
186 goto out_schedule;
187 }
188
189 static void
190 rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
191 struct rdma_conn_param *param)
192 {
193 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
194 const struct rpcrdma_connect_private *pmsg = param->private_data;
195 unsigned int rsize, wsize;
196
197 /* Default settings for RPC-over-RDMA Version One */
198 r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
199 rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
200 wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
201
202 if (pmsg &&
203 pmsg->cp_magic == rpcrdma_cmp_magic &&
204 pmsg->cp_version == RPCRDMA_CMP_VERSION) {
205 r_xprt->rx_ia.ri_implicit_roundup = true;
206 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
207 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
208 }
209
210 if (rsize < cdata->inline_rsize)
211 cdata->inline_rsize = rsize;
212 if (wsize < cdata->inline_wsize)
213 cdata->inline_wsize = wsize;
214 dprintk("RPC: %s: max send %u, max recv %u\n",
215 __func__, cdata->inline_wsize, cdata->inline_rsize);
216 rpcrdma_set_max_header_sizes(r_xprt);
217 }
218
219 static int
220 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
221 {
222 struct rpcrdma_xprt *xprt = id->context;
223 struct rpcrdma_ia *ia = &xprt->rx_ia;
224 struct rpcrdma_ep *ep = &xprt->rx_ep;
225 int connstate = 0;
226
227 trace_xprtrdma_conn_upcall(xprt, event);
228 switch (event->event) {
229 case RDMA_CM_EVENT_ADDR_RESOLVED:
230 case RDMA_CM_EVENT_ROUTE_RESOLVED:
231 ia->ri_async_rc = 0;
232 complete(&ia->ri_done);
233 break;
234 case RDMA_CM_EVENT_ADDR_ERROR:
235 ia->ri_async_rc = -EHOSTUNREACH;
236 complete(&ia->ri_done);
237 break;
238 case RDMA_CM_EVENT_ROUTE_ERROR:
239 ia->ri_async_rc = -ENETUNREACH;
240 complete(&ia->ri_done);
241 break;
242 case RDMA_CM_EVENT_DEVICE_REMOVAL:
243 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
244 pr_info("rpcrdma: removing device %s for %s:%s\n",
245 ia->ri_device->name,
246 rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt));
247 #endif
248 set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
249 ep->rep_connected = -ENODEV;
250 xprt_force_disconnect(&xprt->rx_xprt);
251 wait_for_completion(&ia->ri_remove_done);
252
253 ia->ri_id = NULL;
254 ia->ri_device = NULL;
255 /* Return 1 to ensure the core destroys the id. */
256 return 1;
257 case RDMA_CM_EVENT_ESTABLISHED:
258 ++xprt->rx_xprt.connect_cookie;
259 connstate = 1;
260 rpcrdma_update_connect_private(xprt, &event->param.conn);
261 goto connected;
262 case RDMA_CM_EVENT_CONNECT_ERROR:
263 connstate = -ENOTCONN;
264 goto connected;
265 case RDMA_CM_EVENT_UNREACHABLE:
266 connstate = -ENETDOWN;
267 goto connected;
268 case RDMA_CM_EVENT_REJECTED:
269 dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
270 rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
271 rdma_reject_msg(id, event->status));
272 connstate = -ECONNREFUSED;
273 if (event->status == IB_CM_REJ_STALE_CONN)
274 connstate = -EAGAIN;
275 goto connected;
276 case RDMA_CM_EVENT_DISCONNECTED:
277 ++xprt->rx_xprt.connect_cookie;
278 connstate = -ECONNABORTED;
279 connected:
280 xprt->rx_buf.rb_credits = 1;
281 ep->rep_connected = connstate;
282 rpcrdma_conn_func(ep);
283 wake_up_all(&ep->rep_connect_wait);
284 /*FALLTHROUGH*/
285 default:
286 dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n",
287 __func__,
288 rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt),
289 ia->ri_device->name, ia->ri_ops->ro_displayname,
290 ep, rdma_event_msg(event->event));
291 break;
292 }
293
294 return 0;
295 }
296
297 static struct rdma_cm_id *
298 rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
299 {
300 unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
301 struct rdma_cm_id *id;
302 int rc;
303
304 trace_xprtrdma_conn_start(xprt);
305
306 init_completion(&ia->ri_done);
307 init_completion(&ia->ri_remove_done);
308
309 id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
310 IB_QPT_RC);
311 if (IS_ERR(id)) {
312 rc = PTR_ERR(id);
313 dprintk("RPC: %s: rdma_create_id() failed %i\n",
314 __func__, rc);
315 return id;
316 }
317
318 ia->ri_async_rc = -ETIMEDOUT;
319 rc = rdma_resolve_addr(id, NULL,
320 (struct sockaddr *)&xprt->rx_xprt.addr,
321 RDMA_RESOLVE_TIMEOUT);
322 if (rc) {
323 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
324 __func__, rc);
325 goto out;
326 }
327 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
328 if (rc < 0) {
329 trace_xprtrdma_conn_tout(xprt);
330 goto out;
331 }
332
333 rc = ia->ri_async_rc;
334 if (rc)
335 goto out;
336
337 ia->ri_async_rc = -ETIMEDOUT;
338 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
339 if (rc) {
340 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
341 __func__, rc);
342 goto out;
343 }
344 rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
345 if (rc < 0) {
346 trace_xprtrdma_conn_tout(xprt);
347 goto out;
348 }
349 rc = ia->ri_async_rc;
350 if (rc)
351 goto out;
352
353 return id;
354
355 out:
356 rdma_destroy_id(id);
357 return ERR_PTR(rc);
358 }
359
360 /*
361 * Exported functions.
362 */
363
364 /**
365 * rpcrdma_ia_open - Open and initialize an Interface Adapter.
366 * @xprt: transport with IA to (re)initialize
367 *
368 * Returns 0 on success, negative errno if an appropriate
369 * Interface Adapter could not be found and opened.
370 */
371 int
372 rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
373 {
374 struct rpcrdma_ia *ia = &xprt->rx_ia;
375 int rc;
376
377 ia->ri_id = rpcrdma_create_id(xprt, ia);
378 if (IS_ERR(ia->ri_id)) {
379 rc = PTR_ERR(ia->ri_id);
380 goto out_err;
381 }
382 ia->ri_device = ia->ri_id->device;
383
384 ia->ri_pd = ib_alloc_pd(ia->ri_device, 0);
385 if (IS_ERR(ia->ri_pd)) {
386 rc = PTR_ERR(ia->ri_pd);
387 pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
388 goto out_err;
389 }
390
391 switch (xprt_rdma_memreg_strategy) {
392 case RPCRDMA_FRWR:
393 if (frwr_is_supported(ia)) {
394 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
395 break;
396 }
397 /*FALLTHROUGH*/
398 case RPCRDMA_MTHCAFMR:
399 if (fmr_is_supported(ia)) {
400 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
401 break;
402 }
403 /*FALLTHROUGH*/
404 default:
405 pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
406 ia->ri_device->name, xprt_rdma_memreg_strategy);
407 rc = -EINVAL;
408 goto out_err;
409 }
410
411 return 0;
412
413 out_err:
414 rpcrdma_ia_close(ia);
415 return rc;
416 }
417
418 /**
419 * rpcrdma_ia_remove - Handle device driver unload
420 * @ia: interface adapter being removed
421 *
422 * Divest transport H/W resources associated with this adapter,
423 * but allow it to be restored later.
424 */
425 void
426 rpcrdma_ia_remove(struct rpcrdma_ia *ia)
427 {
428 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
429 rx_ia);
430 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
431 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
432 struct rpcrdma_req *req;
433 struct rpcrdma_rep *rep;
434
435 cancel_delayed_work_sync(&buf->rb_refresh_worker);
436
437 /* This is similar to rpcrdma_ep_destroy, but:
438 * - Don't cancel the connect worker.
439 * - Don't call rpcrdma_ep_disconnect, which waits
440 * for another conn upcall, which will deadlock.
441 * - rdma_disconnect is unneeded, the underlying
442 * connection is already gone.
443 */
444 if (ia->ri_id->qp) {
445 ib_drain_qp(ia->ri_id->qp);
446 rdma_destroy_qp(ia->ri_id);
447 ia->ri_id->qp = NULL;
448 }
449 ib_free_cq(ep->rep_attr.recv_cq);
450 ep->rep_attr.recv_cq = NULL;
451 ib_free_cq(ep->rep_attr.send_cq);
452 ep->rep_attr.send_cq = NULL;
453
454 /* The ULP is responsible for ensuring all DMA
455 * mappings and MRs are gone.
456 */
457 list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
458 rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf);
459 list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
460 rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf);
461 rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
462 rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
463 }
464 rpcrdma_mrs_destroy(buf);
465 ib_dealloc_pd(ia->ri_pd);
466 ia->ri_pd = NULL;
467
468 /* Allow waiters to continue */
469 complete(&ia->ri_remove_done);
470
471 trace_xprtrdma_remove(r_xprt);
472 }
473
474 /**
475 * rpcrdma_ia_close - Clean up/close an IA.
476 * @ia: interface adapter to close
477 *
478 */
479 void
480 rpcrdma_ia_close(struct rpcrdma_ia *ia)
481 {
482 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
483 if (ia->ri_id->qp)
484 rdma_destroy_qp(ia->ri_id);
485 rdma_destroy_id(ia->ri_id);
486 }
487 ia->ri_id = NULL;
488 ia->ri_device = NULL;
489
490 /* If the pd is still busy, xprtrdma missed freeing a resource */
491 if (ia->ri_pd && !IS_ERR(ia->ri_pd))
492 ib_dealloc_pd(ia->ri_pd);
493 ia->ri_pd = NULL;
494 }
495
496 /*
497 * Create unconnected endpoint.
498 */
499 int
500 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
501 struct rpcrdma_create_data_internal *cdata)
502 {
503 struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
504 unsigned int max_qp_wr, max_sge;
505 struct ib_cq *sendcq, *recvcq;
506 int rc;
507
508 max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge,
509 RPCRDMA_MAX_SEND_SGES);
510 if (max_sge < RPCRDMA_MIN_SEND_SGES) {
511 pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
512 return -ENOMEM;
513 }
514 ia->ri_max_send_sges = max_sge;
515
516 if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
517 dprintk("RPC: %s: insufficient wqe's available\n",
518 __func__);
519 return -ENOMEM;
520 }
521 max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1;
522
523 /* check provider's send/recv wr limits */
524 if (cdata->max_requests > max_qp_wr)
525 cdata->max_requests = max_qp_wr;
526
527 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
528 ep->rep_attr.qp_context = ep;
529 ep->rep_attr.srq = NULL;
530 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
531 ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
532 ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */
533 rc = ia->ri_ops->ro_open(ia, ep, cdata);
534 if (rc)
535 return rc;
536 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
537 ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
538 ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
539 ep->rep_attr.cap.max_send_sge = max_sge;
540 ep->rep_attr.cap.max_recv_sge = 1;
541 ep->rep_attr.cap.max_inline_data = 0;
542 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
543 ep->rep_attr.qp_type = IB_QPT_RC;
544 ep->rep_attr.port_num = ~0;
545
546 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
547 "iovs: send %d recv %d\n",
548 __func__,
549 ep->rep_attr.cap.max_send_wr,
550 ep->rep_attr.cap.max_recv_wr,
551 ep->rep_attr.cap.max_send_sge,
552 ep->rep_attr.cap.max_recv_sge);
553
554 /* set trigger for requesting send completion */
555 ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH,
556 cdata->max_requests >> 2);
557 ep->rep_send_count = ep->rep_send_batch;
558 init_waitqueue_head(&ep->rep_connect_wait);
559 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
560
561 sendcq = ib_alloc_cq(ia->ri_device, NULL,
562 ep->rep_attr.cap.max_send_wr + 1,
563 1, IB_POLL_WORKQUEUE);
564 if (IS_ERR(sendcq)) {
565 rc = PTR_ERR(sendcq);
566 dprintk("RPC: %s: failed to create send CQ: %i\n",
567 __func__, rc);
568 goto out1;
569 }
570
571 recvcq = ib_alloc_cq(ia->ri_device, NULL,
572 ep->rep_attr.cap.max_recv_wr + 1,
573 0, IB_POLL_WORKQUEUE);
574 if (IS_ERR(recvcq)) {
575 rc = PTR_ERR(recvcq);
576 dprintk("RPC: %s: failed to create recv CQ: %i\n",
577 __func__, rc);
578 goto out2;
579 }
580
581 ep->rep_attr.send_cq = sendcq;
582 ep->rep_attr.recv_cq = recvcq;
583
584 /* Initialize cma parameters */
585 memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
586
587 /* Prepare RDMA-CM private message */
588 pmsg->cp_magic = rpcrdma_cmp_magic;
589 pmsg->cp_version = RPCRDMA_CMP_VERSION;
590 pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
591 pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
592 pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
593 ep->rep_remote_cma.private_data = pmsg;
594 ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
595
596 /* Client offers RDMA Read but does not initiate */
597 ep->rep_remote_cma.initiator_depth = 0;
598 ep->rep_remote_cma.responder_resources =
599 min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom);
600
601 /* Limit transport retries so client can detect server
602 * GID changes quickly. RPC layer handles re-establishing
603 * transport connection and retransmission.
604 */
605 ep->rep_remote_cma.retry_count = 6;
606
607 /* RPC-over-RDMA handles its own flow control. In addition,
608 * make all RNR NAKs visible so we know that RPC-over-RDMA
609 * flow control is working correctly (no NAKs should be seen).
610 */
611 ep->rep_remote_cma.flow_control = 0;
612 ep->rep_remote_cma.rnr_retry_count = 0;
613
614 return 0;
615
616 out2:
617 ib_free_cq(sendcq);
618 out1:
619 return rc;
620 }
621
622 /*
623 * rpcrdma_ep_destroy
624 *
625 * Disconnect and destroy endpoint. After this, the only
626 * valid operations on the ep are to free it (if dynamically
627 * allocated) or re-create it.
628 */
629 void
630 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
631 {
632 cancel_delayed_work_sync(&ep->rep_connect_worker);
633
634 if (ia->ri_id && ia->ri_id->qp) {
635 rpcrdma_ep_disconnect(ep, ia);
636 rdma_destroy_qp(ia->ri_id);
637 ia->ri_id->qp = NULL;
638 }
639
640 if (ep->rep_attr.recv_cq)
641 ib_free_cq(ep->rep_attr.recv_cq);
642 if (ep->rep_attr.send_cq)
643 ib_free_cq(ep->rep_attr.send_cq);
644 }
645
646 /* Re-establish a connection after a device removal event.
647 * Unlike a normal reconnection, a fresh PD and a new set
648 * of MRs and buffers is needed.
649 */
650 static int
651 rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
652 struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
653 {
654 int rc, err;
655
656 trace_xprtrdma_reinsert(r_xprt);
657
658 rc = -EHOSTUNREACH;
659 if (rpcrdma_ia_open(r_xprt))
660 goto out1;
661
662 rc = -ENOMEM;
663 err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data);
664 if (err) {
665 pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
666 goto out2;
667 }
668
669 rc = -ENETUNREACH;
670 err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
671 if (err) {
672 pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
673 goto out3;
674 }
675
676 rpcrdma_mrs_create(r_xprt);
677 return 0;
678
679 out3:
680 rpcrdma_ep_destroy(ep, ia);
681 out2:
682 rpcrdma_ia_close(ia);
683 out1:
684 return rc;
685 }
686
687 static int
688 rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
689 struct rpcrdma_ia *ia)
690 {
691 struct rdma_cm_id *id, *old;
692 int err, rc;
693
694 trace_xprtrdma_reconnect(r_xprt);
695
696 rpcrdma_ep_disconnect(ep, ia);
697
698 rc = -EHOSTUNREACH;
699 id = rpcrdma_create_id(r_xprt, ia);
700 if (IS_ERR(id))
701 goto out;
702
703 /* As long as the new ID points to the same device as the
704 * old ID, we can reuse the transport's existing PD and all
705 * previously allocated MRs. Also, the same device means
706 * the transport's previous DMA mappings are still valid.
707 *
708 * This is a sanity check only. There should be no way these
709 * point to two different devices here.
710 */
711 old = id;
712 rc = -ENETUNREACH;
713 if (ia->ri_device != id->device) {
714 pr_err("rpcrdma: can't reconnect on different device!\n");
715 goto out_destroy;
716 }
717
718 err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
719 if (err) {
720 dprintk("RPC: %s: rdma_create_qp returned %d\n",
721 __func__, err);
722 goto out_destroy;
723 }
724
725 /* Atomically replace the transport's ID and QP. */
726 rc = 0;
727 old = ia->ri_id;
728 ia->ri_id = id;
729 rdma_destroy_qp(old);
730
731 out_destroy:
732 rdma_destroy_id(old);
733 out:
734 return rc;
735 }
736
737 /*
738 * Connect unconnected endpoint.
739 */
740 int
741 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
742 {
743 struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
744 rx_ia);
745 unsigned int extras;
746 int rc;
747
748 retry:
749 switch (ep->rep_connected) {
750 case 0:
751 dprintk("RPC: %s: connecting...\n", __func__);
752 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
753 if (rc) {
754 dprintk("RPC: %s: rdma_create_qp failed %i\n",
755 __func__, rc);
756 rc = -ENETUNREACH;
757 goto out_noupdate;
758 }
759 break;
760 case -ENODEV:
761 rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
762 if (rc)
763 goto out_noupdate;
764 break;
765 default:
766 rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
767 if (rc)
768 goto out;
769 }
770
771 ep->rep_connected = 0;
772
773 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
774 if (rc) {
775 dprintk("RPC: %s: rdma_connect() failed with %i\n",
776 __func__, rc);
777 goto out;
778 }
779
780 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
781 if (ep->rep_connected <= 0) {
782 if (ep->rep_connected == -EAGAIN)
783 goto retry;
784 rc = ep->rep_connected;
785 goto out;
786 }
787
788 dprintk("RPC: %s: connected\n", __func__);
789 extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
790 if (extras)
791 rpcrdma_ep_post_extra_recv(r_xprt, extras);
792
793 out:
794 if (rc)
795 ep->rep_connected = rc;
796
797 out_noupdate:
798 return rc;
799 }
800
801 /*
802 * rpcrdma_ep_disconnect
803 *
804 * This is separate from destroy to facilitate the ability
805 * to reconnect without recreating the endpoint.
806 *
807 * This call is not reentrant, and must not be made in parallel
808 * on the same endpoint.
809 */
810 void
811 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
812 {
813 int rc;
814
815 rc = rdma_disconnect(ia->ri_id);
816 if (!rc)
817 /* returns without wait if not connected */
818 wait_event_interruptible(ep->rep_connect_wait,
819 ep->rep_connected != 1);
820 else
821 ep->rep_connected = rc;
822 trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt,
823 rx_ep), rc);
824
825 ib_drain_qp(ia->ri_id->qp);
826 }
827
828 /* Fixed-size circular FIFO queue. This implementation is wait-free and
829 * lock-free.
830 *
831 * Consumer is the code path that posts Sends. This path dequeues a
832 * sendctx for use by a Send operation. Multiple consumer threads
833 * are serialized by the RPC transport lock, which allows only one
834 * ->send_request call at a time.
835 *
836 * Producer is the code path that handles Send completions. This path
837 * enqueues a sendctx that has been completed. Multiple producer
838 * threads are serialized by the ib_poll_cq() function.
839 */
840
841 /* rpcrdma_sendctxs_destroy() assumes caller has already quiesced
842 * queue activity, and ib_drain_qp has flushed all remaining Send
843 * requests.
844 */
845 static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf)
846 {
847 unsigned long i;
848
849 for (i = 0; i <= buf->rb_sc_last; i++)
850 kfree(buf->rb_sc_ctxs[i]);
851 kfree(buf->rb_sc_ctxs);
852 }
853
854 static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia)
855 {
856 struct rpcrdma_sendctx *sc;
857
858 sc = kzalloc(sizeof(*sc) +
859 ia->ri_max_send_sges * sizeof(struct ib_sge),
860 GFP_KERNEL);
861 if (!sc)
862 return NULL;
863
864 sc->sc_wr.wr_cqe = &sc->sc_cqe;
865 sc->sc_wr.sg_list = sc->sc_sges;
866 sc->sc_wr.opcode = IB_WR_SEND;
867 sc->sc_cqe.done = rpcrdma_wc_send;
868 return sc;
869 }
870
871 static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
872 {
873 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
874 struct rpcrdma_sendctx *sc;
875 unsigned long i;
876
877 /* Maximum number of concurrent outstanding Send WRs. Capping
878 * the circular queue size stops Send Queue overflow by causing
879 * the ->send_request call to fail temporarily before too many
880 * Sends are posted.
881 */
882 i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS;
883 dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i);
884 buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
885 if (!buf->rb_sc_ctxs)
886 return -ENOMEM;
887
888 buf->rb_sc_last = i - 1;
889 for (i = 0; i <= buf->rb_sc_last; i++) {
890 sc = rpcrdma_sendctx_create(&r_xprt->rx_ia);
891 if (!sc)
892 goto out_destroy;
893
894 sc->sc_xprt = r_xprt;
895 buf->rb_sc_ctxs[i] = sc;
896 }
897
898 return 0;
899
900 out_destroy:
901 rpcrdma_sendctxs_destroy(buf);
902 return -ENOMEM;
903 }
904
905 /* The sendctx queue is not guaranteed to have a size that is a
906 * power of two, thus the helpers in circ_buf.h cannot be used.
907 * The other option is to use modulus (%), which can be expensive.
908 */
909 static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf,
910 unsigned long item)
911 {
912 return likely(item < buf->rb_sc_last) ? item + 1 : 0;
913 }
914
915 /**
916 * rpcrdma_sendctx_get_locked - Acquire a send context
917 * @buf: transport buffers from which to acquire an unused context
918 *
919 * Returns pointer to a free send completion context; or NULL if
920 * the queue is empty.
921 *
922 * Usage: Called to acquire an SGE array before preparing a Send WR.
923 *
924 * The caller serializes calls to this function (per rpcrdma_buffer),
925 * and provides an effective memory barrier that flushes the new value
926 * of rb_sc_head.
927 */
928 struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf)
929 {
930 struct rpcrdma_xprt *r_xprt;
931 struct rpcrdma_sendctx *sc;
932 unsigned long next_head;
933
934 next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head);
935
936 if (next_head == READ_ONCE(buf->rb_sc_tail))
937 goto out_emptyq;
938
939 /* ORDER: item must be accessed _before_ head is updated */
940 sc = buf->rb_sc_ctxs[next_head];
941
942 /* Releasing the lock in the caller acts as a memory
943 * barrier that flushes rb_sc_head.
944 */
945 buf->rb_sc_head = next_head;
946
947 return sc;
948
949 out_emptyq:
950 /* The queue is "empty" if there have not been enough Send
951 * completions recently. This is a sign the Send Queue is
952 * backing up. Cause the caller to pause and try again.
953 */
954 dprintk("RPC: %s: empty sendctx queue\n", __func__);
955 r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf);
956 r_xprt->rx_stats.empty_sendctx_q++;
957 return NULL;
958 }
959
960 /**
961 * rpcrdma_sendctx_put_locked - Release a send context
962 * @sc: send context to release
963 *
964 * Usage: Called from Send completion to return a sendctxt
965 * to the queue.
966 *
967 * The caller serializes calls to this function (per rpcrdma_buffer).
968 */
969 void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
970 {
971 struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf;
972 unsigned long next_tail;
973
974 /* Unmap SGEs of previously completed by unsignaled
975 * Sends by walking up the queue until @sc is found.
976 */
977 next_tail = buf->rb_sc_tail;
978 do {
979 next_tail = rpcrdma_sendctx_next(buf, next_tail);
980
981 /* ORDER: item must be accessed _before_ tail is updated */
982 rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]);
983
984 } while (buf->rb_sc_ctxs[next_tail] != sc);
985
986 /* Paired with READ_ONCE */
987 smp_store_release(&buf->rb_sc_tail, next_tail);
988 }
989
990 static void
991 rpcrdma_mr_recovery_worker(struct work_struct *work)
992 {
993 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
994 rb_recovery_worker.work);
995 struct rpcrdma_mr *mr;
996
997 spin_lock(&buf->rb_recovery_lock);
998 while (!list_empty(&buf->rb_stale_mrs)) {
999 mr = rpcrdma_mr_pop(&buf->rb_stale_mrs);
1000 spin_unlock(&buf->rb_recovery_lock);
1001
1002 trace_xprtrdma_recover_mr(mr);
1003 mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr);
1004
1005 spin_lock(&buf->rb_recovery_lock);
1006 }
1007 spin_unlock(&buf->rb_recovery_lock);
1008 }
1009
1010 void
1011 rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr)
1012 {
1013 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
1014 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1015
1016 spin_lock(&buf->rb_recovery_lock);
1017 rpcrdma_mr_push(mr, &buf->rb_stale_mrs);
1018 spin_unlock(&buf->rb_recovery_lock);
1019
1020 schedule_delayed_work(&buf->rb_recovery_worker, 0);
1021 }
1022
1023 static void
1024 rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
1025 {
1026 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1027 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1028 unsigned int count;
1029 LIST_HEAD(free);
1030 LIST_HEAD(all);
1031
1032 for (count = 0; count < 3; count++) {
1033 struct rpcrdma_mr *mr;
1034 int rc;
1035
1036 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1037 if (!mr)
1038 break;
1039
1040 rc = ia->ri_ops->ro_init_mr(ia, mr);
1041 if (rc) {
1042 kfree(mr);
1043 break;
1044 }
1045
1046 mr->mr_xprt = r_xprt;
1047
1048 list_add(&mr->mr_list, &free);
1049 list_add(&mr->mr_all, &all);
1050 }
1051
1052 spin_lock(&buf->rb_mrlock);
1053 list_splice(&free, &buf->rb_mrs);
1054 list_splice(&all, &buf->rb_all);
1055 r_xprt->rx_stats.mrs_allocated += count;
1056 spin_unlock(&buf->rb_mrlock);
1057 trace_xprtrdma_createmrs(r_xprt, count);
1058
1059 xprt_write_space(&r_xprt->rx_xprt);
1060 }
1061
1062 static void
1063 rpcrdma_mr_refresh_worker(struct work_struct *work)
1064 {
1065 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
1066 rb_refresh_worker.work);
1067 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
1068 rx_buf);
1069
1070 rpcrdma_mrs_create(r_xprt);
1071 }
1072
1073 struct rpcrdma_req *
1074 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1075 {
1076 struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
1077 struct rpcrdma_regbuf *rb;
1078 struct rpcrdma_req *req;
1079
1080 req = kzalloc(sizeof(*req), GFP_KERNEL);
1081 if (req == NULL)
1082 return ERR_PTR(-ENOMEM);
1083
1084 rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
1085 DMA_TO_DEVICE, GFP_KERNEL);
1086 if (IS_ERR(rb)) {
1087 kfree(req);
1088 return ERR_PTR(-ENOMEM);
1089 }
1090 req->rl_rdmabuf = rb;
1091 xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
1092 req->rl_buffer = buffer;
1093 INIT_LIST_HEAD(&req->rl_registered);
1094
1095 spin_lock(&buffer->rb_reqslock);
1096 list_add(&req->rl_all, &buffer->rb_allreqs);
1097 spin_unlock(&buffer->rb_reqslock);
1098 return req;
1099 }
1100
1101 /**
1102 * rpcrdma_create_rep - Allocate an rpcrdma_rep object
1103 * @r_xprt: controlling transport
1104 *
1105 * Returns 0 on success or a negative errno on failure.
1106 */
1107 int
1108 rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1109 {
1110 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1111 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1112 struct rpcrdma_rep *rep;
1113 int rc;
1114
1115 rc = -ENOMEM;
1116 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1117 if (rep == NULL)
1118 goto out;
1119
1120 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
1121 DMA_FROM_DEVICE, GFP_KERNEL);
1122 if (IS_ERR(rep->rr_rdmabuf)) {
1123 rc = PTR_ERR(rep->rr_rdmabuf);
1124 goto out_free;
1125 }
1126 xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base,
1127 rdmab_length(rep->rr_rdmabuf));
1128
1129 rep->rr_cqe.done = rpcrdma_wc_receive;
1130 rep->rr_rxprt = r_xprt;
1131 INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion);
1132 rep->rr_recv_wr.next = NULL;
1133 rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
1134 rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
1135 rep->rr_recv_wr.num_sge = 1;
1136
1137 spin_lock(&buf->rb_lock);
1138 list_add(&rep->rr_list, &buf->rb_recv_bufs);
1139 spin_unlock(&buf->rb_lock);
1140 return 0;
1141
1142 out_free:
1143 kfree(rep);
1144 out:
1145 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1146 __func__, rc);
1147 return rc;
1148 }
1149
1150 int
1151 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1152 {
1153 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1154 int i, rc;
1155
1156 buf->rb_max_requests = r_xprt->rx_data.max_requests;
1157 buf->rb_bc_srv_max_requests = 0;
1158 spin_lock_init(&buf->rb_mrlock);
1159 spin_lock_init(&buf->rb_lock);
1160 spin_lock_init(&buf->rb_recovery_lock);
1161 INIT_LIST_HEAD(&buf->rb_mrs);
1162 INIT_LIST_HEAD(&buf->rb_all);
1163 INIT_LIST_HEAD(&buf->rb_stale_mrs);
1164 INIT_DELAYED_WORK(&buf->rb_refresh_worker,
1165 rpcrdma_mr_refresh_worker);
1166 INIT_DELAYED_WORK(&buf->rb_recovery_worker,
1167 rpcrdma_mr_recovery_worker);
1168
1169 rpcrdma_mrs_create(r_xprt);
1170
1171 INIT_LIST_HEAD(&buf->rb_send_bufs);
1172 INIT_LIST_HEAD(&buf->rb_allreqs);
1173 spin_lock_init(&buf->rb_reqslock);
1174 for (i = 0; i < buf->rb_max_requests; i++) {
1175 struct rpcrdma_req *req;
1176
1177 req = rpcrdma_create_req(r_xprt);
1178 if (IS_ERR(req)) {
1179 dprintk("RPC: %s: request buffer %d alloc"
1180 " failed\n", __func__, i);
1181 rc = PTR_ERR(req);
1182 goto out;
1183 }
1184 list_add(&req->rl_list, &buf->rb_send_bufs);
1185 }
1186
1187 INIT_LIST_HEAD(&buf->rb_recv_bufs);
1188 for (i = 0; i <= buf->rb_max_requests; i++) {
1189 rc = rpcrdma_create_rep(r_xprt);
1190 if (rc)
1191 goto out;
1192 }
1193
1194 rc = rpcrdma_sendctxs_create(r_xprt);
1195 if (rc)
1196 goto out;
1197
1198 return 0;
1199 out:
1200 rpcrdma_buffer_destroy(buf);
1201 return rc;
1202 }
1203
1204 static struct rpcrdma_req *
1205 rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf)
1206 {
1207 struct rpcrdma_req *req;
1208
1209 req = list_first_entry(&buf->rb_send_bufs,
1210 struct rpcrdma_req, rl_list);
1211 list_del_init(&req->rl_list);
1212 return req;
1213 }
1214
1215 static struct rpcrdma_rep *
1216 rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
1217 {
1218 struct rpcrdma_rep *rep;
1219
1220 rep = list_first_entry(&buf->rb_recv_bufs,
1221 struct rpcrdma_rep, rr_list);
1222 list_del(&rep->rr_list);
1223 return rep;
1224 }
1225
1226 static void
1227 rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
1228 {
1229 rpcrdma_free_regbuf(rep->rr_rdmabuf);
1230 kfree(rep);
1231 }
1232
1233 void
1234 rpcrdma_destroy_req(struct rpcrdma_req *req)
1235 {
1236 rpcrdma_free_regbuf(req->rl_recvbuf);
1237 rpcrdma_free_regbuf(req->rl_sendbuf);
1238 rpcrdma_free_regbuf(req->rl_rdmabuf);
1239 kfree(req);
1240 }
1241
1242 static void
1243 rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf)
1244 {
1245 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
1246 rx_buf);
1247 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1248 struct rpcrdma_mr *mr;
1249 unsigned int count;
1250
1251 count = 0;
1252 spin_lock(&buf->rb_mrlock);
1253 while (!list_empty(&buf->rb_all)) {
1254 mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all);
1255 list_del(&mr->mr_all);
1256
1257 spin_unlock(&buf->rb_mrlock);
1258
1259 /* Ensure MW is not on any rl_registered list */
1260 if (!list_empty(&mr->mr_list))
1261 list_del(&mr->mr_list);
1262
1263 ia->ri_ops->ro_release_mr(mr);
1264 count++;
1265 spin_lock(&buf->rb_mrlock);
1266 }
1267 spin_unlock(&buf->rb_mrlock);
1268 r_xprt->rx_stats.mrs_allocated = 0;
1269
1270 dprintk("RPC: %s: released %u MRs\n", __func__, count);
1271 }
1272
1273 void
1274 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1275 {
1276 cancel_delayed_work_sync(&buf->rb_recovery_worker);
1277 cancel_delayed_work_sync(&buf->rb_refresh_worker);
1278
1279 rpcrdma_sendctxs_destroy(buf);
1280
1281 while (!list_empty(&buf->rb_recv_bufs)) {
1282 struct rpcrdma_rep *rep;
1283
1284 rep = rpcrdma_buffer_get_rep_locked(buf);
1285 rpcrdma_destroy_rep(rep);
1286 }
1287 buf->rb_send_count = 0;
1288
1289 spin_lock(&buf->rb_reqslock);
1290 while (!list_empty(&buf->rb_allreqs)) {
1291 struct rpcrdma_req *req;
1292
1293 req = list_first_entry(&buf->rb_allreqs,
1294 struct rpcrdma_req, rl_all);
1295 list_del(&req->rl_all);
1296
1297 spin_unlock(&buf->rb_reqslock);
1298 rpcrdma_destroy_req(req);
1299 spin_lock(&buf->rb_reqslock);
1300 }
1301 spin_unlock(&buf->rb_reqslock);
1302 buf->rb_recv_count = 0;
1303
1304 rpcrdma_mrs_destroy(buf);
1305 }
1306
1307 /**
1308 * rpcrdma_mr_get - Allocate an rpcrdma_mr object
1309 * @r_xprt: controlling transport
1310 *
1311 * Returns an initialized rpcrdma_mr or NULL if no free
1312 * rpcrdma_mr objects are available.
1313 */
1314 struct rpcrdma_mr *
1315 rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
1316 {
1317 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1318 struct rpcrdma_mr *mr = NULL;
1319
1320 spin_lock(&buf->rb_mrlock);
1321 if (!list_empty(&buf->rb_mrs))
1322 mr = rpcrdma_mr_pop(&buf->rb_mrs);
1323 spin_unlock(&buf->rb_mrlock);
1324
1325 if (!mr)
1326 goto out_nomrs;
1327 return mr;
1328
1329 out_nomrs:
1330 trace_xprtrdma_nomrs(r_xprt);
1331 if (r_xprt->rx_ep.rep_connected != -ENODEV)
1332 schedule_delayed_work(&buf->rb_refresh_worker, 0);
1333
1334 /* Allow the reply handler and refresh worker to run */
1335 cond_resched();
1336
1337 return NULL;
1338 }
1339
1340 static void
1341 __rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr)
1342 {
1343 spin_lock(&buf->rb_mrlock);
1344 rpcrdma_mr_push(mr, &buf->rb_mrs);
1345 spin_unlock(&buf->rb_mrlock);
1346 }
1347
1348 /**
1349 * rpcrdma_mr_put - Release an rpcrdma_mr object
1350 * @mr: object to release
1351 *
1352 */
1353 void
1354 rpcrdma_mr_put(struct rpcrdma_mr *mr)
1355 {
1356 __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr);
1357 }
1358
1359 /**
1360 * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it
1361 * @mr: object to release
1362 *
1363 */
1364 void
1365 rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
1366 {
1367 struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
1368
1369 trace_xprtrdma_dma_unmap(mr);
1370 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
1371 mr->mr_sg, mr->mr_nents, mr->mr_dir);
1372 __rpcrdma_mr_put(&r_xprt->rx_buf, mr);
1373 }
1374
1375 static struct rpcrdma_rep *
1376 rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers)
1377 {
1378 /* If an RPC previously completed without a reply (say, a
1379 * credential problem or a soft timeout occurs) then hold off
1380 * on supplying more Receive buffers until the number of new
1381 * pending RPCs catches up to the number of posted Receives.
1382 */
1383 if (unlikely(buffers->rb_send_count < buffers->rb_recv_count))
1384 return NULL;
1385
1386 if (unlikely(list_empty(&buffers->rb_recv_bufs)))
1387 return NULL;
1388 buffers->rb_recv_count++;
1389 return rpcrdma_buffer_get_rep_locked(buffers);
1390 }
1391
1392 /*
1393 * Get a set of request/reply buffers.
1394 *
1395 * Reply buffer (if available) is attached to send buffer upon return.
1396 */
1397 struct rpcrdma_req *
1398 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1399 {
1400 struct rpcrdma_req *req;
1401
1402 spin_lock(&buffers->rb_lock);
1403 if (list_empty(&buffers->rb_send_bufs))
1404 goto out_reqbuf;
1405 buffers->rb_send_count++;
1406 req = rpcrdma_buffer_get_req_locked(buffers);
1407 req->rl_reply = rpcrdma_buffer_get_rep(buffers);
1408 spin_unlock(&buffers->rb_lock);
1409
1410 return req;
1411
1412 out_reqbuf:
1413 spin_unlock(&buffers->rb_lock);
1414 return NULL;
1415 }
1416
1417 /*
1418 * Put request/reply buffers back into pool.
1419 * Pre-decrement counter/array index.
1420 */
1421 void
1422 rpcrdma_buffer_put(struct rpcrdma_req *req)
1423 {
1424 struct rpcrdma_buffer *buffers = req->rl_buffer;
1425 struct rpcrdma_rep *rep = req->rl_reply;
1426
1427 req->rl_reply = NULL;
1428
1429 spin_lock(&buffers->rb_lock);
1430 buffers->rb_send_count--;
1431 list_add_tail(&req->rl_list, &buffers->rb_send_bufs);
1432 if (rep) {
1433 buffers->rb_recv_count--;
1434 list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
1435 }
1436 spin_unlock(&buffers->rb_lock);
1437 }
1438
1439 /*
1440 * Recover reply buffers from pool.
1441 * This happens when recovering from disconnect.
1442 */
1443 void
1444 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1445 {
1446 struct rpcrdma_buffer *buffers = req->rl_buffer;
1447
1448 spin_lock(&buffers->rb_lock);
1449 req->rl_reply = rpcrdma_buffer_get_rep(buffers);
1450 spin_unlock(&buffers->rb_lock);
1451 }
1452
1453 /*
1454 * Put reply buffers back into pool when not attached to
1455 * request. This happens in error conditions.
1456 */
1457 void
1458 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1459 {
1460 struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
1461
1462 spin_lock(&buffers->rb_lock);
1463 buffers->rb_recv_count--;
1464 list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
1465 spin_unlock(&buffers->rb_lock);
1466 }
1467
1468 /**
1469 * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
1470 * @size: size of buffer to be allocated, in bytes
1471 * @direction: direction of data movement
1472 * @flags: GFP flags
1473 *
1474 * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
1475 * can be persistently DMA-mapped for I/O.
1476 *
1477 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1478 * receiving the payload of RDMA RECV operations. During Long Calls
1479 * or Replies they may be registered externally via ro_map.
1480 */
1481 struct rpcrdma_regbuf *
1482 rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
1483 gfp_t flags)
1484 {
1485 struct rpcrdma_regbuf *rb;
1486
1487 rb = kmalloc(sizeof(*rb) + size, flags);
1488 if (rb == NULL)
1489 return ERR_PTR(-ENOMEM);
1490
1491 rb->rg_device = NULL;
1492 rb->rg_direction = direction;
1493 rb->rg_iov.length = size;
1494
1495 return rb;
1496 }
1497
1498 /**
1499 * __rpcrdma_map_regbuf - DMA-map a regbuf
1500 * @ia: controlling rpcrdma_ia
1501 * @rb: regbuf to be mapped
1502 */
1503 bool
1504 __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1505 {
1506 struct ib_device *device = ia->ri_device;
1507
1508 if (rb->rg_direction == DMA_NONE)
1509 return false;
1510
1511 rb->rg_iov.addr = ib_dma_map_single(device,
1512 (void *)rb->rg_base,
1513 rdmab_length(rb),
1514 rb->rg_direction);
1515 if (ib_dma_mapping_error(device, rdmab_addr(rb)))
1516 return false;
1517
1518 rb->rg_device = device;
1519 rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
1520 return true;
1521 }
1522
1523 static void
1524 rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
1525 {
1526 if (!rb)
1527 return;
1528
1529 if (!rpcrdma_regbuf_is_mapped(rb))
1530 return;
1531
1532 ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
1533 rdmab_length(rb), rb->rg_direction);
1534 rb->rg_device = NULL;
1535 }
1536
/**
 * rpcrdma_free_regbuf - DMA-unmap and free a regbuf
 * @rb: regbuf to be unmapped and freed; may be NULL
 */
void
rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
{
	/* Both calls below accept a NULL @rb */
	rpcrdma_dma_unmap_regbuf(rb);
	kfree(rb);
}
1547
1548 /*
1549 * Prepost any receive buffer, then post send.
1550 *
1551 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1552 */
1553 int
1554 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1555 struct rpcrdma_ep *ep,
1556 struct rpcrdma_req *req)
1557 {
1558 struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
1559 int rc;
1560
1561 if (req->rl_reply) {
1562 rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
1563 if (rc)
1564 return rc;
1565 req->rl_reply = NULL;
1566 }
1567
1568 if (!ep->rep_send_count ||
1569 test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
1570 send_wr->send_flags |= IB_SEND_SIGNALED;
1571 ep->rep_send_count = ep->rep_send_batch;
1572 } else {
1573 send_wr->send_flags &= ~IB_SEND_SIGNALED;
1574 --ep->rep_send_count;
1575 }
1576
1577 rc = ia->ri_ops->ro_send(ia, req);
1578 trace_xprtrdma_post_send(req, rc);
1579 if (rc)
1580 return -ENOTCONN;
1581 return 0;
1582 }
1583
1584 int
1585 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1586 struct rpcrdma_rep *rep)
1587 {
1588 struct ib_recv_wr *recv_wr_fail;
1589 int rc;
1590
1591 if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
1592 goto out_map;
1593 rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
1594 trace_xprtrdma_post_recv(rep, rc);
1595 if (rc)
1596 return -ENOTCONN;
1597 return 0;
1598
1599 out_map:
1600 pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
1601 return -EIO;
1602 }
1603
1604 /**
1605 * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests
1606 * @r_xprt: transport associated with these backchannel resources
1607 * @count: minimum number of incoming requests expected
1608 *
1609 * Returns zero if all requested buffers were posted, or a negative errno.
1610 */
1611 int
1612 rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
1613 {
1614 struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
1615 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1616 struct rpcrdma_rep *rep;
1617 int rc;
1618
1619 while (count--) {
1620 spin_lock(&buffers->rb_lock);
1621 if (list_empty(&buffers->rb_recv_bufs))
1622 goto out_reqbuf;
1623 rep = rpcrdma_buffer_get_rep_locked(buffers);
1624 spin_unlock(&buffers->rb_lock);
1625
1626 rc = rpcrdma_ep_post_recv(ia, rep);
1627 if (rc)
1628 goto out_rc;
1629 }
1630
1631 return 0;
1632
1633 out_reqbuf:
1634 spin_unlock(&buffers->rb_lock);
1635 trace_xprtrdma_noreps(r_xprt);
1636 return -ENOMEM;
1637
1638 out_rc:
1639 rpcrdma_recv_buffer_put(rep);
1640 return rc;
1641 }