net/sunrpc/xprtrdma/verbs.c
1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <linux/prefetch.h>
53 #include <linux/sunrpc/addr.h>
54 #include <asm/bitops.h>
55
56 #include "xprt_rdma.h"
57
58 /*
59 * Globals/Macros
60 */
61
62 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
63 # define RPCDBG_FACILITY RPCDBG_TRANS
64 #endif
65
66 /*
67 * internal functions
68 */
69
70 /*
71  * Handle replies in tasklet context, using a single, global list.
72  * The tasklet function walks the list and calls each reply's handler
73  * (rr_func), or returns the buffer to the pool if no handler is set.
74 */
75
76 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
77 static LIST_HEAD(rpcrdma_tasklets_g);
78
79 static void
80 rpcrdma_run_tasklet(unsigned long data)
81 {
82 struct rpcrdma_rep *rep;
83 void (*func)(struct rpcrdma_rep *);
84 unsigned long flags;
85
86 data = data; /* the tasklet's data argument is unused */
87 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
88 while (!list_empty(&rpcrdma_tasklets_g)) {
89 rep = list_entry(rpcrdma_tasklets_g.next,
90 struct rpcrdma_rep, rr_list);
91 list_del(&rep->rr_list);
92 func = rep->rr_func;
93 rep->rr_func = NULL;
94 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
95
96 if (func)
97 func(rep);
98 else
99 rpcrdma_recv_buffer_put(rep);
100
101 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
102 }
103 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
104 }
105
106 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
107
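/* Human-readable names for IB asynchronous events, indexed by
 * enum ib_event_type (see ASYNC_MSG below). */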
108 static const char * const async_event[] = {
109 "CQ error",
110 "QP fatal error",
111 "QP request error",
112 "QP access error",
113 "communication established",
114 "send queue drained",
115 "path migration successful",
116 "path mig error",
117 "device fatal error",
118 "port active",
119 "port error",
120 "LID change",
121 "P_key change",
122 "SM change",
123 "SRQ error",
124 "SRQ limit reached",
125 "last WQE reached",
126 "client reregister",
127 "GID change",
128 };
129
130 #define ASYNC_MSG(status) \
131 ((status) < ARRAY_SIZE(async_event) ? \
132 async_event[(status)] : "unknown async error")
133
134 static void
135 rpcrdma_schedule_tasklet(struct list_head *sched_list)
136 {
137 unsigned long flags;
138
139 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
140 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
141 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
142 tasklet_schedule(&rpcrdma_tasklet_g);
143 }
144
145 static void
146 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
147 {
148 struct rpcrdma_ep *ep = context;
149
150 pr_err("RPC: %s: %s on device %s ep %p\n",
151 __func__, ASYNC_MSG(event->event),
152 event->device->name, context);
153 if (ep->rep_connected == 1) {
154 ep->rep_connected = -EIO;
155 rpcrdma_conn_func(ep);
156 wake_up_all(&ep->rep_connect_wait);
157 }
158 }
159
160 static void
161 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
162 {
163 struct rpcrdma_ep *ep = context;
164
165 pr_err("RPC: %s: %s on device %s ep %p\n",
166 __func__, ASYNC_MSG(event->event),
167 event->device->name, context);
168 if (ep->rep_connected == 1) {
169 ep->rep_connected = -EIO;
170 rpcrdma_conn_func(ep);
171 wake_up_all(&ep->rep_connect_wait);
172 }
173 }
174
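/* Human-readable names for work completion status codes, indexed by
 * enum ib_wc_status (see COMPLETION_MSG below). */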
175 static const char * const wc_status[] = {
176 "success",
177 "local length error",
178 "local QP operation error",
179 "local EE context operation error",
180 "local protection error",
181 "WR flushed",
182 "memory management operation error",
183 "bad response error",
184 "local access error",
185 "remote invalid request error",
186 "remote access error",
187 "remote operation error",
188 "transport retry counter exceeded",
189 "RNR retry counter exceeded",
190 "local RDD violation error",
191 "remove invalid RD request",
192 "operation aborted",
193 "invalid EE context number",
194 "invalid EE context state",
195 "fatal error",
196 "response timeout error",
197 "general error",
198 };
199
200 #define COMPLETION_MSG(status) \
201 ((status) < ARRAY_SIZE(wc_status) ? \
202 wc_status[(status)] : "unexpected completion error")
203
204 static void
205 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
206 {
207 /* WARNING: Only wr_id and status are reliable at this point */
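/* Plain SENDs are posted with wr_id set to RPCRDMA_IGNORE_COMPLETION
 * (see rpcrdma_ep_post); registration-related WRs instead carry a pointer
 * to their rpcrdma_mw, whose mw_sendcompletion handler is invoked here. */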
208 if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
209 if (wc->status != IB_WC_SUCCESS &&
210 wc->status != IB_WC_WR_FLUSH_ERR)
211 pr_err("RPC: %s: SEND: %s\n",
212 __func__, COMPLETION_MSG(wc->status));
213 } else {
214 struct rpcrdma_mw *r;
215
216 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
217 r->mw_sendcompletion(wc);
218 }
219 }
220
221 static int
222 rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
223 {
224 struct ib_wc *wcs;
225 int budget, count, rc;
226
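/* Reap at most RPCRDMA_WC_BUDGET completions per upcall, RPCRDMA_POLLSIZE
 * at a time; stop early once a poll returns fewer than RPCRDMA_POLLSIZE. */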
227 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
228 do {
229 wcs = ep->rep_send_wcs;
230
231 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
232 if (rc <= 0)
233 return rc;
234
235 count = rc;
236 while (count-- > 0)
237 rpcrdma_sendcq_process_wc(wcs++);
238 } while (rc == RPCRDMA_POLLSIZE && --budget);
239 return 0;
240 }
241
242 /*
243 * Handle send, fast_reg_mr, and local_inv completions.
244 *
245 * Send events are typically suppressed and thus do not result
246 * in an upcall. Occasionally one is signaled, however. This
247 * prevents the provider's completion queue from wrapping and
248 * losing a completion.
249 */
250 static void
251 rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
252 {
253 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
254 int rc;
255
256 rc = rpcrdma_sendcq_poll(cq, ep);
257 if (rc) {
258 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
259 __func__, rc);
260 return;
261 }
262
263 rc = ib_req_notify_cq(cq,
264 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
265 if (rc == 0)
266 return;
267 if (rc < 0) {
268 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
269 __func__, rc);
270 return;
271 }
272
273 rpcrdma_sendcq_poll(cq, ep);
274 }
275
276 static void
277 rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
278 {
279 struct rpcrdma_rep *rep =
280 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
281
282 /* WARNING: Only wr_id and status are reliable at this point */
283 if (wc->status != IB_WC_SUCCESS)
284 goto out_fail;
285
286 /* status == SUCCESS means all fields in wc are trustworthy */
287 if (wc->opcode != IB_WC_RECV)
288 return;
289
290 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
291 __func__, rep, wc->byte_len);
292
293 rep->rr_len = wc->byte_len;
294 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
295 rdmab_addr(rep->rr_rdmabuf),
296 rep->rr_len, DMA_FROM_DEVICE);
297 prefetch(rdmab_to_msg(rep->rr_rdmabuf));
298
299 out_schedule:
300 list_add_tail(&rep->rr_list, sched_list);
301 return;
302 out_fail:
303 if (wc->status != IB_WC_WR_FLUSH_ERR)
304 pr_err("RPC: %s: rep %p: %s\n",
305 __func__, rep, COMPLETION_MSG(wc->status));
306 rep->rr_len = ~0U;
307 goto out_schedule;
308 }
309
310 static int
311 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
312 {
313 struct list_head sched_list;
314 struct ib_wc *wcs;
315 int budget, count, rc;
316
317 INIT_LIST_HEAD(&sched_list);
318 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
319 do {
320 wcs = ep->rep_recv_wcs;
321
322 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
323 if (rc <= 0)
324 goto out_schedule;
325
326 count = rc;
327 while (count-- > 0)
328 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
329 } while (rc == RPCRDMA_POLLSIZE && --budget);
330 rc = 0;
331
332 out_schedule:
333 rpcrdma_schedule_tasklet(&sched_list);
334 return rc;
335 }
336
337 /*
338 * Handle receive completions.
339 *
340  * It is reentrant, but it processes completions one at a time to
341  * preserve the ordering of receives and the server's credit accounting.
342 *
343 * It is the responsibility of the scheduled tasklet to return
344 * recv buffers to the pool. NOTE: this affects synchronization of
345 * connection shutdown. That is, the structures required for
346 * the completion of the reply handler must remain intact until
347 * all memory has been reclaimed.
348 */
349 static void
350 rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
351 {
352 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
353 int rc;
354
355 rc = rpcrdma_recvcq_poll(cq, ep);
356 if (rc) {
357 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
358 __func__, rc);
359 return;
360 }
361
362 rc = ib_req_notify_cq(cq,
363 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
364 if (rc == 0)
365 return;
366 if (rc < 0) {
367 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
368 __func__, rc);
369 return;
370 }
371
372 rpcrdma_recvcq_poll(cq, ep);
373 }
374
375 static void
376 rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
377 {
378 struct ib_wc wc;
379 LIST_HEAD(sched_list);
380
381 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
382 rpcrdma_recvcq_process_wc(&wc, &sched_list);
383 if (!list_empty(&sched_list))
384 rpcrdma_schedule_tasklet(&sched_list);
385 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
386 rpcrdma_sendcq_process_wc(&wc);
387 }
388
389 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
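/* Human-readable names for connection manager events, indexed by
 * enum rdma_cm_event_type (see CONNECTION_MSG below). */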
390 static const char * const conn[] = {
391 "address resolved",
392 "address error",
393 "route resolved",
394 "route error",
395 "connect request",
396 "connect response",
397 "connect error",
398 "unreachable",
399 "rejected",
400 "established",
401 "disconnected",
402 "device removal",
403 "multicast join",
404 "multicast error",
405 "address change",
406 "timewait exit",
407 };
408
409 #define CONNECTION_MSG(status) \
410 ((status) < ARRAY_SIZE(conn) ? \
411 conn[(status)] : "unrecognized connection error")
412 #endif
413
414 static int
415 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
416 {
417 struct rpcrdma_xprt *xprt = id->context;
418 struct rpcrdma_ia *ia = &xprt->rx_ia;
419 struct rpcrdma_ep *ep = &xprt->rx_ep;
420 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
421 struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
422 #endif
423 struct ib_qp_attr *attr = &ia->ri_qp_attr;
424 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
425 int connstate = 0;
426
427 switch (event->event) {
428 case RDMA_CM_EVENT_ADDR_RESOLVED:
429 case RDMA_CM_EVENT_ROUTE_RESOLVED:
430 ia->ri_async_rc = 0;
431 complete(&ia->ri_done);
432 break;
433 case RDMA_CM_EVENT_ADDR_ERROR:
434 ia->ri_async_rc = -EHOSTUNREACH;
435 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
436 __func__, ep);
437 complete(&ia->ri_done);
438 break;
439 case RDMA_CM_EVENT_ROUTE_ERROR:
440 ia->ri_async_rc = -ENETUNREACH;
441 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
442 __func__, ep);
443 complete(&ia->ri_done);
444 break;
445 case RDMA_CM_EVENT_ESTABLISHED:
446 connstate = 1;
447 ib_query_qp(ia->ri_id->qp, attr,
448 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
449 iattr);
450 dprintk("RPC: %s: %d responder resources"
451 " (%d initiator)\n",
452 __func__, attr->max_dest_rd_atomic,
453 attr->max_rd_atomic);
454 goto connected;
455 case RDMA_CM_EVENT_CONNECT_ERROR:
456 connstate = -ENOTCONN;
457 goto connected;
458 case RDMA_CM_EVENT_UNREACHABLE:
459 connstate = -ENETDOWN;
460 goto connected;
461 case RDMA_CM_EVENT_REJECTED:
462 connstate = -ECONNREFUSED;
463 goto connected;
464 case RDMA_CM_EVENT_DISCONNECTED:
465 connstate = -ECONNABORTED;
466 goto connected;
467 case RDMA_CM_EVENT_DEVICE_REMOVAL:
468 connstate = -ENODEV;
469 connected:
470 dprintk("RPC: %s: %sconnected\n",
471 __func__, connstate > 0 ? "" : "dis");
472 ep->rep_connected = connstate;
473 rpcrdma_conn_func(ep);
474 wake_up_all(&ep->rep_connect_wait);
475 /*FALLTHROUGH*/
476 default:
477 dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
478 __func__, sap, rpc_get_port(sap), ep,
479 CONNECTION_MSG(event->event));
480 break;
481 }
482
483 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
484 if (connstate == 1) {
485 int ird = attr->max_dest_rd_atomic;
486 int tird = ep->rep_remote_cma.responder_resources;
487
488 pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
489 sap, rpc_get_port(sap),
490 ia->ri_id->device->name,
491 ia->ri_ops->ro_displayname,
492 xprt->rx_buf.rb_max_requests,
493 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
494 } else if (connstate < 0) {
495 pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
496 sap, rpc_get_port(sap), connstate);
497 }
498 #endif
499
500 return 0;
501 }
502
503 static struct rdma_cm_id *
504 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
505 struct rpcrdma_ia *ia, struct sockaddr *addr)
506 {
507 struct rdma_cm_id *id;
508 int rc;
509
510 init_completion(&ia->ri_done);
511
512 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
513 if (IS_ERR(id)) {
514 rc = PTR_ERR(id);
515 dprintk("RPC: %s: rdma_create_id() failed %i\n",
516 __func__, rc);
517 return id;
518 }
519
520 ia->ri_async_rc = -ETIMEDOUT;
521 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
522 if (rc) {
523 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
524 __func__, rc);
525 goto out;
526 }
527 wait_for_completion_interruptible_timeout(&ia->ri_done,
528 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
529 rc = ia->ri_async_rc;
530 if (rc)
531 goto out;
532
533 ia->ri_async_rc = -ETIMEDOUT;
534 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
535 if (rc) {
536 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
537 __func__, rc);
538 goto out;
539 }
540 wait_for_completion_interruptible_timeout(&ia->ri_done,
541 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
542 rc = ia->ri_async_rc;
543 if (rc)
544 goto out;
545
546 return id;
547
548 out:
549 rdma_destroy_id(id);
550 return ERR_PTR(rc);
551 }
552
553 /*
554  * Drain any CQ prior to teardown.
555 */
556 static void
557 rpcrdma_clean_cq(struct ib_cq *cq)
558 {
559 struct ib_wc wc;
560 int count = 0;
561
562 while (1 == ib_poll_cq(cq, 1, &wc))
563 ++count;
564
565 if (count)
566 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
567 __func__, count, wc.opcode);
568 }
569
570 /*
571 * Exported functions.
572 */
573
574 /*
575 * Open and initialize an Interface Adapter.
576 * o initializes fields of struct rpcrdma_ia, including
577 * interface and provider attributes and protection zone.
578 */
579 int
580 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
581 {
582 int rc, mem_priv;
583 struct rpcrdma_ia *ia = &xprt->rx_ia;
584 struct ib_device_attr *devattr = &ia->ri_devattr;
585
586 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
587 if (IS_ERR(ia->ri_id)) {
588 rc = PTR_ERR(ia->ri_id);
589 goto out1;
590 }
591
592 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
593 if (IS_ERR(ia->ri_pd)) {
594 rc = PTR_ERR(ia->ri_pd);
595 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
596 __func__, rc);
597 goto out2;
598 }
599
600 rc = ib_query_device(ia->ri_id->device, devattr);
601 if (rc) {
602 dprintk("RPC: %s: ib_query_device failed %d\n",
603 __func__, rc);
604 goto out3;
605 }
606
607 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
608 ia->ri_have_dma_lkey = 1;
609 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
610 }
611
612 if (memreg == RPCRDMA_FRMR) {
613 /* Requires both frmr reg and local dma lkey */
614 if (((devattr->device_cap_flags &
615 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
616 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
617 (devattr->max_fast_reg_page_list_len == 0)) {
618 dprintk("RPC: %s: FRMR registration "
619 "not supported by HCA\n", __func__);
620 memreg = RPCRDMA_MTHCAFMR;
621 }
622 }
623 if (memreg == RPCRDMA_MTHCAFMR) {
624 if (!ia->ri_id->device->alloc_fmr) {
625 dprintk("RPC: %s: MTHCAFMR registration "
626 "not supported by HCA\n", __func__);
627 memreg = RPCRDMA_ALLPHYSICAL;
628 }
629 }
630
631 /*
632 * Optionally obtain an underlying physical identity mapping in
633 * order to do a memory window-based bind. This base registration
634 * is protected from remote access - that is enabled only by binding
635 * for the specific bytes targeted during each RPC operation, and
636 * revoked after the corresponding completion similar to a storage
637 * adapter.
638 */
639 switch (memreg) {
640 case RPCRDMA_FRMR:
641 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
642 break;
643 case RPCRDMA_ALLPHYSICAL:
644 ia->ri_ops = &rpcrdma_physical_memreg_ops;
645 mem_priv = IB_ACCESS_LOCAL_WRITE |
646 IB_ACCESS_REMOTE_WRITE |
647 IB_ACCESS_REMOTE_READ;
648 goto register_setup;
649 case RPCRDMA_MTHCAFMR:
650 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
651 if (ia->ri_have_dma_lkey)
652 break;
653 mem_priv = IB_ACCESS_LOCAL_WRITE;
654 register_setup:
655 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
656 if (IS_ERR(ia->ri_bind_mem)) {
657 printk(KERN_ALERT "%s: ib_get_dma_mr for "
658 "phys register failed with %lX\n",
659 __func__, PTR_ERR(ia->ri_bind_mem));
660 rc = -ENOMEM;
661 goto out3;
662 }
663 break;
664 default:
665 printk(KERN_ERR "RPC: Unsupported memory "
666 "registration mode: %d\n", memreg);
667 rc = -ENOMEM;
668 goto out3;
669 }
670 dprintk("RPC: %s: memory registration strategy is '%s'\n",
671 __func__, ia->ri_ops->ro_displayname);
672
673 /* Else will do memory reg/dereg for each chunk */
674 ia->ri_memreg_strategy = memreg;
675
676 rwlock_init(&ia->ri_qplock);
677 return 0;
678
679 out3:
680 ib_dealloc_pd(ia->ri_pd);
681 ia->ri_pd = NULL;
682 out2:
683 rdma_destroy_id(ia->ri_id);
684 ia->ri_id = NULL;
685 out1:
686 return rc;
687 }
688
689 /*
690 * Clean up/close an IA.
691 * o if event handles and PD have been initialized, free them.
692 * o close the IA
693 */
694 void
695 rpcrdma_ia_close(struct rpcrdma_ia *ia)
696 {
697 int rc;
698
699 dprintk("RPC: %s: entering\n", __func__);
700 if (ia->ri_bind_mem != NULL) {
701 rc = ib_dereg_mr(ia->ri_bind_mem);
702 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
703 __func__, rc);
704 }
705 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
706 if (ia->ri_id->qp)
707 rdma_destroy_qp(ia->ri_id);
708 rdma_destroy_id(ia->ri_id);
709 ia->ri_id = NULL;
710 }
711 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
712 rc = ib_dealloc_pd(ia->ri_pd);
713 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
714 __func__, rc);
715 }
716 }
717
718 /*
719 * Create unconnected endpoint.
720 */
721 int
722 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
723 struct rpcrdma_create_data_internal *cdata)
724 {
725 struct ib_device_attr *devattr = &ia->ri_devattr;
726 struct ib_cq *sendcq, *recvcq;
727 int rc, err;
728
729 /* check provider's send/recv wr limits */
730 if (cdata->max_requests > devattr->max_qp_wr)
731 cdata->max_requests = devattr->max_qp_wr;
732
733 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
734 ep->rep_attr.qp_context = ep;
735 ep->rep_attr.srq = NULL;
736 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
737 rc = ia->ri_ops->ro_open(ia, ep, cdata);
738 if (rc)
739 return rc;
740 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
741 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
742 ep->rep_attr.cap.max_recv_sge = 1;
743 ep->rep_attr.cap.max_inline_data = 0;
744 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
745 ep->rep_attr.qp_type = IB_QPT_RC;
746 ep->rep_attr.port_num = ~0;
747
748 if (cdata->padding) {
749 ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
750 GFP_KERNEL);
751 if (IS_ERR(ep->rep_padbuf))
752 return PTR_ERR(ep->rep_padbuf);
753 } else
754 ep->rep_padbuf = NULL;
755
756 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
757 "iovs: send %d recv %d\n",
758 __func__,
759 ep->rep_attr.cap.max_send_wr,
760 ep->rep_attr.cap.max_recv_wr,
761 ep->rep_attr.cap.max_send_sge,
762 ep->rep_attr.cap.max_recv_sge);
763
764 /* set trigger for requesting send completion */
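/* rep_cqinit is the number of SENDs that may be posted unsignaled before
 * one signaled SEND is required; the DECR_CQCOUNT/INIT_CQCOUNT accounting
 * in rpcrdma_ep_post enforces that interval. */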
765 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
766 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
767 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
768 else if (ep->rep_cqinit <= 2)
769 ep->rep_cqinit = 0;
770 INIT_CQCOUNT(ep);
771 init_waitqueue_head(&ep->rep_connect_wait);
772 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
773
774 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
775 rpcrdma_cq_async_error_upcall, ep,
776 ep->rep_attr.cap.max_send_wr + 1, 0);
777 if (IS_ERR(sendcq)) {
778 rc = PTR_ERR(sendcq);
779 dprintk("RPC: %s: failed to create send CQ: %i\n",
780 __func__, rc);
781 goto out1;
782 }
783
784 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
785 if (rc) {
786 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
787 __func__, rc);
788 goto out2;
789 }
790
791 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
792 rpcrdma_cq_async_error_upcall, ep,
793 ep->rep_attr.cap.max_recv_wr + 1, 0);
794 if (IS_ERR(recvcq)) {
795 rc = PTR_ERR(recvcq);
796 dprintk("RPC: %s: failed to create recv CQ: %i\n",
797 __func__, rc);
798 goto out2;
799 }
800
801 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
802 if (rc) {
803 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
804 __func__, rc);
805 ib_destroy_cq(recvcq);
806 goto out2;
807 }
808
809 ep->rep_attr.send_cq = sendcq;
810 ep->rep_attr.recv_cq = recvcq;
811
812 /* Initialize cma parameters */
813
814 /* RPC/RDMA does not use private data */
815 ep->rep_remote_cma.private_data = NULL;
816 ep->rep_remote_cma.private_data_len = 0;
817
818 /* Client offers RDMA Read but does not initiate */
819 ep->rep_remote_cma.initiator_depth = 0;
820 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
821 ep->rep_remote_cma.responder_resources = 32;
822 else
823 ep->rep_remote_cma.responder_resources =
824 devattr->max_qp_rd_atom;
825
826 ep->rep_remote_cma.retry_count = 7;
827 ep->rep_remote_cma.flow_control = 0;
828 ep->rep_remote_cma.rnr_retry_count = 0;
829
830 return 0;
831
832 out2:
833 err = ib_destroy_cq(sendcq);
834 if (err)
835 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
836 __func__, err);
837 out1:
838 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
839 return rc;
840 }
841
842 /*
843 * rpcrdma_ep_destroy
844 *
845 * Disconnect and destroy endpoint. After this, the only
846 * valid operations on the ep are to free it (if dynamically
847 * allocated) or re-create it.
848 */
849 void
850 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
851 {
852 int rc;
853
854 dprintk("RPC: %s: entering, connected is %d\n",
855 __func__, ep->rep_connected);
856
857 cancel_delayed_work_sync(&ep->rep_connect_worker);
858
859 if (ia->ri_id->qp) {
860 rpcrdma_ep_disconnect(ep, ia);
861 rdma_destroy_qp(ia->ri_id);
862 ia->ri_id->qp = NULL;
863 }
864
865 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
866
867 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
868 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
869 if (rc)
870 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
871 __func__, rc);
872
873 rpcrdma_clean_cq(ep->rep_attr.send_cq);
874 rc = ib_destroy_cq(ep->rep_attr.send_cq);
875 if (rc)
876 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
877 __func__, rc);
878 }
879
880 /*
881 * Connect unconnected endpoint.
882 */
883 int
884 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
885 {
886 struct rdma_cm_id *id, *old;
887 int rc = 0;
888 int retry_count = 0;
889
890 if (ep->rep_connected != 0) {
891 struct rpcrdma_xprt *xprt;
892 retry:
893 dprintk("RPC: %s: reconnecting...\n", __func__);
894
895 rpcrdma_ep_disconnect(ep, ia);
896 rpcrdma_flush_cqs(ep);
897
898 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
899 ia->ri_ops->ro_reset(xprt);
900
901 id = rpcrdma_create_id(xprt, ia,
902 (struct sockaddr *)&xprt->rx_data.addr);
903 if (IS_ERR(id)) {
904 rc = -EHOSTUNREACH;
905 goto out;
906 }
907 /* TEMP TEMP TEMP - fail if new device:
908 * Deregister/remarshal *all* requests!
909 * Close and recreate adapter, pd, etc!
910 * Re-determine all attributes still sane!
911 * More stuff I haven't thought of!
912 * Rrrgh!
913 */
914 if (ia->ri_id->device != id->device) {
915 printk("RPC: %s: can't reconnect on "
916 "different device!\n", __func__);
917 rdma_destroy_id(id);
918 rc = -ENETUNREACH;
919 goto out;
920 }
921 /* END TEMP */
922 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
923 if (rc) {
924 dprintk("RPC: %s: rdma_create_qp failed %i\n",
925 __func__, rc);
926 rdma_destroy_id(id);
927 rc = -ENETUNREACH;
928 goto out;
929 }
930
931 write_lock(&ia->ri_qplock);
932 old = ia->ri_id;
933 ia->ri_id = id;
934 write_unlock(&ia->ri_qplock);
935
936 rdma_destroy_qp(old);
937 rdma_destroy_id(old);
938 } else {
939 dprintk("RPC: %s: connecting...\n", __func__);
940 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
941 if (rc) {
942 dprintk("RPC: %s: rdma_create_qp failed %i\n",
943 __func__, rc);
944 /* do not update ep->rep_connected */
945 return -ENETUNREACH;
946 }
947 }
948
949 ep->rep_connected = 0;
950
951 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
952 if (rc) {
953 dprintk("RPC: %s: rdma_connect() failed with %i\n",
954 __func__, rc);
955 goto out;
956 }
957
958 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
959
960 /*
961  * Check state. A non-peer reject indicates no listener
962  * (ECONNREFUSED), which may be a transient state. All
963  * other errors indicate a transport condition for which
964  * a best-effort recovery has already been attempted.
965 */
966 if (ep->rep_connected == -ECONNREFUSED &&
967 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
968 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
969 goto retry;
970 }
971 if (ep->rep_connected <= 0) {
972 /* Sometimes, the only way to reliably connect to remote
973  * CMs is to use the same nonzero values for ORD and IRD. */
974 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
975 (ep->rep_remote_cma.responder_resources == 0 ||
976 ep->rep_remote_cma.initiator_depth !=
977 ep->rep_remote_cma.responder_resources)) {
978 if (ep->rep_remote_cma.responder_resources == 0)
979 ep->rep_remote_cma.responder_resources = 1;
980 ep->rep_remote_cma.initiator_depth =
981 ep->rep_remote_cma.responder_resources;
982 goto retry;
983 }
984 rc = ep->rep_connected;
985 } else {
986 dprintk("RPC: %s: connected\n", __func__);
987 }
988
989 out:
990 if (rc)
991 ep->rep_connected = rc;
992 return rc;
993 }
994
995 /*
996 * rpcrdma_ep_disconnect
997 *
998 * This is separate from destroy to facilitate the ability
999 * to reconnect without recreating the endpoint.
1000 *
1001 * This call is not reentrant, and must not be made in parallel
1002 * on the same endpoint.
1003 */
1004 void
1005 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1006 {
1007 int rc;
1008
1009 rpcrdma_flush_cqs(ep);
1010 rc = rdma_disconnect(ia->ri_id);
1011 if (!rc) {
1012 /* returns without wait if not connected */
1013 wait_event_interruptible(ep->rep_connect_wait,
1014 ep->rep_connected != 1);
1015 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1016 (ep->rep_connected == 1) ? "still " : "dis");
1017 } else {
1018 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1019 ep->rep_connected = rc;
1020 }
1021 }
1022
1023 static struct rpcrdma_req *
1024 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1025 {
1026 struct rpcrdma_req *req;
1027
1028 req = kzalloc(sizeof(*req), GFP_KERNEL);
1029 if (req == NULL)
1030 return ERR_PTR(-ENOMEM);
1031
1032 req->rl_buffer = &r_xprt->rx_buf;
1033 return req;
1034 }
1035
1036 static struct rpcrdma_rep *
1037 rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1038 {
1039 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1040 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1041 struct rpcrdma_rep *rep;
1042 int rc;
1043
1044 rc = -ENOMEM;
1045 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1046 if (rep == NULL)
1047 goto out;
1048
1049 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
1050 GFP_KERNEL);
1051 if (IS_ERR(rep->rr_rdmabuf)) {
1052 rc = PTR_ERR(rep->rr_rdmabuf);
1053 goto out_free;
1054 }
1055
1056 rep->rr_buffer = &r_xprt->rx_buf;
1057 return rep;
1058
1059 out_free:
1060 kfree(rep);
1061 out:
1062 return ERR_PTR(rc);
1063 }
1064
1065 int
1066 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1067 {
1068 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1069 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1070 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1071 char *p;
1072 size_t len;
1073 int i, rc;
1074
1075 buf->rb_max_requests = cdata->max_requests;
1076 spin_lock_init(&buf->rb_lock);
1077
1078 /* Need to allocate:
1079 * 1. arrays for send and recv pointers
1080 * 2. arrays of struct rpcrdma_req to fill in pointers
1081 * 3. array of struct rpcrdma_rep for replies
1082 * Send/recv buffers in req/rep need to be registered
1083 */
1084 len = buf->rb_max_requests *
1085 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1086
1087 p = kzalloc(len, GFP_KERNEL);
1088 if (p == NULL) {
1089 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1090 __func__, len);
1091 rc = -ENOMEM;
1092 goto out;
1093 }
1094 buf->rb_pool = p; /* for freeing it later */
1095
1096 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1097 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1098 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1099 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1100
1101 rc = ia->ri_ops->ro_init(r_xprt);
1102 if (rc)
1103 goto out;
1104
1105 for (i = 0; i < buf->rb_max_requests; i++) {
1106 struct rpcrdma_req *req;
1107 struct rpcrdma_rep *rep;
1108
1109 req = rpcrdma_create_req(r_xprt);
1110 if (IS_ERR(req)) {
1111 dprintk("RPC: %s: request buffer %d alloc"
1112 " failed\n", __func__, i);
1113 rc = PTR_ERR(req);
1114 goto out;
1115 }
1116 buf->rb_send_bufs[i] = req;
1117
1118 rep = rpcrdma_create_rep(r_xprt);
1119 if (IS_ERR(rep)) {
1120 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1121 __func__, i);
1122 rc = PTR_ERR(rep);
1123 goto out;
1124 }
1125 buf->rb_recv_bufs[i] = rep;
1126 }
1127
1128 return 0;
1129 out:
1130 rpcrdma_buffer_destroy(buf);
1131 return rc;
1132 }
1133
1134 static void
1135 rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
1136 {
1137 if (!rep)
1138 return;
1139
1140 rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
1141 kfree(rep);
1142 }
1143
1144 static void
1145 rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1146 {
1147 if (!req)
1148 return;
1149
1150 rpcrdma_free_regbuf(ia, req->rl_sendbuf);
1151 rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
1152 kfree(req);
1153 }
1154
1155 void
1156 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1157 {
1158 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1159 int i;
1160
1161 /* clean up in reverse order from create
1162 * 1. recv mr memory (mr free, then kfree)
1163 * 2. send mr memory (mr free, then kfree)
1164 * 3. MWs
1165 */
1166 dprintk("RPC: %s: entering\n", __func__);
1167
1168 for (i = 0; i < buf->rb_max_requests; i++) {
1169 if (buf->rb_recv_bufs)
1170 rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
1171 if (buf->rb_send_bufs)
1172 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1173 }
1174
1175 ia->ri_ops->ro_destroy(buf);
1176
1177 kfree(buf->rb_pool);
1178 }
1179
1180 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1181 * some req segments uninitialized.
1182 */
1183 static void
1184 rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1185 {
1186 if (*mw) {
1187 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1188 *mw = NULL;
1189 }
1190 }
1191
1192 /* Cycle mw's back in reverse order, and "spin" them.
1193 * This delays and scrambles reuse as much as possible.
1194 */
1195 static void
1196 rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1197 {
1198 struct rpcrdma_mr_seg *seg = req->rl_segments;
1199 struct rpcrdma_mr_seg *seg1 = seg;
1200 int i;
1201
1202 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1203 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1204 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
1205 }
1206
1207 static void
1208 rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1209 {
1210 buf->rb_send_bufs[--buf->rb_send_index] = req;
1211 req->rl_niovs = 0;
1212 if (req->rl_reply) {
1213 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1214 req->rl_reply->rr_func = NULL;
1215 req->rl_reply = NULL;
1216 }
1217 }
1218
1219 /* rpcrdma_unmap_one() was already done during deregistration.
1220 * Redo only the ib_post_send().
1221 */
1222 static void
1223 rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1224 {
1225 struct rpcrdma_xprt *r_xprt =
1226 container_of(ia, struct rpcrdma_xprt, rx_ia);
1227 struct ib_send_wr invalidate_wr, *bad_wr;
1228 int rc;
1229
1230 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1231
1232 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1233 r->r.frmr.fr_state = FRMR_IS_INVALID;
1234
1235 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1236 invalidate_wr.wr_id = (unsigned long)(void *)r;
1237 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1238 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1239 DECR_CQCOUNT(&r_xprt->rx_ep);
1240
1241 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1242 __func__, r, r->r.frmr.fr_mr->rkey);
1243
1244 read_lock(&ia->ri_qplock);
1245 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1246 read_unlock(&ia->ri_qplock);
1247 if (rc) {
1248 /* Force rpcrdma_buffer_get() to retry */
1249 r->r.frmr.fr_state = FRMR_IS_STALE;
1250 dprintk("RPC: %s: ib_post_send failed, %i\n",
1251 __func__, rc);
1252 }
1253 }
1254
1255 static void
1256 rpcrdma_retry_flushed_linv(struct list_head *stale,
1257 struct rpcrdma_buffer *buf)
1258 {
1259 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1260 struct list_head *pos;
1261 struct rpcrdma_mw *r;
1262 unsigned long flags;
1263
1264 list_for_each(pos, stale) {
1265 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1266 rpcrdma_retry_local_inv(r, ia);
1267 }
1268
1269 spin_lock_irqsave(&buf->rb_lock, flags);
1270 list_splice_tail(stale, &buf->rb_mws);
1271 spin_unlock_irqrestore(&buf->rb_lock, flags);
1272 }
1273
1274 static struct rpcrdma_req *
1275 rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1276 struct list_head *stale)
1277 {
1278 struct rpcrdma_mw *r;
1279 int i;
1280
1281 i = RPCRDMA_MAX_SEGS - 1;
1282 while (!list_empty(&buf->rb_mws)) {
1283 r = list_entry(buf->rb_mws.next,
1284 struct rpcrdma_mw, mw_list);
1285 list_del(&r->mw_list);
1286 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1287 list_add(&r->mw_list, stale);
1288 continue;
1289 }
1290 req->rl_segments[i].rl_mw = r;
1291 if (unlikely(i-- == 0))
1292 return req; /* Success */
1293 }
1294
1295 /* Not enough entries on rb_mws for this req */
1296 rpcrdma_buffer_put_sendbuf(req, buf);
1297 rpcrdma_buffer_put_mrs(req, buf);
1298 return NULL;
1299 }
1300
1301 static struct rpcrdma_req *
1302 rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1303 {
1304 struct rpcrdma_mw *r;
1305 int i;
1306
1307 i = RPCRDMA_MAX_SEGS - 1;
1308 while (!list_empty(&buf->rb_mws)) {
1309 r = list_entry(buf->rb_mws.next,
1310 struct rpcrdma_mw, mw_list);
1311 list_del(&r->mw_list);
1312 req->rl_segments[i].rl_mw = r;
1313 if (unlikely(i-- == 0))
1314 return req; /* Success */
1315 }
1316
1317 /* Not enough entries on rb_mws for this req */
1318 rpcrdma_buffer_put_sendbuf(req, buf);
1319 rpcrdma_buffer_put_mrs(req, buf);
1320 return NULL;
1321 }
1322
1323 /*
1324 * Get a set of request/reply buffers.
1325 *
1326 * Reply buffer (if needed) is attached to send buffer upon return.
1327 * Rule:
1328 * rb_send_index and rb_recv_index MUST always be pointing to the
1329 * *next* available buffer (non-NULL). They are incremented after
1330 * removing buffers, and decremented *before* returning them.
1331 */
1332 struct rpcrdma_req *
1333 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1334 {
1335 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1336 struct list_head stale;
1337 struct rpcrdma_req *req;
1338 unsigned long flags;
1339
1340 spin_lock_irqsave(&buffers->rb_lock, flags);
1341 if (buffers->rb_send_index == buffers->rb_max_requests) {
1342 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1343 dprintk("RPC: %s: out of request buffers\n", __func__);
1344 return ((struct rpcrdma_req *)NULL);
1345 }
1346
1347 req = buffers->rb_send_bufs[buffers->rb_send_index];
1348 if (buffers->rb_send_index < buffers->rb_recv_index) {
1349 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1350 __func__,
1351 buffers->rb_recv_index - buffers->rb_send_index);
1352 req->rl_reply = NULL;
1353 } else {
1354 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1355 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1356 }
1357 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1358
1359 INIT_LIST_HEAD(&stale);
1360 switch (ia->ri_memreg_strategy) {
1361 case RPCRDMA_FRMR:
1362 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1363 break;
1364 case RPCRDMA_MTHCAFMR:
1365 req = rpcrdma_buffer_get_fmrs(req, buffers);
1366 break;
1367 default:
1368 break;
1369 }
1370 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1371 if (!list_empty(&stale))
1372 rpcrdma_retry_flushed_linv(&stale, buffers);
1373 return req;
1374 }
1375
1376 /*
1377 * Put request/reply buffers back into pool.
1378 * Pre-decrement counter/array index.
1379 */
1380 void
1381 rpcrdma_buffer_put(struct rpcrdma_req *req)
1382 {
1383 struct rpcrdma_buffer *buffers = req->rl_buffer;
1384 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1385 unsigned long flags;
1386
1387 spin_lock_irqsave(&buffers->rb_lock, flags);
1388 rpcrdma_buffer_put_sendbuf(req, buffers);
1389 switch (ia->ri_memreg_strategy) {
1390 case RPCRDMA_FRMR:
1391 case RPCRDMA_MTHCAFMR:
1392 rpcrdma_buffer_put_mrs(req, buffers);
1393 break;
1394 default:
1395 break;
1396 }
1397 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1398 }
1399
1400 /*
1401 * Recover reply buffers from pool.
1402 * This happens when recovering from error conditions.
1403 * Post-increment counter/array index.
1404 */
1405 void
1406 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1407 {
1408 struct rpcrdma_buffer *buffers = req->rl_buffer;
1409 unsigned long flags;
1410
1411 spin_lock_irqsave(&buffers->rb_lock, flags);
1412 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1413 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1414 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1415 }
1416 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1417 }
1418
1419 /*
1420 * Put reply buffers back into pool when not attached to
1421 * request. This happens in error conditions.
1422 */
1423 void
1424 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1425 {
1426 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1427 unsigned long flags;
1428
1429 rep->rr_func = NULL;
1430 spin_lock_irqsave(&buffers->rb_lock, flags);
1431 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1432 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1433 }
1434
1435 /*
1436 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1437 */
1438
1439 void
1440 rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
1441 {
1442 dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
1443 seg->mr_offset,
1444 (unsigned long long)seg->mr_dma, seg->mr_dmalen);
1445 }
1446
1447 static int
1448 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1449 struct ib_mr **mrp, struct ib_sge *iov)
1450 {
1451 struct ib_phys_buf ipb;
1452 struct ib_mr *mr;
1453 int rc;
1454
1455 /*
1456 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1457 */
1458 iov->addr = ib_dma_map_single(ia->ri_id->device,
1459 va, len, DMA_BIDIRECTIONAL);
1460 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1461 return -ENOMEM;
1462
1463 iov->length = len;
1464
1465 if (ia->ri_have_dma_lkey) {
1466 *mrp = NULL;
1467 iov->lkey = ia->ri_dma_lkey;
1468 return 0;
1469 } else if (ia->ri_bind_mem != NULL) {
1470 *mrp = NULL;
1471 iov->lkey = ia->ri_bind_mem->lkey;
1472 return 0;
1473 }
1474
1475 ipb.addr = iov->addr;
1476 ipb.size = iov->length;
1477 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1478 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1479
1480 dprintk("RPC: %s: phys convert: 0x%llx "
1481 "registered 0x%llx length %d\n",
1482 __func__, (unsigned long long)ipb.addr,
1483 (unsigned long long)iov->addr, len);
1484
1485 if (IS_ERR(mr)) {
1486 *mrp = NULL;
1487 rc = PTR_ERR(mr);
1488 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1489 } else {
1490 *mrp = mr;
1491 iov->lkey = mr->lkey;
1492 rc = 0;
1493 }
1494
1495 return rc;
1496 }
1497
1498 static int
1499 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1500 struct ib_mr *mr, struct ib_sge *iov)
1501 {
1502 int rc;
1503
1504 ib_dma_unmap_single(ia->ri_id->device,
1505 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1506
1507 if (NULL == mr)
1508 return 0;
1509
1510 rc = ib_dereg_mr(mr);
1511 if (rc)
1512 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1513 return rc;
1514 }
1515
1516 /**
1517 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1518 * @ia: controlling rpcrdma_ia
1519 * @size: size of buffer to be allocated, in bytes
1520 * @flags: GFP flags
1521 *
1522 * Returns pointer to private header of an area of internally
1523 * registered memory, or an ERR_PTR. The registered buffer follows
1524 * the end of the private header.
1525 *
1526 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1527 * receiving the payload of RDMA RECV operations. regbufs are not
1528 * used for RDMA READ/WRITE operations, thus are registered only for
1529 * LOCAL access.
1530 */
1531 struct rpcrdma_regbuf *
1532 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
1533 {
1534 struct rpcrdma_regbuf *rb;
1535 int rc;
1536
1537 rc = -ENOMEM;
1538 rb = kmalloc(sizeof(*rb) + size, flags);
1539 if (rb == NULL)
1540 goto out;
1541
1542 rb->rg_size = size;
1543 rb->rg_owner = NULL;
1544 rc = rpcrdma_register_internal(ia, rb->rg_base, size,
1545 &rb->rg_mr, &rb->rg_iov);
1546 if (rc)
1547 goto out_free;
1548
1549 return rb;
1550
1551 out_free:
1552 kfree(rb);
1553 out:
1554 return ERR_PTR(rc);
1555 }
1556
1557 /**
1558 * rpcrdma_free_regbuf - deregister and free registered buffer
1559 * @ia: controlling rpcrdma_ia
1560 * @rb: regbuf to be deregistered and freed
1561 */
1562 void
1563 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1564 {
1565 if (rb) {
1566 rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
1567 kfree(rb);
1568 }
1569 }
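/* Illustrative pairing, as done elsewhere in this file: a regbuf is
 * allocated once (for example in rpcrdma_create_rep) and released on
 * teardown (rpcrdma_destroy_rep):
 *
 *	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
 *					       GFP_KERNEL);
 *	...
 *	rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
 */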
1570
1571 /*
1572 * Prepost any receive buffer, then post send.
1573 *
1574 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1575 */
1576 int
1577 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1578 struct rpcrdma_ep *ep,
1579 struct rpcrdma_req *req)
1580 {
1581 struct ib_send_wr send_wr, *send_wr_fail;
1582 struct rpcrdma_rep *rep = req->rl_reply;
1583 int rc;
1584
1585 if (rep) {
1586 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1587 if (rc)
1588 goto out;
1589 req->rl_reply = NULL;
1590 }
1591
1592 send_wr.next = NULL;
1593 send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
1594 send_wr.sg_list = req->rl_send_iov;
1595 send_wr.num_sge = req->rl_niovs;
1596 send_wr.opcode = IB_WR_SEND;
1597 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1598 ib_dma_sync_single_for_device(ia->ri_id->device,
1599 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1600 DMA_TO_DEVICE);
1601 ib_dma_sync_single_for_device(ia->ri_id->device,
1602 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1603 DMA_TO_DEVICE);
1604 ib_dma_sync_single_for_device(ia->ri_id->device,
1605 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1606 DMA_TO_DEVICE);
1607
1608 if (DECR_CQCOUNT(ep) > 0)
1609 send_wr.send_flags = 0;
1610 else { /* Provider must take a send completion every now and then */
1611 INIT_CQCOUNT(ep);
1612 send_wr.send_flags = IB_SEND_SIGNALED;
1613 }
1614
1615 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1616 if (rc)
1617 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1618 rc);
1619 out:
1620 return rc;
1621 }
1622
1623 /*
1624 * (Re)post a receive buffer.
1625 */
1626 int
1627 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1628 struct rpcrdma_ep *ep,
1629 struct rpcrdma_rep *rep)
1630 {
1631 struct ib_recv_wr recv_wr, *recv_wr_fail;
1632 int rc;
1633
1634 recv_wr.next = NULL;
1635 recv_wr.wr_id = (u64) (unsigned long) rep;
1636 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
1637 recv_wr.num_sge = 1;
1638
1639 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1640 rdmab_addr(rep->rr_rdmabuf),
1641 rdmab_length(rep->rr_rdmabuf),
1642 DMA_BIDIRECTIONAL);
1643
1644 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1645
1646 if (rc)
1647 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1648 rc);
1649 return rc;
1650 }
1651
1652 /* How many chunk list items fit within our inline buffers?
1653 */
1654 unsigned int
1655 rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
1656 {
1657 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1658 int bytes, segments;
1659
1660 bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
1661 bytes -= RPCRDMA_HDRLEN_MIN;
1662 if (bytes < sizeof(struct rpcrdma_segment) * 2) {
1663 pr_warn("RPC: %s: inline threshold too small\n",
1664 __func__);
1665 return 0;
1666 }
1667
1668 segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
1669 dprintk("RPC: %s: max chunk list size = %d segments\n",
1670 __func__, segments);
1671 return segments;
1672 }