1/*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/pci.h> /* for Tavor hack below */
51
52#include "xprt_rdma.h"
53
54/*
55 * Globals/Macros
56 */
57
58#ifdef RPC_DEBUG
59# define RPCDBG_FACILITY RPCDBG_TRANS
60#endif
61
62/*
63 * internal functions
64 */
65
66/*
67 * handle replies in tasklet context, using a single, global list
68 * rdma tasklet function -- just turn around and call the func
69 * for all replies on the list
70 */
71
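/*
 * Reply dispatch path, roughly (all of the functions named here are
 * defined below in this file):
 *
 *   completion upcall (interrupt context)
 *     -> rpcrdma_event_process(&wc)
 *          -> rpcrdma_schedule_tasklet(rep)
 *   rpcrdma_run_tasklet (softirq context)
 *     -> rep->rr_func(rep), or rpcrdma_recv_buffer_put(rep)
 *        when no reply handler was set
 */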
72static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
73static LIST_HEAD(rpcrdma_tasklets_g);
74
75static void
76rpcrdma_run_tasklet(unsigned long data)
77{
78 struct rpcrdma_rep *rep;
79 void (*func)(struct rpcrdma_rep *);
80 unsigned long flags;
81
82 data = data;
83 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84 while (!list_empty(&rpcrdma_tasklets_g)) {
85 rep = list_entry(rpcrdma_tasklets_g.next,
86 struct rpcrdma_rep, rr_list);
87 list_del(&rep->rr_list);
88 func = rep->rr_func;
89 rep->rr_func = NULL;
90 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
91
92 if (func)
93 func(rep);
94 else
95 rpcrdma_recv_buffer_put(rep);
96
97 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
98 }
99 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
100}
101
102static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
103
104static inline void
105rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
106{
107 unsigned long flags;
108
109 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112 tasklet_schedule(&rpcrdma_tasklet_g);
113}
114
115static void
116rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
117{
118 struct rpcrdma_ep *ep = context;
119
120 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
121 __func__, event->event, event->device->name, context);
122 if (ep->rep_connected == 1) {
123 ep->rep_connected = -EIO;
124 ep->rep_func(ep);
125 wake_up_all(&ep->rep_connect_wait);
126 }
127}
128
129static void
130rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
131{
132 struct rpcrdma_ep *ep = context;
133
134 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
135 __func__, event->event, event->device->name, context);
136 if (ep->rep_connected == 1) {
137 ep->rep_connected = -EIO;
138 ep->rep_func(ep);
139 wake_up_all(&ep->rep_connect_wait);
140 }
141}
142
143static inline
144void rpcrdma_event_process(struct ib_wc *wc)
145{
146 struct rpcrdma_rep *rep =
147 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
148
149 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
150 __func__, rep, wc->status, wc->opcode, wc->byte_len);
151
152 if (!rep) /* send or bind completion that we don't care about */
153 return;
154
155 if (IB_WC_SUCCESS != wc->status) {
156 dprintk("RPC: %s: %s WC status %X, connection lost\n",
157 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
158 wc->status);
159 rep->rr_len = ~0U;
160 rpcrdma_schedule_tasklet(rep);
161 return;
162 }
163
164 switch (wc->opcode) {
165 case IB_WC_RECV:
166 rep->rr_len = wc->byte_len;
167 ib_dma_sync_single_for_cpu(
168 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
169 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
170 /* Keep (only) the most recent credits, after checking validity */
171 if (rep->rr_len >= 16) {
172 struct rpcrdma_msg *p =
173 (struct rpcrdma_msg *) rep->rr_base;
174 unsigned int credits = ntohl(p->rm_credit);
175 if (credits == 0) {
176 dprintk("RPC: %s: server"
177 " dropped credits to 0!\n", __func__);
178 /* don't deadlock */
179 credits = 1;
180 } else if (credits > rep->rr_buffer->rb_max_requests) {
181 dprintk("RPC: %s: server"
182 " over-crediting: %d (%d)\n",
183 __func__, credits,
184 rep->rr_buffer->rb_max_requests);
185 credits = rep->rr_buffer->rb_max_requests;
186 }
187 atomic_set(&rep->rr_buffer->rb_credits, credits);
188 }
189 /* fall through */
190 case IB_WC_BIND_MW:
191 rpcrdma_schedule_tasklet(rep);
192 break;
193 default:
194 dprintk("RPC: %s: unexpected WC event %X\n",
195 __func__, wc->opcode);
196 break;
197 }
198}
199
200static inline int
201rpcrdma_cq_poll(struct ib_cq *cq)
202{
203 struct ib_wc wc;
204 int rc;
205
206 for (;;) {
207 rc = ib_poll_cq(cq, 1, &wc);
208 if (rc < 0) {
209 dprintk("RPC: %s: ib_poll_cq failed %i\n",
210 __func__, rc);
211 return rc;
212 }
213 if (rc == 0)
214 break;
215
216 rpcrdma_event_process(&wc);
217 }
218
219 return 0;
220}
221
222/*
223 * rpcrdma_cq_event_upcall
224 *
225 * This upcall handles recv, send, bind and unbind events.
226 * It is reentrant but processes single events in order to maintain
227 * ordering of receives to keep server credits.
228 *
229 * It is the responsibility of the scheduled tasklet to return
230 * recv buffers to the pool. NOTE: this affects synchronization of
231 * connection shutdown. That is, the structures required for
232 * the completion of the reply handler must remain intact until
233 * all memory has been reclaimed.
234 *
235 * Note that send events are suppressed and do not result in an upcall.
236 */
237static void
238rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
239{
240 int rc;
241
242 rc = rpcrdma_cq_poll(cq);
243 if (rc)
244 return;
245
246 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
247 if (rc) {
248 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
249 __func__, rc);
250 return;
251 }
252
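	/*
	 * Poll once more after re-arming: a completion that raced in
	 * between the first drain and ib_req_notify_cq() does not
	 * generate another upcall, so pick it up here.
	 */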
253 rpcrdma_cq_poll(cq);
254}
255
256#ifdef RPC_DEBUG
257static const char * const conn[] = {
258 "address resolved",
259 "address error",
260 "route resolved",
261 "route error",
262 "connect request",
263 "connect response",
264 "connect error",
265 "unreachable",
266 "rejected",
267 "established",
268 "disconnected",
269 "device removal"
270};
271#endif
272
273static int
274rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
275{
276 struct rpcrdma_xprt *xprt = id->context;
277 struct rpcrdma_ia *ia = &xprt->rx_ia;
278 struct rpcrdma_ep *ep = &xprt->rx_ep;
279 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
280 struct ib_qp_attr attr;
281 struct ib_qp_init_attr iattr;
282 int connstate = 0;
283
284 switch (event->event) {
285 case RDMA_CM_EVENT_ADDR_RESOLVED:
286 case RDMA_CM_EVENT_ROUTE_RESOLVED:
287 ia->ri_async_rc = 0;
288 complete(&ia->ri_done);
289 break;
290 case RDMA_CM_EVENT_ADDR_ERROR:
291 ia->ri_async_rc = -EHOSTUNREACH;
292 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
293 __func__, ep);
294 complete(&ia->ri_done);
295 break;
296 case RDMA_CM_EVENT_ROUTE_ERROR:
297 ia->ri_async_rc = -ENETUNREACH;
298 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
299 __func__, ep);
300 complete(&ia->ri_done);
301 break;
302 case RDMA_CM_EVENT_ESTABLISHED:
303 connstate = 1;
304 ib_query_qp(ia->ri_id->qp, &attr,
305 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
306 &iattr);
307 dprintk("RPC: %s: %d responder resources"
308 " (%d initiator)\n",
309 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
310 goto connected;
311 case RDMA_CM_EVENT_CONNECT_ERROR:
312 connstate = -ENOTCONN;
313 goto connected;
314 case RDMA_CM_EVENT_UNREACHABLE:
315 connstate = -ENETDOWN;
316 goto connected;
317 case RDMA_CM_EVENT_REJECTED:
318 connstate = -ECONNREFUSED;
319 goto connected;
320 case RDMA_CM_EVENT_DISCONNECTED:
321 connstate = -ECONNABORTED;
322 goto connected;
323 case RDMA_CM_EVENT_DEVICE_REMOVAL:
324 connstate = -ENODEV;
325connected:
326 dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
327 __func__,
328 (event->event <= 11) ? conn[event->event] :
329 "unknown connection error",
330 &addr->sin_addr.s_addr,
331 ntohs(addr->sin_port),
332 ep, event->event);
333 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
334 dprintk("RPC: %s: %sconnected\n",
335 __func__, connstate > 0 ? "" : "dis");
336 ep->rep_connected = connstate;
337 ep->rep_func(ep);
338 wake_up_all(&ep->rep_connect_wait);
339 break;
340 default:
341 dprintk("RPC: %s: unexpected CM event %d\n",
342 __func__, event->event);
343 break;
344 }
345
346#ifdef RPC_DEBUG
347 if (connstate == 1) {
348 int ird = attr.max_dest_rd_atomic;
349 int tird = ep->rep_remote_cma.responder_resources;
350 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
351 "on %s, memreg %d slots %d ird %d%s\n",
352 &addr->sin_addr.s_addr,
353 ntohs(addr->sin_port),
354 ia->ri_id->device->name,
355 ia->ri_memreg_strategy,
356 xprt->rx_buf.rb_max_requests,
357 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
358 } else if (connstate < 0) {
359 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
360 &addr->sin_addr.s_addr,
361 ntohs(addr->sin_port),
362 connstate);
363 }
364#endif
365
366 return 0;
367}
368
369static struct rdma_cm_id *
370rpcrdma_create_id(struct rpcrdma_xprt *xprt,
371 struct rpcrdma_ia *ia, struct sockaddr *addr)
372{
373 struct rdma_cm_id *id;
374 int rc;
375
376 init_completion(&ia->ri_done);
377
378 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
379 if (IS_ERR(id)) {
380 rc = PTR_ERR(id);
381 dprintk("RPC: %s: rdma_create_id() failed %i\n",
382 __func__, rc);
383 return id;
384 }
385
386 ia->ri_async_rc = -ETIMEDOUT;
387 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
388 if (rc) {
389 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
390 __func__, rc);
391 goto out;
392 }
393 wait_for_completion_interruptible_timeout(&ia->ri_done,
394 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
395 rc = ia->ri_async_rc;
396 if (rc)
397 goto out;
398
399 ia->ri_async_rc = -ETIMEDOUT;
400 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
401 if (rc) {
402 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
403 __func__, rc);
404 goto out;
405 }
406 wait_for_completion_interruptible_timeout(&ia->ri_done,
407 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
408 rc = ia->ri_async_rc;
409 if (rc)
410 goto out;
411
412 return id;
413
414out:
415 rdma_destroy_id(id);
416 return ERR_PTR(rc);
417}
418
419/*
420 * Drain any cq, prior to teardown.
421 */
422static void
423rpcrdma_clean_cq(struct ib_cq *cq)
424{
425 struct ib_wc wc;
426 int count = 0;
427
428 while (1 == ib_poll_cq(cq, 1, &wc))
429 ++count;
430
431 if (count)
432 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
433 __func__, count, wc.opcode);
434}
435
436/*
437 * Exported functions.
438 */
439
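/*
 * Rough call order expected from the transport layer (a sketch; the
 * actual call sites are in the other files of this module, presumably
 * transport.c and rpc_rdma.c):
 *
 *   rpcrdma_ia_open()                       once per transport
 *   rpcrdma_ep_create()                     CQ + QP attributes
 *   rpcrdma_buffer_create()                 req/rep pools, mw/fmr/frmr
 *   rpcrdma_ep_connect()                    connect and reconnect
 *   rpcrdma_ep_post()/rpcrdma_ep_post_recv()    per-RPC I/O
 *   rpcrdma_buffer_destroy(), rpcrdma_ep_destroy(), rpcrdma_ia_close()
 */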
440/*
441 * Open and initialize an Interface Adapter.
442 * o initializes fields of struct rpcrdma_ia, including
443 * interface and provider attributes and protection zone.
444 */
445int
446rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
447{
448 int rc, mem_priv;
449 struct ib_device_attr devattr;
450 struct rpcrdma_ia *ia = &xprt->rx_ia;
451
452 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
453 if (IS_ERR(ia->ri_id)) {
454 rc = PTR_ERR(ia->ri_id);
455 goto out1;
456 }
457
458 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
459 if (IS_ERR(ia->ri_pd)) {
460 rc = PTR_ERR(ia->ri_pd);
461 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
462 __func__, rc);
463 goto out2;
464 }
465
466 /*
467 * Query the device to determine if the requested memory
468 * registration strategy is supported. If it isn't, set the
469 * strategy to a globally supported model.
470 */
471 rc = ib_query_device(ia->ri_id->device, &devattr);
472 if (rc) {
473 dprintk("RPC: %s: ib_query_device failed %d\n",
474 __func__, rc);
475 goto out2;
476 }
477
478 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
479 ia->ri_have_dma_lkey = 1;
480 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
481 }
482
483 switch (memreg) {
484 case RPCRDMA_MEMWINDOWS:
485 case RPCRDMA_MEMWINDOWS_ASYNC:
486 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
487 dprintk("RPC: %s: MEMWINDOWS registration "
488 "specified but not supported by adapter, "
489 "using slower RPCRDMA_REGISTER\n",
490 __func__);
491 memreg = RPCRDMA_REGISTER;
492 }
493 break;
494 case RPCRDMA_MTHCAFMR:
495 if (!ia->ri_id->device->alloc_fmr) {
496#if RPCRDMA_PERSISTENT_REGISTRATION
497 dprintk("RPC: %s: MTHCAFMR registration "
498 "specified but not supported by adapter, "
499 "using riskier RPCRDMA_ALLPHYSICAL\n",
500 __func__);
501 memreg = RPCRDMA_ALLPHYSICAL;
502#else
503 dprintk("RPC: %s: MTHCAFMR registration "
504 "specified but not supported by adapter, "
505 "using slower RPCRDMA_REGISTER\n",
506 __func__);
507 memreg = RPCRDMA_REGISTER;
508#endif
509 }
510 break;
511 case RPCRDMA_FRMR:
512 /* Requires both frmr reg and local dma lkey */
513 if ((devattr.device_cap_flags &
514 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
515 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
516#if RPCRDMA_PERSISTENT_REGISTRATION
517 dprintk("RPC: %s: FRMR registration "
518 "specified but not supported by adapter, "
519 "using riskier RPCRDMA_ALLPHYSICAL\n",
520 __func__);
521 memreg = RPCRDMA_ALLPHYSICAL;
522#else
523 dprintk("RPC: %s: FRMR registration "
524 "specified but not supported by adapter, "
525 "using slower RPCRDMA_REGISTER\n",
526 __func__);
527 memreg = RPCRDMA_REGISTER;
528#endif
529 }
530 break;
531 }
532
533 /*
534 * Optionally obtain an underlying physical identity mapping in
535 * order to do a memory window-based bind. This base registration
536 * is protected from remote access - that is enabled only by binding
537 * for the specific bytes targeted during each RPC operation, and
538 * revoked after the corresponding completion similar to a storage
539 * adapter.
540 */
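	/*
	 * Base DMA MR requirements per mode, as handled by the switch
	 * below (the RPCRDMA_* memreg values come from xprt_rdma.h):
	 *   BOUNCEBUFFERS, REGISTER, FRMR - no base MR needed here
	 *   ALLPHYSICAL   - all-physical MR with remote read/write
	 *                   (only when RPCRDMA_PERSISTENT_REGISTRATION
	 *                   is compiled in)
	 *   MEMWINDOWS(_ASYNC) - local write + MW_BIND, for ib_bind_mw
	 *   MTHCAFMR      - local write only, skipped if the device
	 *                   provides a global dma_lkey
	 */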
541 switch (memreg) {
542 case RPCRDMA_BOUNCEBUFFERS:
543 case RPCRDMA_REGISTER:
544 case RPCRDMA_FRMR:
545 break;
546#if RPCRDMA_PERSISTENT_REGISTRATION
547 case RPCRDMA_ALLPHYSICAL:
548 mem_priv = IB_ACCESS_LOCAL_WRITE |
549 IB_ACCESS_REMOTE_WRITE |
550 IB_ACCESS_REMOTE_READ;
551 goto register_setup;
552#endif
553 case RPCRDMA_MEMWINDOWS_ASYNC:
554 case RPCRDMA_MEMWINDOWS:
555 mem_priv = IB_ACCESS_LOCAL_WRITE |
556 IB_ACCESS_MW_BIND;
557 goto register_setup;
558 case RPCRDMA_MTHCAFMR:
559 if (ia->ri_have_dma_lkey)
560 break;
561 mem_priv = IB_ACCESS_LOCAL_WRITE;
562 register_setup:
563 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
564 if (IS_ERR(ia->ri_bind_mem)) {
565 printk(KERN_ALERT "%s: ib_get_dma_mr for "
566 "phys register failed with %lX\n\t"
567 "Will continue with degraded performance\n",
568 __func__, PTR_ERR(ia->ri_bind_mem));
569 memreg = RPCRDMA_REGISTER;
570 ia->ri_bind_mem = NULL;
571 }
572 break;
573 default:
574 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
575 __func__, memreg);
576 rc = -EINVAL;
577 goto out2;
578 }
579 dprintk("RPC: %s: memory registration strategy is %d\n",
580 __func__, memreg);
581
582 /* Else will do memory reg/dereg for each chunk */
583 ia->ri_memreg_strategy = memreg;
584
585 return 0;
586out2:
587 rdma_destroy_id(ia->ri_id);
588 ia->ri_id = NULL;
589out1:
590 return rc;
591}
592
593/*
594 * Clean up/close an IA.
595 * o if event handles and PD have been initialized, free them.
596 * o close the IA
597 */
598void
599rpcrdma_ia_close(struct rpcrdma_ia *ia)
600{
601 int rc;
602
603 dprintk("RPC: %s: entering\n", __func__);
604 if (ia->ri_bind_mem != NULL) {
605 rc = ib_dereg_mr(ia->ri_bind_mem);
606 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
607 __func__, rc);
608 }
609 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
610 if (ia->ri_id->qp)
611 rdma_destroy_qp(ia->ri_id);
612 rdma_destroy_id(ia->ri_id);
613 ia->ri_id = NULL;
614 }
615 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
616 rc = ib_dealloc_pd(ia->ri_pd);
617 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
618 __func__, rc);
619 }
620}
621
622/*
623 * Create unconnected endpoint.
624 */
625int
626rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
627 struct rpcrdma_create_data_internal *cdata)
628{
629 struct ib_device_attr devattr;
630 int rc, err;
631
632 rc = ib_query_device(ia->ri_id->device, &devattr);
633 if (rc) {
634 dprintk("RPC: %s: ib_query_device failed %d\n",
635 __func__, rc);
636 return rc;
637 }
638
639 /* check provider's send/recv wr limits */
640 if (cdata->max_requests > devattr.max_qp_wr)
641 cdata->max_requests = devattr.max_qp_wr;
642
643 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
644 ep->rep_attr.qp_context = ep;
645 /* send_cq and recv_cq initialized below */
646 ep->rep_attr.srq = NULL;
647 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
648 switch (ia->ri_memreg_strategy) {
649 case RPCRDMA_FRMR:
650 /* Add room for frmr register and invalidate WRs */
651 ep->rep_attr.cap.max_send_wr *= 3;
652 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
653 return -EINVAL;
654 break;
655 case RPCRDMA_MEMWINDOWS_ASYNC:
656 case RPCRDMA_MEMWINDOWS:
657 /* Add room for mw_binds+unbinds - overkill! */
658 ep->rep_attr.cap.max_send_wr++;
659 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
660 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
661 return -EINVAL;
662 break;
663 default:
664 break;
665 }
666 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
667 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
668 ep->rep_attr.cap.max_recv_sge = 1;
669 ep->rep_attr.cap.max_inline_data = 0;
670 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
671 ep->rep_attr.qp_type = IB_QPT_RC;
672 ep->rep_attr.port_num = ~0;
673
674 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
675 "iovs: send %d recv %d\n",
676 __func__,
677 ep->rep_attr.cap.max_send_wr,
678 ep->rep_attr.cap.max_recv_wr,
679 ep->rep_attr.cap.max_send_sge,
680 ep->rep_attr.cap.max_recv_sge);
681
682 /* set trigger for requesting send completion */
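	/*
	 * rep_cqinit feeds the INIT_CQCOUNT/DECR_CQCOUNT macros (defined
	 * in xprt_rdma.h): sends are posted unsignaled until roughly
	 * rep_cqinit of them have gone by, then one signaled send is
	 * issued so the provider can retire its send queue (see
	 * rpcrdma_ep_post below).
	 */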
683 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
684 switch (ia->ri_memreg_strategy) {
685 case RPCRDMA_MEMWINDOWS_ASYNC:
686 case RPCRDMA_MEMWINDOWS:
687 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
688 break;
689 default:
690 break;
691 }
692 if (ep->rep_cqinit <= 2)
693 ep->rep_cqinit = 0;
694 INIT_CQCOUNT(ep);
695 ep->rep_ia = ia;
696 init_waitqueue_head(&ep->rep_connect_wait);
697
698 /*
699 * Create a single cq for receive dto and mw_bind (only ever
700 * care about unbind, really). Send completions are suppressed.
701 * Use single threaded tasklet upcalls to maintain ordering.
702 */
703 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
704 rpcrdma_cq_async_error_upcall, NULL,
705 ep->rep_attr.cap.max_recv_wr +
706 ep->rep_attr.cap.max_send_wr + 1, 0);
707 if (IS_ERR(ep->rep_cq)) {
708 rc = PTR_ERR(ep->rep_cq);
709 dprintk("RPC: %s: ib_create_cq failed: %i\n",
710 __func__, rc);
711 goto out1;
712 }
713
714 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
715 if (rc) {
716 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
717 __func__, rc);
718 goto out2;
719 }
720
721 ep->rep_attr.send_cq = ep->rep_cq;
722 ep->rep_attr.recv_cq = ep->rep_cq;
723
724 /* Initialize cma parameters */
725
726 /* RPC/RDMA does not use private data */
727 ep->rep_remote_cma.private_data = NULL;
728 ep->rep_remote_cma.private_data_len = 0;
729
730 /* Client offers RDMA Read but does not initiate */
731 ep->rep_remote_cma.initiator_depth = 0;
732 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
733 ep->rep_remote_cma.responder_resources = 0;
734 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
735 ep->rep_remote_cma.responder_resources = 32;
736 else
737 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
738
739 ep->rep_remote_cma.retry_count = 7;
740 ep->rep_remote_cma.flow_control = 0;
741 ep->rep_remote_cma.rnr_retry_count = 0;
742
743 return 0;
744
745out2:
746 err = ib_destroy_cq(ep->rep_cq);
747 if (err)
748 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
749 __func__, err);
750out1:
751 return rc;
752}
753
754/*
755 * rpcrdma_ep_destroy
756 *
757 * Disconnect and destroy endpoint. After this, the only
758 * valid operations on the ep are to free it (if dynamically
759 * allocated) or re-create it.
760 *
761 * The caller's error handling must be sure to not leak the endpoint
762 * if this function fails.
763 */
764int
765rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
766{
767 int rc;
768
769 dprintk("RPC: %s: entering, connected is %d\n",
770 __func__, ep->rep_connected);
771
772 if (ia->ri_id->qp) {
773 rc = rpcrdma_ep_disconnect(ep, ia);
774 if (rc)
775 dprintk("RPC: %s: rpcrdma_ep_disconnect"
776 " returned %i\n", __func__, rc);
777 rdma_destroy_qp(ia->ri_id);
778 ia->ri_id->qp = NULL;
779 }
780
781 /* padding - could be done in rpcrdma_buffer_destroy... */
782 if (ep->rep_pad_mr) {
783 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
784 ep->rep_pad_mr = NULL;
785 }
786
787 rpcrdma_clean_cq(ep->rep_cq);
788 rc = ib_destroy_cq(ep->rep_cq);
789 if (rc)
790 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
791 __func__, rc);
792
793 return rc;
794}
795
796/*
797 * Connect unconnected endpoint.
798 */
799int
800rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
801{
802 struct rdma_cm_id *id;
803 int rc = 0;
804 int retry_count = 0;
805
806 if (ep->rep_connected != 0) {
807 struct rpcrdma_xprt *xprt;
808retry:
809 rc = rpcrdma_ep_disconnect(ep, ia);
810 if (rc && rc != -ENOTCONN)
811 dprintk("RPC: %s: rpcrdma_ep_disconnect"
812 " status %i\n", __func__, rc);
813 rpcrdma_clean_cq(ep->rep_cq);
814
815 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
816 id = rpcrdma_create_id(xprt, ia,
817 (struct sockaddr *)&xprt->rx_data.addr);
818 if (IS_ERR(id)) {
819 rc = PTR_ERR(id);
820 goto out;
821 }
822 /* TEMP TEMP TEMP - fail if new device:
823 * Deregister/remarshal *all* requests!
824 * Close and recreate adapter, pd, etc!
825 * Re-determine all attributes still sane!
826 * More stuff I haven't thought of!
827 * Rrrgh!
828 */
829 if (ia->ri_id->device != id->device) {
830 printk("RPC: %s: can't reconnect on "
831 "different device!\n", __func__);
832 rdma_destroy_id(id);
833 rc = -ENETDOWN;
834 goto out;
835 }
836 /* END TEMP */
837 rdma_destroy_qp(ia->ri_id);
838 rdma_destroy_id(ia->ri_id);
839 ia->ri_id = id;
840 }
841
842 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
843 if (rc) {
844 dprintk("RPC: %s: rdma_create_qp failed %i\n",
845 __func__, rc);
846 goto out;
847 }
848
849/* XXX Tavor device performs badly with 2K MTU! */
850if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
851 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
852 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
853 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
854 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
855 struct ib_qp_attr attr = {
856 .path_mtu = IB_MTU_1024
857 };
858 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
859 }
860}
861
862 ep->rep_connected = 0;
863
864 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
865 if (rc) {
866 dprintk("RPC: %s: rdma_connect() failed with %i\n",
867 __func__, rc);
868 goto out;
869 }
870
871 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
872
873 /*
874 * Check state. A non-peer reject indicates no listener
875 * (ECONNREFUSED), which may be a transient state. All
876 * others indicate a transport condition which has already
877 * undergone a best-effort attempt.
878 */
879 if (ep->rep_connected == -ECONNREFUSED
880 && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
881 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
882 goto retry;
883 }
884 if (ep->rep_connected <= 0) {
885 /* Sometimes, the only way to reliably connect to remote
886 * CMs is to use same nonzero values for ORD and IRD. */
887 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
888 (ep->rep_remote_cma.responder_resources == 0 ||
889 ep->rep_remote_cma.initiator_depth !=
890 ep->rep_remote_cma.responder_resources)) {
891 if (ep->rep_remote_cma.responder_resources == 0)
892 ep->rep_remote_cma.responder_resources = 1;
893 ep->rep_remote_cma.initiator_depth =
894 ep->rep_remote_cma.responder_resources;
895 goto retry;
896 }
897 rc = ep->rep_connected;
898 } else {
899 dprintk("RPC: %s: connected\n", __func__);
900 }
901
902out:
903 if (rc)
904 ep->rep_connected = rc;
905 return rc;
906}
907
908/*
909 * rpcrdma_ep_disconnect
910 *
911 * This is separate from destroy to facilitate the ability
912 * to reconnect without recreating the endpoint.
913 *
914 * This call is not reentrant, and must not be made in parallel
915 * on the same endpoint.
916 */
917int
918rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
919{
920 int rc;
921
922 rpcrdma_clean_cq(ep->rep_cq);
923 rc = rdma_disconnect(ia->ri_id);
924 if (!rc) {
925 /* returns without wait if not connected */
926 wait_event_interruptible(ep->rep_connect_wait,
927 ep->rep_connected != 1);
928 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
929 (ep->rep_connected == 1) ? "still " : "dis");
930 } else {
931 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
932 ep->rep_connected = rc;
933 }
934 return rc;
935}
936
937/*
938 * Initialize buffer memory
939 */
940int
941rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
942 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
943{
944 char *p;
945 size_t len;
946 int i, rc;
947 struct rpcrdma_mw *r;
948
949 buf->rb_max_requests = cdata->max_requests;
950 spin_lock_init(&buf->rb_lock);
951 atomic_set(&buf->rb_credits, 1);
952
953 /* Need to allocate:
954 * 1. arrays for send and recv pointers
955 * 2. arrays of struct rpcrdma_req to fill in pointers
956 * 3. array of struct rpcrdma_rep for replies
957 * 4. padding, if any
958 * 5. mw's, fmr's or frmr's, if any
959 * Send/recv buffers in req/rep need to be registered
960 */
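	/*
	 * The single rb_pool allocation is carved up below in this order:
	 *   [ rb_send_bufs[max_requests] pointer array ]
	 *   [ rb_recv_bufs[max_requests] pointer array ]
	 *   [ optional zeroed pad buffer, cdata->padding bytes ]
	 *   [ struct rpcrdma_mw array for the FRMR/FMR/MW modes ]
	 */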
961
962 len = buf->rb_max_requests *
963 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
964 len += cdata->padding;
965 switch (ia->ri_memreg_strategy) {
966 case RPCRDMA_FRMR:
967 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
968 sizeof(struct rpcrdma_mw);
969 break;
970 case RPCRDMA_MTHCAFMR:
971 /* TBD we are perhaps overallocating here */
972 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
973 sizeof(struct rpcrdma_mw);
974 break;
975 case RPCRDMA_MEMWINDOWS_ASYNC:
976 case RPCRDMA_MEMWINDOWS:
977 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
978 sizeof(struct rpcrdma_mw);
979 break;
980 default:
981 break;
982 }
983
984 /* allocate 1, 4 and 5 in one shot */
985 p = kzalloc(len, GFP_KERNEL);
986 if (p == NULL) {
987 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
988 __func__, len);
989 rc = -ENOMEM;
990 goto out;
991 }
992 buf->rb_pool = p; /* for freeing it later */
993
994 buf->rb_send_bufs = (struct rpcrdma_req **) p;
995 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
996 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
997 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
998
999 /*
1000 * Register the zeroed pad buffer, if any.
1001 */
1002 if (cdata->padding) {
1003 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1004 &ep->rep_pad_mr, &ep->rep_pad);
1005 if (rc)
1006 goto out;
1007 }
1008 p += cdata->padding;
1009
1010 /*
1011 * Allocate the fmr's, or mw's for mw_bind chunk registration.
1012 * We "cycle" the mw's in order to minimize rkey reuse,
1013 * and also reduce unbind-to-bind collision.
1014 */
1015 INIT_LIST_HEAD(&buf->rb_mws);
1016 r = (struct rpcrdma_mw *)p;
1017 switch (ia->ri_memreg_strategy) {
1018 case RPCRDMA_FRMR:
1019 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1020 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1021 RPCRDMA_MAX_SEGS);
1022 if (IS_ERR(r->r.frmr.fr_mr)) {
1023 rc = PTR_ERR(r->r.frmr.fr_mr);
1024 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1025 " failed %i\n", __func__, rc);
1026 goto out;
1027 }
1028 r->r.frmr.fr_pgl =
1029 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1030 RPCRDMA_MAX_SEGS);
1031 if (IS_ERR(r->r.frmr.fr_pgl)) {
1032 rc = PTR_ERR(r->r.frmr.fr_pgl);
1033 dprintk("RPC: %s: "
1034 "ib_alloc_fast_reg_page_list "
1035 "failed %i\n", __func__, rc);
1036 goto out;
1037 }
1038 list_add(&r->mw_list, &buf->rb_mws);
1039 ++r;
1040 }
1041 break;
1042 case RPCRDMA_MTHCAFMR:
1043 /* TBD we are perhaps overallocating here */
1044 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1045 static struct ib_fmr_attr fa =
1046 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1047 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1048 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1049 &fa);
1050 if (IS_ERR(r->r.fmr)) {
1051 rc = PTR_ERR(r->r.fmr);
1052 dprintk("RPC: %s: ib_alloc_fmr"
1053 " failed %i\n", __func__, rc);
1054 goto out;
1055 }
1056 list_add(&r->mw_list, &buf->rb_mws);
1057 ++r;
1058 }
1059 break;
1060 case RPCRDMA_MEMWINDOWS_ASYNC:
1061 case RPCRDMA_MEMWINDOWS:
1062 /* Allocate one extra request's worth, for full cycling */
1063 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1064 r->r.mw = ib_alloc_mw(ia->ri_pd);
1065 if (IS_ERR(r->r.mw)) {
1066 rc = PTR_ERR(r->r.mw);
1067 dprintk("RPC: %s: ib_alloc_mw"
1068 " failed %i\n", __func__, rc);
1069 goto out;
1070 }
1071 list_add(&r->mw_list, &buf->rb_mws);
1072 ++r;
1073 }
1074 break;
1075 default:
1076 break;
1077 }
1078
1079 /*
1080 * Allocate/init the request/reply buffers. Doing this
1081 * using kmalloc for now -- one for each buf.
1082 */
1083 for (i = 0; i < buf->rb_max_requests; i++) {
1084 struct rpcrdma_req *req;
1085 struct rpcrdma_rep *rep;
1086
1087 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1088 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1089 /* Typical ~2400b, so rounding up saves work later */
1090 if (len < 4096)
1091 len = 4096;
1092 req = kmalloc(len, GFP_KERNEL);
1093 if (req == NULL) {
1094 dprintk("RPC: %s: request buffer %d alloc"
1095 " failed\n", __func__, i);
1096 rc = -ENOMEM;
1097 goto out;
1098 }
1099 memset(req, 0, sizeof(struct rpcrdma_req));
1100 buf->rb_send_bufs[i] = req;
1101 buf->rb_send_bufs[i]->rl_buffer = buf;
1102
1103 rc = rpcrdma_register_internal(ia, req->rl_base,
1104 len - offsetof(struct rpcrdma_req, rl_base),
1105 &buf->rb_send_bufs[i]->rl_handle,
1106 &buf->rb_send_bufs[i]->rl_iov);
1107 if (rc)
1108 goto out;
1109
1110 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1111
1112 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1113 rep = kmalloc(len, GFP_KERNEL);
1114 if (rep == NULL) {
1115 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1116 __func__, i);
1117 rc = -ENOMEM;
1118 goto out;
1119 }
1120 memset(rep, 0, sizeof(struct rpcrdma_rep));
1121 buf->rb_recv_bufs[i] = rep;
1122 buf->rb_recv_bufs[i]->rr_buffer = buf;
1123 init_waitqueue_head(&rep->rr_unbind);
1124
1125 rc = rpcrdma_register_internal(ia, rep->rr_base,
1126 len - offsetof(struct rpcrdma_rep, rr_base),
1127 &buf->rb_recv_bufs[i]->rr_handle,
1128 &buf->rb_recv_bufs[i]->rr_iov);
1129 if (rc)
1130 goto out;
1131
1132 }
1133 dprintk("RPC: %s: max_requests %d\n",
1134 __func__, buf->rb_max_requests);
1135 /* done */
1136 return 0;
1137out:
1138 rpcrdma_buffer_destroy(buf);
1139 return rc;
1140}
1141
1142/*
1143 * Unregister and destroy buffer memory. Need to deal with
1144 * partial initialization, so it's callable from failed create.
1145 * Must be called before destroying endpoint, as registrations
1146 * reference it.
1147 */
1148void
1149rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1150{
1151 int rc, i;
1152 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1153 struct rpcrdma_mw *r;
1154
1155 /* clean up in reverse order from create
1156 * 1. recv mr memory (mr free, then kfree)
1157 * 1a. bind mw memory
1158 * 2. send mr memory (mr free, then kfree)
1159 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1160 * 4. arrays
1161 */
1162 dprintk("RPC: %s: entering\n", __func__);
1163
1164 for (i = 0; i < buf->rb_max_requests; i++) {
1165 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1166 rpcrdma_deregister_internal(ia,
1167 buf->rb_recv_bufs[i]->rr_handle,
1168 &buf->rb_recv_bufs[i]->rr_iov);
1169 kfree(buf->rb_recv_bufs[i]);
1170 }
1171 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1172 while (!list_empty(&buf->rb_mws)) {
1173 r = list_entry(buf->rb_mws.next,
1174 struct rpcrdma_mw, mw_list);
1175 list_del(&r->mw_list);
1176 switch (ia->ri_memreg_strategy) {
1177 case RPCRDMA_FRMR:
1178 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1179 if (rc)
1180 dprintk("RPC: %s:"
1181 " ib_dereg_mr"
1182 " failed %i\n",
1183 __func__, rc);
1184 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1185 break;
1186 case RPCRDMA_MTHCAFMR:
1187 rc = ib_dealloc_fmr(r->r.fmr);
1188 if (rc)
1189 dprintk("RPC: %s:"
1190 " ib_dealloc_fmr"
1191 " failed %i\n",
1192 __func__, rc);
1193 break;
1194 case RPCRDMA_MEMWINDOWS_ASYNC:
1195 case RPCRDMA_MEMWINDOWS:
1196 rc = ib_dealloc_mw(r->r.mw);
1197 if (rc)
1198 dprintk("RPC: %s:"
1199 " ib_dealloc_mw"
1200 " failed %i\n",
1201 __func__, rc);
1202 break;
1203 default:
1204 break;
1205 }
1206 }
1207 rpcrdma_deregister_internal(ia,
1208 buf->rb_send_bufs[i]->rl_handle,
1209 &buf->rb_send_bufs[i]->rl_iov);
1210 kfree(buf->rb_send_bufs[i]);
1211 }
1212 }
1213
1214 kfree(buf->rb_pool);
1215}
1216
1217/*
1218 * Get a set of request/reply buffers.
1219 *
1220 * Reply buffer (if needed) is attached to send buffer upon return.
1221 * Rule:
1222 * rb_send_index and rb_recv_index MUST always be pointing to the
1223 * *next* available buffer (non-NULL). They are incremented after
1224 * removing buffers, and decremented *before* returning them.
1225 */
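/*
 * Typical pairing, as a sketch (the call sites are in the transport
 * code; xprt_rdma_allocate() is the allocate-side caller referred to
 * below, and a matching free-side caller is assumed):
 *
 *   req = rpcrdma_buffer_get(&r_xprt->rx_buf);
 *           - send_index/recv_index advance, slots are NULLed,
 *             req->rl_reply carries an attached rpcrdma_rep
 *   ... RPC is marshalled, posted, reply processed ...
 *   rpcrdma_buffer_put(req);
 *           - indexes are pre-decremented and req, rep and any
 *             cached mw's return to the pool
 */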
1226struct rpcrdma_req *
1227rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1228{
1229 struct rpcrdma_req *req;
1230 unsigned long flags;
1231 int i;
1232 struct rpcrdma_mw *r;
1233
1234 spin_lock_irqsave(&buffers->rb_lock, flags);
1235 if (buffers->rb_send_index == buffers->rb_max_requests) {
1236 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1237 dprintk("RPC: %s: out of request buffers\n", __func__);
1238 return ((struct rpcrdma_req *)NULL);
1239 }
1240
1241 req = buffers->rb_send_bufs[buffers->rb_send_index];
1242 if (buffers->rb_send_index < buffers->rb_recv_index) {
1243 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1244 __func__,
1245 buffers->rb_recv_index - buffers->rb_send_index);
1246 req->rl_reply = NULL;
1247 } else {
1248 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1249 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1250 }
1251 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1252 if (!list_empty(&buffers->rb_mws)) {
1253 i = RPCRDMA_MAX_SEGS - 1;
1254 do {
1255 r = list_entry(buffers->rb_mws.next,
1256 struct rpcrdma_mw, mw_list);
1257 list_del(&r->mw_list);
1258 req->rl_segments[i].mr_chunk.rl_mw = r;
1259 } while (--i >= 0);
1260 }
1261 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1262 return req;
1263}
1264
1265/*
1266 * Put request/reply buffers back into pool.
1267 * Pre-decrement counter/array index.
1268 */
1269void
1270rpcrdma_buffer_put(struct rpcrdma_req *req)
1271{
1272 struct rpcrdma_buffer *buffers = req->rl_buffer;
1273 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1274 int i;
1275 unsigned long flags;
1276
1277 BUG_ON(req->rl_nchunks != 0);
1278 spin_lock_irqsave(&buffers->rb_lock, flags);
1279 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1280 req->rl_niovs = 0;
1281 if (req->rl_reply) {
1282 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1283 init_waitqueue_head(&req->rl_reply->rr_unbind);
1284 req->rl_reply->rr_func = NULL;
1285 req->rl_reply = NULL;
1286 }
1287 switch (ia->ri_memreg_strategy) {
1288 case RPCRDMA_FRMR:
1289 case RPCRDMA_MTHCAFMR:
1290 case RPCRDMA_MEMWINDOWS_ASYNC:
1291 case RPCRDMA_MEMWINDOWS:
1292 /*
1293 * Cycle mw's back in reverse order, and "spin" them.
1294 * This delays and scrambles reuse as much as possible.
1295 */
1296 i = 1;
1297 do {
1298 struct rpcrdma_mw **mw;
1299 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1300 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1301 *mw = NULL;
1302 } while (++i < RPCRDMA_MAX_SEGS);
1303 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1304 &buffers->rb_mws);
1305 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1306 break;
1307 default:
1308 break;
1309 }
1310 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1311}
1312
1313/*
1314 * Recover reply buffers from pool.
1315 * This happens when recovering from error conditions.
1316 * Post-increment counter/array index.
1317 */
1318void
1319rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1320{
1321 struct rpcrdma_buffer *buffers = req->rl_buffer;
1322 unsigned long flags;
1323
1324 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1325 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1326 spin_lock_irqsave(&buffers->rb_lock, flags);
1327 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1328 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1329 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1330 }
1331 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1332}
1333
1334/*
1335 * Put reply buffers back into pool when not attached to
1336 * request. This happens in error conditions, and when
1337 * aborting unbinds. Pre-decrement counter/array index.
1338 */
1339void
1340rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1341{
1342 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1343 unsigned long flags;
1344
1345 rep->rr_func = NULL;
1346 spin_lock_irqsave(&buffers->rb_lock, flags);
1347 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1348 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1349}
1350
1351/*
1352 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1353 */
1354
1355int
1356rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1357 struct ib_mr **mrp, struct ib_sge *iov)
1358{
1359 struct ib_phys_buf ipb;
1360 struct ib_mr *mr;
1361 int rc;
1362
1363 /*
1364 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1365 */
1366 iov->addr = ib_dma_map_single(ia->ri_id->device,
1367 va, len, DMA_BIDIRECTIONAL);
1368 iov->length = len;
1369
1370 if (ia->ri_have_dma_lkey) {
1371 *mrp = NULL;
1372 iov->lkey = ia->ri_dma_lkey;
1373 return 0;
1374 } else if (ia->ri_bind_mem != NULL) {
1375 *mrp = NULL;
1376 iov->lkey = ia->ri_bind_mem->lkey;
1377 return 0;
1378 }
1379
1380 ipb.addr = iov->addr;
1381 ipb.size = iov->length;
1382 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1383 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1384
1385 dprintk("RPC: %s: phys convert: 0x%llx "
1386 "registered 0x%llx length %d\n",
1387 __func__, (unsigned long long)ipb.addr,
1388 (unsigned long long)iov->addr, len);
1389
1390 if (IS_ERR(mr)) {
1391 *mrp = NULL;
1392 rc = PTR_ERR(mr);
1393 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1394 } else {
1395 *mrp = mr;
1396 iov->lkey = mr->lkey;
1397 rc = 0;
1398 }
1399
1400 return rc;
1401}
1402
1403int
1404rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1405 struct ib_mr *mr, struct ib_sge *iov)
1406{
1407 int rc;
1408
1409 ib_dma_unmap_single(ia->ri_id->device,
1410 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1411
1412 if (NULL == mr)
1413 return 0;
1414
1415 rc = ib_dereg_mr(mr);
1416 if (rc)
1417 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1418 return rc;
1419}
1420
1421/*
1422 * Wrappers for chunk registration, shared by read/write chunk code.
1423 */
1424
1425static void
1426rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1427{
1428 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1429 seg->mr_dmalen = seg->mr_len;
1430 if (seg->mr_page)
1431 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1432 seg->mr_page, offset_in_page(seg->mr_offset),
1433 seg->mr_dmalen, seg->mr_dir);
1434 else
1435 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1436 seg->mr_offset,
1437 seg->mr_dmalen, seg->mr_dir);
1438}
1439
1440static void
1441rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1442{
1443 if (seg->mr_page)
1444 ib_dma_unmap_page(ia->ri_id->device,
1445 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1446 else
1447 ib_dma_unmap_single(ia->ri_id->device,
1448 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1449}
1450
1451static int
1452rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1453 int *nsegs, int writing, struct rpcrdma_ia *ia,
1454 struct rpcrdma_xprt *r_xprt)
1455{
1456 struct rpcrdma_mr_seg *seg1 = seg;
1457 struct ib_send_wr frmr_wr, *bad_wr;
1458 u8 key;
1459 int len, pageoff;
1460 int i, rc;
1461
1462 pageoff = offset_in_page(seg1->mr_offset);
1463 seg1->mr_offset -= pageoff; /* start of page */
1464 seg1->mr_len += pageoff;
1465 len = -pageoff;
1466 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1467 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1468 for (i = 0; i < *nsegs;) {
1469 rpcrdma_map_one(ia, seg, writing);
1470 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1471 len += seg->mr_len;
1472 ++seg;
1473 ++i;
1474 /* Check for holes */
1475 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1476 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1477 break;
1478 }
1479 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1480 __func__, seg1->mr_chunk.rl_mw, i);
1481
1482 /* Bump the key */
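	/*
	 * (The low-order byte of an FRMR rkey is consumer-owned;
	 * ib_update_fast_reg_key() changes it for each registration so
	 * that rkeys handed out for earlier, since-invalidated
	 * registrations go stale and cannot be replayed.)
	 */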
1483 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1484 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1485
1486 /* Prepare FRMR WR */
1487 memset(&frmr_wr, 0, sizeof frmr_wr);
1488 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1489 frmr_wr.send_flags = 0; /* unsignaled */
1490 frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
1491 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1492 frmr_wr.wr.fast_reg.page_list_len = i;
1493 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1494 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1495 frmr_wr.wr.fast_reg.access_flags = (writing ?
1496 IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
1497 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1498 DECR_CQCOUNT(&r_xprt->rx_ep);
1499
1500 rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
1501
1502 if (rc) {
1503 dprintk("RPC: %s: failed ib_post_send for register,"
1504 " status %i\n", __func__, rc);
1505 while (i--)
1506 rpcrdma_unmap_one(ia, --seg);
1507 } else {
1508 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1509 seg1->mr_base = seg1->mr_dma + pageoff;
1510 seg1->mr_nsegs = i;
1511 seg1->mr_len = len;
1512 }
1513 *nsegs = i;
1514 return rc;
1515}
1516
1517static int
1518rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1519 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1520{
1521 struct rpcrdma_mr_seg *seg1 = seg;
1522 struct ib_send_wr invalidate_wr, *bad_wr;
1523 int rc;
1524
1525 while (seg1->mr_nsegs--)
1526 rpcrdma_unmap_one(ia, seg++);
1527
1528 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1529 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1530 invalidate_wr.send_flags = 0; /* unsignaled */
1531 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1532 DECR_CQCOUNT(&r_xprt->rx_ep);
1533
1534 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1535 if (rc)
1536 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1537 " status %i\n", __func__, rc);
1538 return rc;
1539}
1540
1541static int
1542rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1543 int *nsegs, int writing, struct rpcrdma_ia *ia)
1544{
1545 struct rpcrdma_mr_seg *seg1 = seg;
1546 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1547 int len, pageoff, i, rc;
1548
1549 pageoff = offset_in_page(seg1->mr_offset);
1550 seg1->mr_offset -= pageoff; /* start of page */
1551 seg1->mr_len += pageoff;
1552 len = -pageoff;
1553 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1554 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1555 for (i = 0; i < *nsegs;) {
1556 rpcrdma_map_one(ia, seg, writing);
1557 physaddrs[i] = seg->mr_dma;
1558 len += seg->mr_len;
1559 ++seg;
1560 ++i;
1561 /* Check for holes */
1562 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1563 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1564 break;
1565 }
1566 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1567 physaddrs, i, seg1->mr_dma);
1568 if (rc) {
1569 dprintk("RPC: %s: failed ib_map_phys_fmr "
1570 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1571 len, (unsigned long long)seg1->mr_dma,
1572 pageoff, i, rc);
1573 while (i--)
1574 rpcrdma_unmap_one(ia, --seg);
1575 } else {
1576 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1577 seg1->mr_base = seg1->mr_dma + pageoff;
1578 seg1->mr_nsegs = i;
1579 seg1->mr_len = len;
1580 }
1581 *nsegs = i;
1582 return rc;
1583}
1584
1585static int
1586rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1587 struct rpcrdma_ia *ia)
1588{
1589 struct rpcrdma_mr_seg *seg1 = seg;
1590 LIST_HEAD(l);
1591 int rc;
1592
1593 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1594 rc = ib_unmap_fmr(&l);
1595 while (seg1->mr_nsegs--)
1596 rpcrdma_unmap_one(ia, seg++);
1597 if (rc)
1598 dprintk("RPC: %s: failed ib_unmap_fmr,"
1599 " status %i\n", __func__, rc);
1600 return rc;
1601}
1602
1603static int
1604rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1605 int *nsegs, int writing, struct rpcrdma_ia *ia,
1606 struct rpcrdma_xprt *r_xprt)
1607{
1608 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1609 IB_ACCESS_REMOTE_READ);
1610 struct ib_mw_bind param;
1611 int rc;
1612
1613 *nsegs = 1;
1614 rpcrdma_map_one(ia, seg, writing);
1615 param.mr = ia->ri_bind_mem;
1616 param.wr_id = 0ULL; /* no send cookie */
1617 param.addr = seg->mr_dma;
1618 param.length = seg->mr_len;
1619 param.send_flags = 0;
1620 param.mw_access_flags = mem_priv;
1621
1622 DECR_CQCOUNT(&r_xprt->rx_ep);
1623 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1624 if (rc) {
1625 dprintk("RPC: %s: failed ib_bind_mw "
1626 "%u@0x%llx status %i\n",
1627 __func__, seg->mr_len,
1628 (unsigned long long)seg->mr_dma, rc);
1629 rpcrdma_unmap_one(ia, seg);
1630 } else {
1631 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1632 seg->mr_base = param.addr;
1633 seg->mr_nsegs = 1;
1634 }
1635 return rc;
1636}
1637
1638static int
1639rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1640 struct rpcrdma_ia *ia,
1641 struct rpcrdma_xprt *r_xprt, void **r)
1642{
1643 struct ib_mw_bind param;
1644 LIST_HEAD(l);
1645 int rc;
1646
1647 BUG_ON(seg->mr_nsegs != 1);
1648 param.mr = ia->ri_bind_mem;
1649 param.addr = 0ULL; /* unbind */
1650 param.length = 0;
1651 param.mw_access_flags = 0;
1652 if (*r) {
1653 param.wr_id = (u64) (unsigned long) *r;
1654 param.send_flags = IB_SEND_SIGNALED;
1655 INIT_CQCOUNT(&r_xprt->rx_ep);
1656 } else {
1657 param.wr_id = 0ULL;
1658 param.send_flags = 0;
1659 DECR_CQCOUNT(&r_xprt->rx_ep);
1660 }
1661 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1662 rpcrdma_unmap_one(ia, seg);
1663 if (rc)
1664 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1665 " status %i\n", __func__, rc);
1666 else
1667 *r = NULL; /* will upcall on completion */
1668 return rc;
1669}
1670
1671static int
1672rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1673 int *nsegs, int writing, struct rpcrdma_ia *ia)
1674{
1675 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1676 IB_ACCESS_REMOTE_READ);
1677 struct rpcrdma_mr_seg *seg1 = seg;
1678 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1679 int len, i, rc = 0;
1680
1681 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1682 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1683 for (len = 0, i = 0; i < *nsegs;) {
1684 rpcrdma_map_one(ia, seg, writing);
1685 ipb[i].addr = seg->mr_dma;
1686 ipb[i].size = seg->mr_len;
1687 len += seg->mr_len;
1688 ++seg;
1689 ++i;
1690 /* Check for holes */
1691 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1692 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1693 break;
1694 }
1695 seg1->mr_base = seg1->mr_dma;
1696 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1697 ipb, i, mem_priv, &seg1->mr_base);
1698 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1699 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1700 dprintk("RPC: %s: failed ib_reg_phys_mr "
1701 "%u@0x%llx (%d)... status %i\n",
1702 __func__, len,
1703 (unsigned long long)seg1->mr_dma, i, rc);
1704 while (i--)
1705 rpcrdma_unmap_one(ia, --seg);
1706 } else {
1707 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1708 seg1->mr_nsegs = i;
1709 seg1->mr_len = len;
1710 }
1711 *nsegs = i;
1712 return rc;
1713}
1714
1715static int
1716rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1717 struct rpcrdma_ia *ia)
1718{
1719 struct rpcrdma_mr_seg *seg1 = seg;
1720 int rc;
1721
1722 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1723 seg1->mr_chunk.rl_mr = NULL;
1724 while (seg1->mr_nsegs--)
1725 rpcrdma_unmap_one(ia, seg++);
1726 if (rc)
1727 dprintk("RPC: %s: failed ib_dereg_mr,"
1728 " status %i\n", __func__, rc);
1729 return rc;
1730}
1731
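/*
 * How the chunk-building code is expected to drive the two entry
 * points below -- a sketch only; the real caller lives in rpc_rdma.c:
 *
 *   n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
 *   if (n < 0)
 *           fail;                   (it returns -1 on failure)
 *   ... advertise seg->mr_rkey, mr_base, mr_len in the RPC/RDMA
 *       header so the server can RDMA the chunk ...
 *   rpcrdma_deregister_external(seg, r_xprt, NULL);
 */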
1732int
1733rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1734 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1735{
1736 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1737 int rc = 0;
1738
1739 switch (ia->ri_memreg_strategy) {
1740
1741#if RPCRDMA_PERSISTENT_REGISTRATION
1742 case RPCRDMA_ALLPHYSICAL:
1743 rpcrdma_map_one(ia, seg, writing);
1744 seg->mr_rkey = ia->ri_bind_mem->rkey;
1745 seg->mr_base = seg->mr_dma;
1746 seg->mr_nsegs = 1;
1747 nsegs = 1;
1748 break;
1749#endif
1750
1751 /* Registration using frmr registration */
1752 case RPCRDMA_FRMR:
1753 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1754 break;
1755
1756 /* Registration using fmr memory registration */
1757 case RPCRDMA_MTHCAFMR:
1758 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1759 break;
1760
1761 /* Registration using memory windows */
1762 case RPCRDMA_MEMWINDOWS_ASYNC:
1763 case RPCRDMA_MEMWINDOWS:
1764 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1765 break;
1766
1767 /* Default registration each time */
1768 default:
1769 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1770 break;
1771 }
1772 if (rc)
1773 return -1;
1774
1775 return nsegs;
1776}
1777
1778int
1779rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1780 struct rpcrdma_xprt *r_xprt, void *r)
1781{
1782 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1783 int nsegs = seg->mr_nsegs, rc;
1784
1785 switch (ia->ri_memreg_strategy) {
1786
1787#if RPCRDMA_PERSISTENT_REGISTRATION
1788 case RPCRDMA_ALLPHYSICAL:
1789 BUG_ON(nsegs != 1);
1790 rpcrdma_unmap_one(ia, seg);
1791 rc = 0;
1792 break;
1793#endif
1794
1795 case RPCRDMA_FRMR:
1796 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1797 break;
1798
1799 case RPCRDMA_MTHCAFMR:
1800 rc = rpcrdma_deregister_fmr_external(seg, ia);
1801 break;
1802
1803 case RPCRDMA_MEMWINDOWS_ASYNC:
1804 case RPCRDMA_MEMWINDOWS:
1805 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1806 break;
1807
1808 default:
1809 rc = rpcrdma_deregister_default_external(seg, ia);
1810 break;
1811 }
1812 if (r) {
1813 struct rpcrdma_rep *rep = r;
1814 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1815 rep->rr_func = NULL;
1816 func(rep); /* dereg done, callback now */
1817 }
1818 return nsegs;
1819}
1820
1821/*
1822 * Prepost any receive buffer, then post send.
1823 *
1824 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1825 */
1826int
1827rpcrdma_ep_post(struct rpcrdma_ia *ia,
1828 struct rpcrdma_ep *ep,
1829 struct rpcrdma_req *req)
1830{
1831 struct ib_send_wr send_wr, *send_wr_fail;
1832 struct rpcrdma_rep *rep = req->rl_reply;
1833 int rc;
1834
1835 if (rep) {
1836 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1837 if (rc)
1838 goto out;
1839 req->rl_reply = NULL;
1840 }
1841
1842 send_wr.next = NULL;
1843 send_wr.wr_id = 0ULL; /* no send cookie */
1844 send_wr.sg_list = req->rl_send_iov;
1845 send_wr.num_sge = req->rl_niovs;
1846 send_wr.opcode = IB_WR_SEND;
1847 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1848 ib_dma_sync_single_for_device(ia->ri_id->device,
1849 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1850 DMA_TO_DEVICE);
1851 ib_dma_sync_single_for_device(ia->ri_id->device,
1852 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1853 DMA_TO_DEVICE);
1854 ib_dma_sync_single_for_device(ia->ri_id->device,
1855 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1856 DMA_TO_DEVICE);
1857
1858 if (DECR_CQCOUNT(ep) > 0)
1859 send_wr.send_flags = 0;
1860 else { /* Provider must take a send completion every now and then */
1861 INIT_CQCOUNT(ep);
1862 send_wr.send_flags = IB_SEND_SIGNALED;
1863 }
1864
1865 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1866 if (rc)
1867 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1868 rc);
1869out:
1870 return rc;
1871}
1872
1873/*
1874 * (Re)post a receive buffer.
1875 */
1876int
1877rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1878 struct rpcrdma_ep *ep,
1879 struct rpcrdma_rep *rep)
1880{
1881 struct ib_recv_wr recv_wr, *recv_wr_fail;
1882 int rc;
1883
1884 recv_wr.next = NULL;
1885 recv_wr.wr_id = (u64) (unsigned long) rep;
1886 recv_wr.sg_list = &rep->rr_iov;
1887 recv_wr.num_sge = 1;
1888
1889 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1890 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1891
1892 DECR_CQCOUNT(ep);
1893 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1894
1895 if (rc)
1896 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1897 rc);
1898 return rc;
1899}