]>
Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
f13193f5 | 2 | /* |
ecf85b23 | 3 | * Copyright (c) 2016-2018 Oracle. All rights reserved. |
f13193f5 CL |
4 | * |
5 | * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. | |
6 | */ | |
7 | ||
98895edb CL |
8 | #include <rdma/rw.h> |
9 | ||
f13193f5 CL |
10 | #include <linux/sunrpc/rpc_rdma.h> |
11 | #include <linux/sunrpc/svc_rdma.h> | |
12 | #include <linux/sunrpc/debug.h> | |
13 | ||
98895edb CL |
14 | #include "xprt_rdma.h" |
15 | #include <trace/events/rpcrdma.h> | |
f13193f5 CL |
16 | |
17 | #define RPCDBG_FACILITY RPCDBG_SVCXPRT | |
18 | ||
026d958b CL |
19 | static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc); |
20 | static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); | |
21 | ||
f13193f5 CL |
22 | /* Each R/W context contains state for one chain of RDMA Read or |
23 | * Write Work Requests. | |
24 | * | |
25 | * Each WR chain handles a single contiguous server-side buffer, | |
26 | * because scatterlist entries after the first have to start on | |
27 | * page alignment. xdr_buf iovecs cannot guarantee alignment. | |
28 | * | |
29 | * Each WR chain handles only one R_key. Each RPC-over-RDMA segment | |
30 | * from a client may contain a unique R_key, so each WR chain moves | |
31 | * up to one segment at a time. | |
32 | * | |
33 | * The scatterlist makes this data structure over 4KB in size. To | |
34 | * make it less likely to fail, and to handle the allocation for | |
35 | * smaller I/O requests without disabling bottom-halves, these | |
36 | * contexts are created on demand, but cached and reused until the | |
37 | * controlling svcxprt_rdma is destroyed. | |
38 | */ | |
39 | struct svc_rdma_rw_ctxt { | |
40 | struct list_head rw_list; | |
41 | struct rdma_rw_ctx rw_ctx; | |
42 | int rw_nents; | |
43 | struct sg_table rw_sg_table; | |
44 | struct scatterlist rw_first_sgl[0]; | |
45 | }; | |
46 | ||
47 | static inline struct svc_rdma_rw_ctxt * | |
48 | svc_rdma_next_ctxt(struct list_head *list) | |
49 | { | |
50 | return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt, | |
51 | rw_list); | |
52 | } | |
53 | ||
54 | static struct svc_rdma_rw_ctxt * | |
55 | svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) | |
56 | { | |
57 | struct svc_rdma_rw_ctxt *ctxt; | |
58 | ||
59 | spin_lock(&rdma->sc_rw_ctxt_lock); | |
60 | ||
61 | ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); | |
62 | if (ctxt) { | |
63 | list_del(&ctxt->rw_list); | |
64 | spin_unlock(&rdma->sc_rw_ctxt_lock); | |
65 | } else { | |
66 | spin_unlock(&rdma->sc_rw_ctxt_lock); | |
14cfbd94 | 67 | ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), |
f13193f5 CL |
68 | GFP_KERNEL); |
69 | if (!ctxt) | |
70 | goto out; | |
71 | INIT_LIST_HEAD(&ctxt->rw_list); | |
72 | } | |
73 | ||
74 | ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; | |
75 | if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, | |
76 | ctxt->rw_sg_table.sgl)) { | |
77 | kfree(ctxt); | |
78 | ctxt = NULL; | |
79 | } | |
80 | out: | |
81 | return ctxt; | |
82 | } | |
83 | ||
84 | static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, | |
85 | struct svc_rdma_rw_ctxt *ctxt) | |
86 | { | |
87 | sg_free_table_chained(&ctxt->rw_sg_table, true); | |
88 | ||
89 | spin_lock(&rdma->sc_rw_ctxt_lock); | |
90 | list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); | |
91 | spin_unlock(&rdma->sc_rw_ctxt_lock); | |
92 | } | |
93 | ||
94 | /** | |
95 | * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts | |
96 | * @rdma: transport about to be destroyed | |
97 | * | |
98 | */ | |
99 | void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) | |
100 | { | |
101 | struct svc_rdma_rw_ctxt *ctxt; | |
102 | ||
103 | while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { | |
104 | list_del(&ctxt->rw_list); | |
105 | kfree(ctxt); | |
106 | } | |
107 | } | |
108 | ||
109 | /* A chunk context tracks all I/O for moving one Read or Write | |
110 | * chunk. This is a a set of rdma_rw's that handle data movement | |
111 | * for all segments of one chunk. | |
112 | * | |
113 | * These are small, acquired with a single allocator call, and | |
114 | * no more than one is needed per chunk. They are allocated on | |
115 | * demand, and not cached. | |
116 | */ | |
117 | struct svc_rdma_chunk_ctxt { | |
118 | struct ib_cqe cc_cqe; | |
119 | struct svcxprt_rdma *cc_rdma; | |
120 | struct list_head cc_rwctxts; | |
121 | int cc_sqecount; | |
f13193f5 CL |
122 | }; |
123 | ||
124 | static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, | |
35a30fc3 | 125 | struct svc_rdma_chunk_ctxt *cc) |
f13193f5 CL |
126 | { |
127 | cc->cc_rdma = rdma; | |
128 | svc_xprt_get(&rdma->sc_xprt); | |
129 | ||
130 | INIT_LIST_HEAD(&cc->cc_rwctxts); | |
131 | cc->cc_sqecount = 0; | |
f13193f5 CL |
132 | } |
133 | ||
35a30fc3 CL |
134 | static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, |
135 | enum dma_data_direction dir) | |
f13193f5 CL |
136 | { |
137 | struct svcxprt_rdma *rdma = cc->cc_rdma; | |
138 | struct svc_rdma_rw_ctxt *ctxt; | |
139 | ||
140 | while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { | |
141 | list_del(&ctxt->rw_list); | |
142 | ||
143 | rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, | |
144 | rdma->sc_port_num, ctxt->rw_sg_table.sgl, | |
35a30fc3 | 145 | ctxt->rw_nents, dir); |
f13193f5 CL |
146 | svc_rdma_put_rw_ctxt(rdma, ctxt); |
147 | } | |
148 | svc_xprt_put(&rdma->sc_xprt); | |
149 | } | |
150 | ||
151 | /* State for sending a Write or Reply chunk. | |
152 | * - Tracks progress of writing one chunk over all its segments | |
153 | * - Stores arguments for the SGL constructor functions | |
154 | */ | |
155 | struct svc_rdma_write_info { | |
156 | /* write state of this chunk */ | |
157 | unsigned int wi_seg_off; | |
158 | unsigned int wi_seg_no; | |
159 | unsigned int wi_nsegs; | |
160 | __be32 *wi_segs; | |
161 | ||
162 | /* SGL constructor arguments */ | |
163 | struct xdr_buf *wi_xdr; | |
164 | unsigned char *wi_base; | |
165 | unsigned int wi_next_off; | |
166 | ||
167 | struct svc_rdma_chunk_ctxt wi_cc; | |
168 | }; | |
169 | ||
170 | static struct svc_rdma_write_info * | |
171 | svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) | |
172 | { | |
173 | struct svc_rdma_write_info *info; | |
174 | ||
175 | info = kmalloc(sizeof(*info), GFP_KERNEL); | |
176 | if (!info) | |
177 | return info; | |
178 | ||
179 | info->wi_seg_off = 0; | |
180 | info->wi_seg_no = 0; | |
181 | info->wi_nsegs = be32_to_cpup(++chunk); | |
182 | info->wi_segs = ++chunk; | |
35a30fc3 | 183 | svc_rdma_cc_init(rdma, &info->wi_cc); |
026d958b | 184 | info->wi_cc.cc_cqe.done = svc_rdma_write_done; |
f13193f5 CL |
185 | return info; |
186 | } | |
187 | ||
188 | static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) | |
189 | { | |
35a30fc3 | 190 | svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE); |
f13193f5 CL |
191 | kfree(info); |
192 | } | |
193 | ||
194 | /** | |
195 | * svc_rdma_write_done - Write chunk completion | |
196 | * @cq: controlling Completion Queue | |
197 | * @wc: Work Completion | |
198 | * | |
199 | * Pages under I/O are freed by a subsequent Send completion. | |
200 | */ | |
201 | static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) | |
202 | { | |
203 | struct ib_cqe *cqe = wc->wr_cqe; | |
204 | struct svc_rdma_chunk_ctxt *cc = | |
205 | container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); | |
206 | struct svcxprt_rdma *rdma = cc->cc_rdma; | |
207 | struct svc_rdma_write_info *info = | |
208 | container_of(cc, struct svc_rdma_write_info, wi_cc); | |
209 | ||
bd2abef3 CL |
210 | trace_svcrdma_wc_write(wc); |
211 | ||
f13193f5 CL |
212 | atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); |
213 | wake_up(&rdma->sc_send_wait); | |
214 | ||
8820bcaa | 215 | if (unlikely(wc->status != IB_WC_SUCCESS)) |
f13193f5 | 216 | set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); |
f13193f5 CL |
217 | |
218 | svc_rdma_write_info_free(info); | |
219 | } | |
220 | ||
026d958b CL |
221 | /* State for pulling a Read chunk. |
222 | */ | |
223 | struct svc_rdma_read_info { | |
ecf85b23 | 224 | struct svc_rdma_recv_ctxt *ri_readctxt; |
026d958b CL |
225 | unsigned int ri_position; |
226 | unsigned int ri_pageno; | |
227 | unsigned int ri_pageoff; | |
228 | unsigned int ri_chunklen; | |
229 | ||
230 | struct svc_rdma_chunk_ctxt ri_cc; | |
231 | }; | |
232 | ||
233 | static struct svc_rdma_read_info * | |
234 | svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) | |
235 | { | |
236 | struct svc_rdma_read_info *info; | |
237 | ||
238 | info = kmalloc(sizeof(*info), GFP_KERNEL); | |
239 | if (!info) | |
240 | return info; | |
241 | ||
35a30fc3 | 242 | svc_rdma_cc_init(rdma, &info->ri_cc); |
026d958b CL |
243 | info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done; |
244 | return info; | |
245 | } | |
246 | ||
247 | static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) | |
248 | { | |
35a30fc3 | 249 | svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE); |
026d958b CL |
250 | kfree(info); |
251 | } | |
252 | ||
253 | /** | |
254 | * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx | |
255 | * @cq: controlling Completion Queue | |
256 | * @wc: Work Completion | |
257 | * | |
258 | */ | |
259 | static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) | |
260 | { | |
261 | struct ib_cqe *cqe = wc->wr_cqe; | |
262 | struct svc_rdma_chunk_ctxt *cc = | |
263 | container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); | |
264 | struct svcxprt_rdma *rdma = cc->cc_rdma; | |
265 | struct svc_rdma_read_info *info = | |
266 | container_of(cc, struct svc_rdma_read_info, ri_cc); | |
267 | ||
bd2abef3 CL |
268 | trace_svcrdma_wc_read(wc); |
269 | ||
026d958b CL |
270 | atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); |
271 | wake_up(&rdma->sc_send_wait); | |
272 | ||
273 | if (unlikely(wc->status != IB_WC_SUCCESS)) { | |
274 | set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); | |
1e5f4160 | 275 | svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt); |
026d958b CL |
276 | } else { |
277 | spin_lock(&rdma->sc_rq_dto_lock); | |
ecf85b23 | 278 | list_add_tail(&info->ri_readctxt->rc_list, |
026d958b | 279 | &rdma->sc_read_complete_q); |
95503d29 BF |
280 | /* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */ |
281 | set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); | |
026d958b CL |
282 | spin_unlock(&rdma->sc_rq_dto_lock); |
283 | ||
026d958b CL |
284 | svc_xprt_enqueue(&rdma->sc_xprt); |
285 | } | |
286 | ||
287 | svc_rdma_read_info_free(info); | |
288 | } | |
289 | ||
f13193f5 CL |
290 | /* This function sleeps when the transport's Send Queue is congested. |
291 | * | |
292 | * Assumptions: | |
293 | * - If ib_post_send() succeeds, only one completion is expected, | |
294 | * even if one or more WRs are flushed. This is true when posting | |
295 | * an rdma_rw_ctx or when posting a single signaled WR. | |
296 | */ | |
297 | static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) | |
298 | { | |
299 | struct svcxprt_rdma *rdma = cc->cc_rdma; | |
300 | struct svc_xprt *xprt = &rdma->sc_xprt; | |
d34ac5cd BVA |
301 | struct ib_send_wr *first_wr; |
302 | const struct ib_send_wr *bad_wr; | |
f13193f5 CL |
303 | struct list_head *tmp; |
304 | struct ib_cqe *cqe; | |
305 | int ret; | |
306 | ||
107c1d0a CL |
307 | if (cc->cc_sqecount > rdma->sc_sq_depth) |
308 | return -EINVAL; | |
309 | ||
f13193f5 CL |
310 | first_wr = NULL; |
311 | cqe = &cc->cc_cqe; | |
312 | list_for_each(tmp, &cc->cc_rwctxts) { | |
313 | struct svc_rdma_rw_ctxt *ctxt; | |
314 | ||
315 | ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list); | |
316 | first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp, | |
317 | rdma->sc_port_num, cqe, first_wr); | |
318 | cqe = NULL; | |
319 | } | |
320 | ||
321 | do { | |
322 | if (atomic_sub_return(cc->cc_sqecount, | |
323 | &rdma->sc_sq_avail) > 0) { | |
5d85a822 | 324 | ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); |
bd2abef3 CL |
325 | trace_svcrdma_post_rw(&cc->cc_cqe, |
326 | cc->cc_sqecount, ret); | |
f13193f5 CL |
327 | if (ret) |
328 | break; | |
329 | return 0; | |
330 | } | |
331 | ||
bd2abef3 | 332 | trace_svcrdma_sq_full(rdma); |
f13193f5 CL |
333 | atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); |
334 | wait_event(rdma->sc_send_wait, | |
335 | atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); | |
bd2abef3 | 336 | trace_svcrdma_sq_retry(rdma); |
f13193f5 CL |
337 | } while (1); |
338 | ||
f13193f5 CL |
339 | set_bit(XPT_CLOSE, &xprt->xpt_flags); |
340 | ||
341 | /* If even one was posted, there will be a completion. */ | |
342 | if (bad_wr != first_wr) | |
343 | return 0; | |
344 | ||
345 | atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); | |
346 | wake_up(&rdma->sc_send_wait); | |
347 | return -ENOTCONN; | |
348 | } | |
349 | ||
350 | /* Build and DMA-map an SGL that covers one kvec in an xdr_buf | |
351 | */ | |
352 | static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, | |
353 | unsigned int len, | |
354 | struct svc_rdma_rw_ctxt *ctxt) | |
355 | { | |
356 | struct scatterlist *sg = ctxt->rw_sg_table.sgl; | |
357 | ||
358 | sg_set_buf(&sg[0], info->wi_base, len); | |
359 | info->wi_base += len; | |
360 | ||
361 | ctxt->rw_nents = 1; | |
362 | } | |
363 | ||
364 | /* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. | |
365 | */ | |
366 | static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, | |
367 | unsigned int remaining, | |
368 | struct svc_rdma_rw_ctxt *ctxt) | |
369 | { | |
370 | unsigned int sge_no, sge_bytes, page_off, page_no; | |
371 | struct xdr_buf *xdr = info->wi_xdr; | |
372 | struct scatterlist *sg; | |
373 | struct page **page; | |
374 | ||
91b022ec CL |
375 | page_off = info->wi_next_off + xdr->page_base; |
376 | page_no = page_off >> PAGE_SHIFT; | |
377 | page_off = offset_in_page(page_off); | |
f13193f5 CL |
378 | page = xdr->pages + page_no; |
379 | info->wi_next_off += remaining; | |
380 | sg = ctxt->rw_sg_table.sgl; | |
381 | sge_no = 0; | |
382 | do { | |
383 | sge_bytes = min_t(unsigned int, remaining, | |
384 | PAGE_SIZE - page_off); | |
385 | sg_set_page(sg, *page, sge_bytes, page_off); | |
386 | ||
387 | remaining -= sge_bytes; | |
388 | sg = sg_next(sg); | |
389 | page_off = 0; | |
390 | sge_no++; | |
391 | page++; | |
392 | } while (remaining); | |
393 | ||
394 | ctxt->rw_nents = sge_no; | |
395 | } | |
396 | ||
397 | /* Construct RDMA Write WRs to send a portion of an xdr_buf containing | |
398 | * an RPC Reply. | |
399 | */ | |
400 | static int | |
401 | svc_rdma_build_writes(struct svc_rdma_write_info *info, | |
402 | void (*constructor)(struct svc_rdma_write_info *info, | |
403 | unsigned int len, | |
404 | struct svc_rdma_rw_ctxt *ctxt), | |
405 | unsigned int remaining) | |
406 | { | |
407 | struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; | |
408 | struct svcxprt_rdma *rdma = cc->cc_rdma; | |
409 | struct svc_rdma_rw_ctxt *ctxt; | |
410 | __be32 *seg; | |
411 | int ret; | |
412 | ||
f13193f5 CL |
413 | seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; |
414 | do { | |
415 | unsigned int write_len; | |
416 | u32 seg_length, seg_handle; | |
417 | u64 seg_offset; | |
418 | ||
419 | if (info->wi_seg_no >= info->wi_nsegs) | |
420 | goto out_overflow; | |
421 | ||
422 | seg_handle = be32_to_cpup(seg); | |
423 | seg_length = be32_to_cpup(seg + 1); | |
424 | xdr_decode_hyper(seg + 2, &seg_offset); | |
425 | seg_offset += info->wi_seg_off; | |
426 | ||
427 | write_len = min(remaining, seg_length - info->wi_seg_off); | |
428 | ctxt = svc_rdma_get_rw_ctxt(rdma, | |
429 | (write_len >> PAGE_SHIFT) + 2); | |
430 | if (!ctxt) | |
431 | goto out_noctx; | |
432 | ||
433 | constructor(info, write_len, ctxt); | |
434 | ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, | |
435 | rdma->sc_port_num, ctxt->rw_sg_table.sgl, | |
436 | ctxt->rw_nents, 0, seg_offset, | |
437 | seg_handle, DMA_TO_DEVICE); | |
438 | if (ret < 0) | |
439 | goto out_initerr; | |
440 | ||
98895edb | 441 | trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset); |
f13193f5 CL |
442 | list_add(&ctxt->rw_list, &cc->cc_rwctxts); |
443 | cc->cc_sqecount += ret; | |
444 | if (write_len == seg_length - info->wi_seg_off) { | |
445 | seg += 4; | |
446 | info->wi_seg_no++; | |
447 | info->wi_seg_off = 0; | |
448 | } else { | |
449 | info->wi_seg_off += write_len; | |
450 | } | |
451 | remaining -= write_len; | |
452 | } while (remaining); | |
453 | ||
454 | return 0; | |
455 | ||
456 | out_overflow: | |
457 | dprintk("svcrdma: inadequate space in Write chunk (%u)\n", | |
458 | info->wi_nsegs); | |
459 | return -E2BIG; | |
460 | ||
461 | out_noctx: | |
462 | dprintk("svcrdma: no R/W ctxs available\n"); | |
463 | return -ENOMEM; | |
464 | ||
465 | out_initerr: | |
466 | svc_rdma_put_rw_ctxt(rdma, ctxt); | |
bd2abef3 | 467 | trace_svcrdma_dma_map_rwctx(rdma, ret); |
f13193f5 CL |
468 | return -EIO; |
469 | } | |
470 | ||
471 | /* Send one of an xdr_buf's kvecs by itself. To send a Reply | |
472 | * chunk, the whole RPC Reply is written back to the client. | |
473 | * This function writes either the head or tail of the xdr_buf | |
474 | * containing the Reply. | |
475 | */ | |
476 | static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, | |
477 | struct kvec *vec) | |
478 | { | |
479 | info->wi_base = vec->iov_base; | |
480 | return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, | |
481 | vec->iov_len); | |
482 | } | |
483 | ||
484 | /* Send an xdr_buf's page list by itself. A Write chunk is | |
485 | * just the page list. a Reply chunk is the head, page list, | |
486 | * and tail. This function is shared between the two types | |
487 | * of chunk. | |
488 | */ | |
489 | static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, | |
490 | struct xdr_buf *xdr) | |
491 | { | |
492 | info->wi_xdr = xdr; | |
493 | info->wi_next_off = 0; | |
494 | return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, | |
495 | xdr->page_len); | |
496 | } | |
497 | ||
498 | /** | |
499 | * svc_rdma_send_write_chunk - Write all segments in a Write chunk | |
500 | * @rdma: controlling RDMA transport | |
501 | * @wr_ch: Write chunk provided by client | |
502 | * @xdr: xdr_buf containing the data payload | |
503 | * | |
504 | * Returns a non-negative number of bytes the chunk consumed, or | |
505 | * %-E2BIG if the payload was larger than the Write chunk, | |
107c1d0a | 506 | * %-EINVAL if client provided too many segments, |
f13193f5 CL |
507 | * %-ENOMEM if rdma_rw context pool was exhausted, |
508 | * %-ENOTCONN if posting failed (connection is lost), | |
509 | * %-EIO if rdma_rw initialization failed (DMA mapping, etc). | |
510 | */ | |
511 | int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, | |
512 | struct xdr_buf *xdr) | |
513 | { | |
514 | struct svc_rdma_write_info *info; | |
515 | int ret; | |
516 | ||
517 | if (!xdr->page_len) | |
518 | return 0; | |
519 | ||
520 | info = svc_rdma_write_info_alloc(rdma, wr_ch); | |
521 | if (!info) | |
522 | return -ENOMEM; | |
523 | ||
524 | ret = svc_rdma_send_xdr_pagelist(info, xdr); | |
525 | if (ret < 0) | |
526 | goto out_err; | |
527 | ||
528 | ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); | |
529 | if (ret < 0) | |
530 | goto out_err; | |
98895edb CL |
531 | |
532 | trace_svcrdma_encode_write(xdr->page_len); | |
f13193f5 CL |
533 | return xdr->page_len; |
534 | ||
535 | out_err: | |
536 | svc_rdma_write_info_free(info); | |
537 | return ret; | |
538 | } | |
539 | ||
540 | /** | |
541 | * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk | |
542 | * @rdma: controlling RDMA transport | |
543 | * @rp_ch: Reply chunk provided by client | |
544 | * @writelist: true if client provided a Write list | |
545 | * @xdr: xdr_buf containing an RPC Reply | |
546 | * | |
547 | * Returns a non-negative number of bytes the chunk consumed, or | |
548 | * %-E2BIG if the payload was larger than the Reply chunk, | |
107c1d0a | 549 | * %-EINVAL if client provided too many segments, |
f13193f5 CL |
550 | * %-ENOMEM if rdma_rw context pool was exhausted, |
551 | * %-ENOTCONN if posting failed (connection is lost), | |
552 | * %-EIO if rdma_rw initialization failed (DMA mapping, etc). | |
553 | */ | |
554 | int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, | |
555 | bool writelist, struct xdr_buf *xdr) | |
556 | { | |
557 | struct svc_rdma_write_info *info; | |
558 | int consumed, ret; | |
559 | ||
560 | info = svc_rdma_write_info_alloc(rdma, rp_ch); | |
561 | if (!info) | |
562 | return -ENOMEM; | |
563 | ||
564 | ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]); | |
565 | if (ret < 0) | |
566 | goto out_err; | |
567 | consumed = xdr->head[0].iov_len; | |
568 | ||
569 | /* Send the page list in the Reply chunk only if the | |
570 | * client did not provide Write chunks. | |
571 | */ | |
572 | if (!writelist && xdr->page_len) { | |
573 | ret = svc_rdma_send_xdr_pagelist(info, xdr); | |
574 | if (ret < 0) | |
575 | goto out_err; | |
576 | consumed += xdr->page_len; | |
577 | } | |
578 | ||
579 | if (xdr->tail[0].iov_len) { | |
580 | ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]); | |
581 | if (ret < 0) | |
582 | goto out_err; | |
583 | consumed += xdr->tail[0].iov_len; | |
584 | } | |
585 | ||
586 | ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); | |
587 | if (ret < 0) | |
588 | goto out_err; | |
98895edb CL |
589 | |
590 | trace_svcrdma_encode_reply(consumed); | |
f13193f5 CL |
591 | return consumed; |
592 | ||
593 | out_err: | |
594 | svc_rdma_write_info_free(info); | |
595 | return ret; | |
596 | } | |
026d958b CL |
597 | |
598 | static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, | |
599 | struct svc_rqst *rqstp, | |
600 | u32 rkey, u32 len, u64 offset) | |
601 | { | |
ecf85b23 | 602 | struct svc_rdma_recv_ctxt *head = info->ri_readctxt; |
026d958b CL |
603 | struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; |
604 | struct svc_rdma_rw_ctxt *ctxt; | |
605 | unsigned int sge_no, seg_len; | |
606 | struct scatterlist *sg; | |
607 | int ret; | |
608 | ||
609 | sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; | |
610 | ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); | |
611 | if (!ctxt) | |
612 | goto out_noctx; | |
613 | ctxt->rw_nents = sge_no; | |
614 | ||
026d958b CL |
615 | sg = ctxt->rw_sg_table.sgl; |
616 | for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { | |
617 | seg_len = min_t(unsigned int, len, | |
618 | PAGE_SIZE - info->ri_pageoff); | |
619 | ||
ecf85b23 | 620 | head->rc_arg.pages[info->ri_pageno] = |
026d958b CL |
621 | rqstp->rq_pages[info->ri_pageno]; |
622 | if (!info->ri_pageoff) | |
ecf85b23 | 623 | head->rc_page_count++; |
026d958b CL |
624 | |
625 | sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], | |
626 | seg_len, info->ri_pageoff); | |
627 | sg = sg_next(sg); | |
628 | ||
629 | info->ri_pageoff += seg_len; | |
630 | if (info->ri_pageoff == PAGE_SIZE) { | |
631 | info->ri_pageno++; | |
632 | info->ri_pageoff = 0; | |
633 | } | |
634 | len -= seg_len; | |
635 | ||
636 | /* Safety check */ | |
637 | if (len && | |
638 | &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end) | |
639 | goto out_overrun; | |
640 | } | |
641 | ||
642 | ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp, | |
643 | cc->cc_rdma->sc_port_num, | |
644 | ctxt->rw_sg_table.sgl, ctxt->rw_nents, | |
645 | 0, offset, rkey, DMA_FROM_DEVICE); | |
646 | if (ret < 0) | |
647 | goto out_initerr; | |
648 | ||
649 | list_add(&ctxt->rw_list, &cc->cc_rwctxts); | |
650 | cc->cc_sqecount += ret; | |
651 | return 0; | |
652 | ||
653 | out_noctx: | |
654 | dprintk("svcrdma: no R/W ctxs available\n"); | |
655 | return -ENOMEM; | |
656 | ||
657 | out_overrun: | |
658 | dprintk("svcrdma: request overruns rq_pages\n"); | |
659 | return -EINVAL; | |
660 | ||
661 | out_initerr: | |
bd2abef3 | 662 | trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret); |
026d958b | 663 | svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt); |
026d958b CL |
664 | return -EIO; |
665 | } | |
666 | ||
7075a867 CL |
667 | /* Walk the segments in the Read chunk starting at @p and construct |
668 | * RDMA Read operations to pull the chunk to the server. | |
669 | */ | |
026d958b CL |
670 | static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, |
671 | struct svc_rdma_read_info *info, | |
672 | __be32 *p) | |
673 | { | |
07d0ff3b | 674 | unsigned int i; |
026d958b CL |
675 | int ret; |
676 | ||
7075a867 | 677 | ret = -EINVAL; |
026d958b | 678 | info->ri_chunklen = 0; |
7075a867 | 679 | while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { |
026d958b CL |
680 | u32 rs_handle, rs_length; |
681 | u64 rs_offset; | |
682 | ||
026d958b CL |
683 | rs_handle = be32_to_cpup(p++); |
684 | rs_length = be32_to_cpup(p++); | |
685 | p = xdr_decode_hyper(p, &rs_offset); | |
686 | ||
687 | ret = svc_rdma_build_read_segment(info, rqstp, | |
688 | rs_handle, rs_length, | |
689 | rs_offset); | |
690 | if (ret < 0) | |
691 | break; | |
692 | ||
98895edb | 693 | trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset); |
026d958b CL |
694 | info->ri_chunklen += rs_length; |
695 | } | |
696 | ||
07d0ff3b CL |
697 | /* Pages under I/O have been copied to head->rc_pages. |
698 | * Prevent their premature release by svc_xprt_release() . | |
699 | */ | |
700 | for (i = 0; i < info->ri_readctxt->rc_page_count; i++) | |
701 | rqstp->rq_pages[i] = NULL; | |
702 | ||
026d958b CL |
703 | return ret; |
704 | } | |
705 | ||
026d958b | 706 | /* Construct RDMA Reads to pull over a normal Read chunk. The chunk |
ecf85b23 | 707 | * data lands in the page list of head->rc_arg.pages. |
026d958b | 708 | * |
ecf85b23 | 709 | * Currently NFSD does not look at the head->rc_arg.tail[0] iovec. |
026d958b CL |
710 | * Therefore, XDR round-up of the Read chunk and trailing |
711 | * inline content must both be added at the end of the pagelist. | |
712 | */ | |
713 | static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, | |
714 | struct svc_rdma_read_info *info, | |
715 | __be32 *p) | |
716 | { | |
ecf85b23 | 717 | struct svc_rdma_recv_ctxt *head = info->ri_readctxt; |
026d958b CL |
718 | int ret; |
719 | ||
026d958b CL |
720 | ret = svc_rdma_build_read_chunk(rqstp, info, p); |
721 | if (ret < 0) | |
722 | goto out; | |
723 | ||
98895edb CL |
724 | trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position); |
725 | ||
3316f063 CL |
726 | head->rc_hdr_count = 0; |
727 | ||
193bcb7b CL |
728 | /* Split the Receive buffer between the head and tail |
729 | * buffers at Read chunk's position. XDR roundup of the | |
730 | * chunk is not included in either the pagelist or in | |
731 | * the tail. | |
026d958b | 732 | */ |
ecf85b23 CL |
733 | head->rc_arg.tail[0].iov_base = |
734 | head->rc_arg.head[0].iov_base + info->ri_position; | |
735 | head->rc_arg.tail[0].iov_len = | |
736 | head->rc_arg.head[0].iov_len - info->ri_position; | |
737 | head->rc_arg.head[0].iov_len = info->ri_position; | |
026d958b | 738 | |
175e0310 | 739 | /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). |
193bcb7b | 740 | * |
175e0310 CL |
741 | * If the client already rounded up the chunk length, the |
742 | * length does not change. Otherwise, the length of the page | |
743 | * list is increased to include XDR round-up. | |
744 | * | |
745 | * Currently these chunks always start at page offset 0, | |
746 | * thus the rounded-up length never crosses a page boundary. | |
193bcb7b | 747 | */ |
175e0310 | 748 | info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2; |
026d958b | 749 | |
ecf85b23 CL |
750 | head->rc_arg.page_len = info->ri_chunklen; |
751 | head->rc_arg.len += info->ri_chunklen; | |
752 | head->rc_arg.buflen += info->ri_chunklen; | |
026d958b | 753 | |
026d958b CL |
754 | out: |
755 | return ret; | |
756 | } | |
757 | ||
758 | /* Construct RDMA Reads to pull over a Position Zero Read chunk. | |
759 | * The start of the data lands in the first page just after | |
760 | * the Transport header, and the rest lands in the page list of | |
ecf85b23 | 761 | * head->rc_arg.pages. |
026d958b CL |
762 | * |
763 | * Assumptions: | |
764 | * - A PZRC has an XDR-aligned length (no implicit round-up). | |
765 | * - There can be no trailing inline content (IOW, we assume | |
766 | * a PZRC is never sent in an RDMA_MSG message, though it's | |
767 | * allowed by spec). | |
768 | */ | |
769 | static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, | |
770 | struct svc_rdma_read_info *info, | |
771 | __be32 *p) | |
772 | { | |
ecf85b23 | 773 | struct svc_rdma_recv_ctxt *head = info->ri_readctxt; |
026d958b CL |
774 | int ret; |
775 | ||
026d958b CL |
776 | ret = svc_rdma_build_read_chunk(rqstp, info, p); |
777 | if (ret < 0) | |
778 | goto out; | |
779 | ||
98895edb CL |
780 | trace_svcrdma_encode_pzr(info->ri_chunklen); |
781 | ||
ecf85b23 CL |
782 | head->rc_arg.len += info->ri_chunklen; |
783 | head->rc_arg.buflen += info->ri_chunklen; | |
026d958b | 784 | |
3316f063 CL |
785 | head->rc_hdr_count = 1; |
786 | head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]); | |
787 | head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE, | |
788 | info->ri_chunklen); | |
789 | ||
790 | head->rc_arg.page_len = info->ri_chunklen - | |
791 | head->rc_arg.head[0].iov_len; | |
026d958b CL |
792 | |
793 | out: | |
794 | return ret; | |
795 | } | |
796 | ||
797 | /** | |
798 | * svc_rdma_recv_read_chunk - Pull a Read chunk from the client | |
799 | * @rdma: controlling RDMA transport | |
800 | * @rqstp: set of pages to use as Read sink buffers | |
801 | * @head: pages under I/O collect here | |
802 | * @p: pointer to start of Read chunk | |
803 | * | |
804 | * Returns: | |
805 | * %0 if all needed RDMA Reads were posted successfully, | |
806 | * %-EINVAL if client provided too many segments, | |
807 | * %-ENOMEM if rdma_rw context pool was exhausted, | |
808 | * %-ENOTCONN if posting failed (connection is lost), | |
809 | * %-EIO if rdma_rw initialization failed (DMA mapping, etc). | |
810 | * | |
811 | * Assumptions: | |
812 | * - All Read segments in @p have the same Position value. | |
813 | */ | |
814 | int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, | |
ecf85b23 | 815 | struct svc_rdma_recv_ctxt *head, __be32 *p) |
026d958b CL |
816 | { |
817 | struct svc_rdma_read_info *info; | |
026d958b CL |
818 | int ret; |
819 | ||
820 | /* The request (with page list) is constructed in | |
ecf85b23 | 821 | * head->rc_arg. Pages involved with RDMA Read I/O are |
026d958b CL |
822 | * transferred there. |
823 | */ | |
ecf85b23 CL |
824 | head->rc_arg.head[0] = rqstp->rq_arg.head[0]; |
825 | head->rc_arg.tail[0] = rqstp->rq_arg.tail[0]; | |
826 | head->rc_arg.pages = head->rc_pages; | |
827 | head->rc_arg.page_base = 0; | |
828 | head->rc_arg.page_len = 0; | |
829 | head->rc_arg.len = rqstp->rq_arg.len; | |
830 | head->rc_arg.buflen = rqstp->rq_arg.buflen; | |
026d958b CL |
831 | |
832 | info = svc_rdma_read_info_alloc(rdma); | |
833 | if (!info) | |
834 | return -ENOMEM; | |
835 | info->ri_readctxt = head; | |
3316f063 CL |
836 | info->ri_pageno = 0; |
837 | info->ri_pageoff = 0; | |
026d958b CL |
838 | |
839 | info->ri_position = be32_to_cpup(p + 1); | |
840 | if (info->ri_position) | |
841 | ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); | |
842 | else | |
843 | ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); | |
026d958b | 844 | if (ret < 0) |
07d0ff3b | 845 | goto out_err; |
026d958b CL |
846 | |
847 | ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); | |
026d958b | 848 | if (ret < 0) |
07d0ff3b CL |
849 | goto out_err; |
850 | return 0; | |
851 | ||
852 | out_err: | |
853 | svc_rdma_read_info_free(info); | |
026d958b CL |
854 | return ret; |
855 | } |