]>
Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
1dc563a6 | 30 | * Copyright (c) 2011, 2015, Intel Corporation. |
d7e09d03 PT |
31 | */ |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | */ | |
36 | ||
37 | /** Implementation of client-side PortalRPC interfaces */ | |
38 | ||
39 | #define DEBUG_SUBSYSTEM S_RPC | |
40 | ||
e27db149 GKH |
41 | #include "../include/obd_support.h" |
42 | #include "../include/obd_class.h" | |
43 | #include "../include/lustre_lib.h" | |
44 | #include "../include/lustre_ha.h" | |
45 | #include "../include/lustre_import.h" | |
46 | #include "../include/lustre_req_layout.h" | |
d7e09d03 PT |
47 | |
48 | #include "ptlrpc_internal.h" | |
49 | ||
50 | static int ptlrpc_send_new_req(struct ptlrpc_request *req); | |
82a373ae | 51 | static int ptlrpcd_check_work(struct ptlrpc_request *req); |
d7e09d03 PT |
52 | |
53 | /** | |
54 | * Initialize passed in client structure \a cl. | |
55 | */ | |
56 | void ptlrpc_init_client(int req_portal, int rep_portal, char *name, | |
57 | struct ptlrpc_client *cl) | |
58 | { | |
59 | cl->cli_request_portal = req_portal; | |
d0bfef31 CH |
60 | cl->cli_reply_portal = rep_portal; |
61 | cl->cli_name = name; | |
d7e09d03 PT |
62 | } |
63 | EXPORT_SYMBOL(ptlrpc_init_client); | |
64 | ||
65 | /** | |
930cef9a | 66 | * Return PortalRPC connection for remote uud \a uuid |
d7e09d03 PT |
67 | */ |
68 | struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid) | |
69 | { | |
70 | struct ptlrpc_connection *c; | |
d0bfef31 CH |
71 | lnet_nid_t self; |
72 | lnet_process_id_t peer; | |
73 | int err; | |
d7e09d03 | 74 | |
ce1c42ed SW |
75 | /* |
76 | * ptlrpc_uuid_to_peer() initializes its 2nd parameter | |
77 | * before accessing its values. | |
78 | * coverity[uninit_use_in_call] | |
79 | */ | |
d7e09d03 PT |
80 | err = ptlrpc_uuid_to_peer(uuid, &peer, &self); |
81 | if (err != 0) { | |
82 | CNETERR("cannot find peer %s!\n", uuid->uuid); | |
83 | return NULL; | |
84 | } | |
85 | ||
86 | c = ptlrpc_connection_get(peer, self, uuid); | |
87 | if (c) { | |
88 | memcpy(c->c_remote_uuid.uuid, | |
89 | uuid->uuid, sizeof(c->c_remote_uuid.uuid)); | |
90 | } | |
91 | ||
92 | CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); | |
93 | ||
94 | return c; | |
95 | } | |
96 | EXPORT_SYMBOL(ptlrpc_uuid_to_connection); | |
97 | ||
98 | /** | |
99 | * Allocate and initialize new bulk descriptor on the sender. | |
100 | * Returns pointer to the descriptor or NULL on error. | |
101 | */ | |
102 | struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw, | |
103 | unsigned type, unsigned portal) | |
104 | { | |
105 | struct ptlrpc_bulk_desc *desc; | |
106 | int i; | |
107 | ||
9ae10597 JL |
108 | desc = kzalloc(offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]), |
109 | GFP_NOFS); | |
d7e09d03 PT |
110 | if (!desc) |
111 | return NULL; | |
112 | ||
113 | spin_lock_init(&desc->bd_lock); | |
114 | init_waitqueue_head(&desc->bd_waitq); | |
115 | desc->bd_max_iov = npages; | |
116 | desc->bd_iov_count = 0; | |
117 | desc->bd_portal = portal; | |
118 | desc->bd_type = type; | |
119 | desc->bd_md_count = 0; | |
120 | LASSERT(max_brw > 0); | |
121 | desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); | |
ce1c42ed SW |
122 | /* |
123 | * PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this | |
124 | * node. Negotiated ocd_brw_size will always be <= this number. | |
125 | */ | |
d7e09d03 PT |
126 | for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) |
127 | LNetInvalidateHandle(&desc->bd_mds[i]); | |
128 | ||
129 | return desc; | |
130 | } | |
131 | ||
132 | /** | |
133 | * Prepare bulk descriptor for specified outgoing request \a req that | |
134 | * can fit \a npages * pages. \a type is bulk type. \a portal is where | |
135 | * the bulk to be sent. Used on client-side. | |
930cef9a | 136 | * Returns pointer to newly allocated initialized bulk descriptor or NULL on |
d7e09d03 PT |
137 | * error. |
138 | */ | |
139 | struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, | |
140 | unsigned npages, unsigned max_brw, | |
141 | unsigned type, unsigned portal) | |
142 | { | |
143 | struct obd_import *imp = req->rq_import; | |
144 | struct ptlrpc_bulk_desc *desc; | |
145 | ||
d7e09d03 PT |
146 | LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE); |
147 | desc = ptlrpc_new_bulk(npages, max_brw, type, portal); | |
8b382089 | 148 | if (!desc) |
0a3bdb00 | 149 | return NULL; |
d7e09d03 PT |
150 | |
151 | desc->bd_import_generation = req->rq_import_generation; | |
152 | desc->bd_import = class_import_get(imp); | |
153 | desc->bd_req = req; | |
154 | ||
d0bfef31 | 155 | desc->bd_cbid.cbid_fn = client_bulk_callback; |
d7e09d03 PT |
156 | desc->bd_cbid.cbid_arg = desc; |
157 | ||
158 | /* This makes req own desc, and free it when she frees herself */ | |
159 | req->rq_bulk = desc; | |
160 | ||
161 | return desc; | |
162 | } | |
163 | EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); | |
164 | ||
165 | /** | |
166 | * Add a page \a page to the bulk descriptor \a desc. | |
167 | * Data to transfer in the page starts at offset \a pageoffset and | |
168 | * amount of data to transfer from the page is \a len | |
169 | */ | |
170 | void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, | |
171 | struct page *page, int pageoffset, int len, int pin) | |
172 | { | |
173 | LASSERT(desc->bd_iov_count < desc->bd_max_iov); | |
8b382089 | 174 | LASSERT(page); |
d7e09d03 PT |
175 | LASSERT(pageoffset >= 0); |
176 | LASSERT(len > 0); | |
09cbfeaf | 177 | LASSERT(pageoffset + len <= PAGE_SIZE); |
d7e09d03 PT |
178 | |
179 | desc->bd_nob += len; | |
180 | ||
181 | if (pin) | |
09cbfeaf | 182 | get_page(page); |
d7e09d03 PT |
183 | |
184 | ptlrpc_add_bulk_page(desc, page, pageoffset, len); | |
185 | } | |
186 | EXPORT_SYMBOL(__ptlrpc_prep_bulk_page); | |
187 | ||
188 | /** | |
189 | * Uninitialize and free bulk descriptor \a desc. | |
190 | * Works on bulk descriptors both from server and client side. | |
191 | */ | |
192 | void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin) | |
193 | { | |
194 | int i; | |
d7e09d03 | 195 | |
d7e09d03 PT |
196 | LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ |
197 | LASSERT(desc->bd_md_count == 0); /* network hands off */ | |
198 | LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); | |
199 | ||
200 | sptlrpc_enc_pool_put_pages(desc); | |
201 | ||
202 | if (desc->bd_export) | |
203 | class_export_put(desc->bd_export); | |
204 | else | |
205 | class_import_put(desc->bd_import); | |
206 | ||
207 | if (unpin) { | |
7b8633de | 208 | for (i = 0; i < desc->bd_iov_count; i++) |
09cbfeaf | 209 | put_page(desc->bd_iov[i].kiov_page); |
d7e09d03 PT |
210 | } |
211 | ||
9ae10597 | 212 | kfree(desc); |
d7e09d03 PT |
213 | } |
214 | EXPORT_SYMBOL(__ptlrpc_free_bulk); | |
215 | ||
216 | /** | |
217 | * Set server timelimit for this req, i.e. how long are we willing to wait | |
218 | * for reply before timing out this request. | |
219 | */ | |
220 | void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) | |
221 | { | |
222 | __u32 serv_est; | |
223 | int idx; | |
224 | struct imp_at *at; | |
225 | ||
226 | LASSERT(req->rq_import); | |
227 | ||
228 | if (AT_OFF) { | |
ce1c42ed SW |
229 | /* |
230 | * non-AT settings | |
231 | * | |
d7e09d03 PT |
232 | * \a imp_server_timeout means this is reverse import and |
233 | * we send (currently only) ASTs to the client and cannot afford | |
234 | * to wait too long for the reply, otherwise the other client | |
235 | * (because of which we are sending this request) would | |
236 | * timeout waiting for us | |
237 | */ | |
238 | req->rq_timeout = req->rq_import->imp_server_timeout ? | |
239 | obd_timeout / 2 : obd_timeout; | |
240 | } else { | |
241 | at = &req->rq_import->imp_at; | |
242 | idx = import_at_get_index(req->rq_import, | |
243 | req->rq_request_portal); | |
244 | serv_est = at_get(&at->iat_service_estimate[idx]); | |
245 | req->rq_timeout = at_est2timeout(serv_est); | |
246 | } | |
ce1c42ed SW |
247 | /* |
248 | * We could get even fancier here, using history to predict increased | |
249 | * loading... | |
250 | */ | |
d7e09d03 | 251 | |
ce1c42ed SW |
252 | /* |
253 | * Let the server know what this RPC timeout is by putting it in the | |
254 | * reqmsg | |
255 | */ | |
d7e09d03 PT |
256 | lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); |
257 | } | |
258 | EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); | |
259 | ||
260 | /* Adjust max service estimate based on server value */ | |
261 | static void ptlrpc_at_adj_service(struct ptlrpc_request *req, | |
262 | unsigned int serv_est) | |
263 | { | |
264 | int idx; | |
265 | unsigned int oldse; | |
266 | struct imp_at *at; | |
267 | ||
268 | LASSERT(req->rq_import); | |
269 | at = &req->rq_import->imp_at; | |
270 | ||
271 | idx = import_at_get_index(req->rq_import, req->rq_request_portal); | |
ce1c42ed SW |
272 | /* |
273 | * max service estimates are tracked on the server side, | |
274 | * so just keep minimal history here | |
275 | */ | |
d7e09d03 PT |
276 | oldse = at_measured(&at->iat_service_estimate[idx], serv_est); |
277 | if (oldse != 0) | |
2d00bd17 | 278 | CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d has changed from %d to %d\n", |
0ae015be | 279 | req->rq_import->imp_obd->obd_name, req->rq_request_portal, |
d7e09d03 PT |
280 | oldse, at_get(&at->iat_service_estimate[idx])); |
281 | } | |
282 | ||
283 | /* Expected network latency per remote node (secs) */ | |
284 | int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) | |
285 | { | |
286 | return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency); | |
287 | } | |
288 | ||
289 | /* Adjust expected network latency */ | |
290 | static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, | |
291 | unsigned int service_time) | |
292 | { | |
293 | unsigned int nl, oldnl; | |
294 | struct imp_at *at; | |
219e6de6 | 295 | time64_t now = ktime_get_real_seconds(); |
d7e09d03 PT |
296 | |
297 | LASSERT(req->rq_import); | |
06fbc01a LZ |
298 | |
299 | if (service_time > now - req->rq_sent + 3) { | |
ce1c42ed SW |
300 | /* |
301 | * bz16408, however, this can also happen if early reply | |
06fbc01a LZ |
302 | * is lost and client RPC is expired and resent, early reply |
303 | * or reply of original RPC can still be fit in reply buffer | |
304 | * of resent RPC, now client is measuring time from the | |
305 | * resent time, but server sent back service time of original | |
306 | * RPC. | |
307 | */ | |
308 | CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ? | |
309 | D_ADAPTTO : D_WARNING, | |
310 | "Reported service time %u > total measured time " | |
311 | CFS_DURATION_T"\n", service_time, | |
219e6de6 | 312 | (long)(now - req->rq_sent)); |
06fbc01a LZ |
313 | return; |
314 | } | |
d7e09d03 PT |
315 | |
316 | /* Network latency is total time less server processing time */ | |
06fbc01a LZ |
317 | nl = max_t(int, now - req->rq_sent - |
318 | service_time, 0) + 1; /* st rounding */ | |
319 | at = &req->rq_import->imp_at; | |
d7e09d03 PT |
320 | |
321 | oldnl = at_measured(&at->iat_net_latency, nl); | |
322 | if (oldnl != 0) | |
2d00bd17 | 323 | CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) has changed from %d to %d\n", |
d7e09d03 PT |
324 | req->rq_import->imp_obd->obd_name, |
325 | obd_uuid2str( | |
326 | &req->rq_import->imp_connection->c_remote_uuid), | |
327 | oldnl, at_get(&at->iat_net_latency)); | |
328 | } | |
329 | ||
330 | static int unpack_reply(struct ptlrpc_request *req) | |
331 | { | |
332 | int rc; | |
333 | ||
334 | if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { | |
335 | rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); | |
336 | if (rc) { | |
337 | DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); | |
fbe7c6c7 | 338 | return -EPROTO; |
d7e09d03 PT |
339 | } |
340 | } | |
341 | ||
342 | rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); | |
343 | if (rc) { | |
344 | DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); | |
fbe7c6c7 | 345 | return -EPROTO; |
d7e09d03 PT |
346 | } |
347 | return 0; | |
348 | } | |
349 | ||
350 | /** | |
351 | * Handle an early reply message, called with the rq_lock held. | |
352 | * If anything goes wrong just ignore it - same as if it never happened | |
353 | */ | |
354 | static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) | |
a161de86 | 355 | __must_hold(&req->rq_lock) |
d7e09d03 PT |
356 | { |
357 | struct ptlrpc_request *early_req; | |
219e6de6 | 358 | time64_t olddl; |
d0bfef31 | 359 | int rc; |
d7e09d03 PT |
360 | |
361 | req->rq_early = 0; | |
362 | spin_unlock(&req->rq_lock); | |
363 | ||
364 | rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); | |
365 | if (rc) { | |
366 | spin_lock(&req->rq_lock); | |
0a3bdb00 | 367 | return rc; |
d7e09d03 PT |
368 | } |
369 | ||
370 | rc = unpack_reply(early_req); | |
371 | if (rc == 0) { | |
372 | /* Expecting to increase the service time estimate here */ | |
373 | ptlrpc_at_adj_service(req, | |
374 | lustre_msg_get_timeout(early_req->rq_repmsg)); | |
375 | ptlrpc_at_adj_net_latency(req, | |
376 | lustre_msg_get_service_time(early_req->rq_repmsg)); | |
377 | } | |
378 | ||
379 | sptlrpc_cli_finish_early_reply(early_req); | |
380 | ||
381 | if (rc != 0) { | |
382 | spin_lock(&req->rq_lock); | |
0a3bdb00 | 383 | return rc; |
d7e09d03 PT |
384 | } |
385 | ||
386 | /* Adjust the local timeout for this req */ | |
387 | ptlrpc_at_set_req_timeout(req); | |
388 | ||
389 | spin_lock(&req->rq_lock); | |
390 | olddl = req->rq_deadline; | |
ce1c42ed SW |
391 | /* |
392 | * server assumes it now has rq_timeout from when it sent the | |
393 | * early reply, so client should give it at least that long. | |
394 | */ | |
219e6de6 | 395 | req->rq_deadline = ktime_get_real_seconds() + req->rq_timeout + |
d7e09d03 PT |
396 | ptlrpc_at_get_net_latency(req); |
397 | ||
398 | DEBUG_REQ(D_ADAPTTO, req, | |
219e6de6 | 399 | "Early reply #%d, new deadline in %lds (%lds)", |
2d00bd17 | 400 | req->rq_early_count, |
219e6de6 AB |
401 | (long)(req->rq_deadline - ktime_get_real_seconds()), |
402 | (long)(req->rq_deadline - olddl)); | |
d7e09d03 | 403 | |
0a3bdb00 | 404 | return rc; |
d7e09d03 PT |
405 | } |
406 | ||
7257f9d1 | 407 | static struct kmem_cache *request_cache; |
35b2e1b7 AS |
408 | |
409 | int ptlrpc_request_cache_init(void) | |
410 | { | |
411 | request_cache = kmem_cache_create("ptlrpc_cache", | |
412 | sizeof(struct ptlrpc_request), | |
413 | 0, SLAB_HWCACHE_ALIGN, NULL); | |
8b382089 | 414 | return !request_cache ? -ENOMEM : 0; |
35b2e1b7 AS |
415 | } |
416 | ||
417 | void ptlrpc_request_cache_fini(void) | |
418 | { | |
419 | kmem_cache_destroy(request_cache); | |
420 | } | |
421 | ||
f1c571dc | 422 | struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags) |
35b2e1b7 AS |
423 | { |
424 | struct ptlrpc_request *req; | |
425 | ||
1c147c83 | 426 | req = kmem_cache_zalloc(request_cache, flags); |
35b2e1b7 AS |
427 | return req; |
428 | } | |
429 | ||
430 | void ptlrpc_request_cache_free(struct ptlrpc_request *req) | |
431 | { | |
50d30362 | 432 | kmem_cache_free(request_cache, req); |
35b2e1b7 AS |
433 | } |
434 | ||
d7e09d03 PT |
435 | /** |
436 | * Wind down request pool \a pool. | |
437 | * Frees all requests from the pool too | |
438 | */ | |
439 | void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) | |
440 | { | |
441 | struct list_head *l, *tmp; | |
442 | struct ptlrpc_request *req; | |
443 | ||
d7e09d03 PT |
444 | spin_lock(&pool->prp_lock); |
445 | list_for_each_safe(l, tmp, &pool->prp_req_list) { | |
446 | req = list_entry(l, struct ptlrpc_request, rq_list); | |
447 | list_del(&req->rq_list); | |
448 | LASSERT(req->rq_reqbuf); | |
449 | LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); | |
ee0ec194 | 450 | kvfree(req->rq_reqbuf); |
35b2e1b7 | 451 | ptlrpc_request_cache_free(req); |
d7e09d03 PT |
452 | } |
453 | spin_unlock(&pool->prp_lock); | |
9ae10597 | 454 | kfree(pool); |
d7e09d03 PT |
455 | } |
456 | EXPORT_SYMBOL(ptlrpc_free_rq_pool); | |
457 | ||
458 | /** | |
459 | * Allocates, initializes and adds \a num_rq requests to the pool \a pool | |
460 | */ | |
aefd9d71 | 461 | int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) |
d7e09d03 PT |
462 | { |
463 | int i; | |
464 | int size = 1; | |
465 | ||
466 | while (size < pool->prp_rq_size) | |
467 | size <<= 1; | |
468 | ||
469 | LASSERTF(list_empty(&pool->prp_req_list) || | |
470 | size == pool->prp_rq_size, | |
2d00bd17 JP |
471 | "Trying to change pool size with nonempty pool from %d to %d bytes\n", |
472 | pool->prp_rq_size, size); | |
d7e09d03 PT |
473 | |
474 | spin_lock(&pool->prp_lock); | |
475 | pool->prp_rq_size = size; | |
476 | for (i = 0; i < num_rq; i++) { | |
477 | struct ptlrpc_request *req; | |
478 | struct lustre_msg *msg; | |
479 | ||
480 | spin_unlock(&pool->prp_lock); | |
0be19afa | 481 | req = ptlrpc_request_cache_alloc(GFP_NOFS); |
d7e09d03 | 482 | if (!req) |
aefd9d71 | 483 | return i; |
ee0ec194 | 484 | msg = libcfs_kvzalloc(size, GFP_NOFS); |
d7e09d03 | 485 | if (!msg) { |
35b2e1b7 | 486 | ptlrpc_request_cache_free(req); |
aefd9d71 | 487 | return i; |
d7e09d03 PT |
488 | } |
489 | req->rq_reqbuf = msg; | |
490 | req->rq_reqbuf_len = size; | |
491 | req->rq_pool = pool; | |
492 | spin_lock(&pool->prp_lock); | |
493 | list_add_tail(&req->rq_list, &pool->prp_req_list); | |
494 | } | |
495 | spin_unlock(&pool->prp_lock); | |
aefd9d71 | 496 | return num_rq; |
d7e09d03 PT |
497 | } |
498 | EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); | |
499 | ||
500 | /** | |
501 | * Create and initialize new request pool with given attributes: | |
502 | * \a num_rq - initial number of requests to create for the pool | |
503 | * \a msgsize - maximum message size possible for requests in thid pool | |
504 | * \a populate_pool - function to be called when more requests need to be added | |
505 | * to the pool | |
506 | * Returns pointer to newly created pool or NULL on error. | |
507 | */ | |
508 | struct ptlrpc_request_pool * | |
509 | ptlrpc_init_rq_pool(int num_rq, int msgsize, | |
aefd9d71 | 510 | int (*populate_pool)(struct ptlrpc_request_pool *, int)) |
d7e09d03 PT |
511 | { |
512 | struct ptlrpc_request_pool *pool; | |
513 | ||
9ae10597 | 514 | pool = kzalloc(sizeof(struct ptlrpc_request_pool), GFP_NOFS); |
d7e09d03 PT |
515 | if (!pool) |
516 | return NULL; | |
517 | ||
ce1c42ed SW |
518 | /* |
519 | * Request next power of two for the allocation, because internally | |
520 | * kernel would do exactly this | |
521 | */ | |
d7e09d03 PT |
522 | |
523 | spin_lock_init(&pool->prp_lock); | |
524 | INIT_LIST_HEAD(&pool->prp_req_list); | |
525 | pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; | |
526 | pool->prp_populate = populate_pool; | |
527 | ||
528 | populate_pool(pool, num_rq); | |
529 | ||
d7e09d03 PT |
530 | return pool; |
531 | } | |
532 | EXPORT_SYMBOL(ptlrpc_init_rq_pool); | |
533 | ||
534 | /** | |
535 | * Fetches one request from pool \a pool | |
536 | */ | |
537 | static struct ptlrpc_request * | |
538 | ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) | |
539 | { | |
540 | struct ptlrpc_request *request; | |
541 | struct lustre_msg *reqbuf; | |
542 | ||
543 | if (!pool) | |
544 | return NULL; | |
545 | ||
546 | spin_lock(&pool->prp_lock); | |
547 | ||
ce1c42ed SW |
548 | /* |
549 | * See if we have anything in a pool, and bail out if nothing, | |
d7e09d03 PT |
550 | * in writeout path, where this matters, this is safe to do, because |
551 | * nothing is lost in this case, and when some in-flight requests | |
ce1c42ed SW |
552 | * complete, this code will be called again. |
553 | */ | |
d7e09d03 PT |
554 | if (unlikely(list_empty(&pool->prp_req_list))) { |
555 | spin_unlock(&pool->prp_lock); | |
556 | return NULL; | |
557 | } | |
558 | ||
559 | request = list_entry(pool->prp_req_list.next, struct ptlrpc_request, | |
30c0aa39 | 560 | rq_list); |
d7e09d03 PT |
561 | list_del_init(&request->rq_list); |
562 | spin_unlock(&pool->prp_lock); | |
563 | ||
564 | LASSERT(request->rq_reqbuf); | |
565 | LASSERT(request->rq_pool); | |
566 | ||
567 | reqbuf = request->rq_reqbuf; | |
568 | memset(request, 0, sizeof(*request)); | |
569 | request->rq_reqbuf = reqbuf; | |
570 | request->rq_reqbuf_len = pool->prp_rq_size; | |
571 | request->rq_pool = pool; | |
572 | ||
573 | return request; | |
574 | } | |
575 | ||
576 | /** | |
577 | * Returns freed \a request to pool. | |
578 | */ | |
579 | static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) | |
580 | { | |
581 | struct ptlrpc_request_pool *pool = request->rq_pool; | |
582 | ||
583 | spin_lock(&pool->prp_lock); | |
584 | LASSERT(list_empty(&request->rq_list)); | |
585 | LASSERT(!request->rq_receiving_reply); | |
586 | list_add_tail(&request->rq_list, &pool->prp_req_list); | |
587 | spin_unlock(&pool->prp_lock); | |
588 | } | |
589 | ||
590 | static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, | |
591 | __u32 version, int opcode, | |
592 | int count, __u32 *lengths, char **bufs, | |
593 | struct ptlrpc_cli_ctx *ctx) | |
594 | { | |
d0bfef31 CH |
595 | struct obd_import *imp = request->rq_import; |
596 | int rc; | |
d7e09d03 PT |
597 | |
598 | if (unlikely(ctx)) | |
599 | request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); | |
600 | else { | |
601 | rc = sptlrpc_req_get_ctx(request); | |
602 | if (rc) | |
a9b3e8f3 | 603 | goto out_free; |
d7e09d03 PT |
604 | } |
605 | ||
606 | sptlrpc_req_set_flavor(request, opcode); | |
607 | ||
608 | rc = lustre_pack_request(request, imp->imp_msg_magic, count, | |
609 | lengths, bufs); | |
610 | if (rc) { | |
611 | LASSERT(!request->rq_pool); | |
a9b3e8f3 | 612 | goto out_ctx; |
d7e09d03 PT |
613 | } |
614 | ||
615 | lustre_msg_add_version(request->rq_reqmsg, version); | |
616 | request->rq_send_state = LUSTRE_IMP_FULL; | |
617 | request->rq_type = PTL_RPC_MSG_REQUEST; | |
618 | request->rq_export = NULL; | |
619 | ||
d0bfef31 | 620 | request->rq_req_cbid.cbid_fn = request_out_callback; |
d7e09d03 PT |
621 | request->rq_req_cbid.cbid_arg = request; |
622 | ||
d0bfef31 | 623 | request->rq_reply_cbid.cbid_fn = reply_in_callback; |
d7e09d03 PT |
624 | request->rq_reply_cbid.cbid_arg = request; |
625 | ||
626 | request->rq_reply_deadline = 0; | |
627 | request->rq_phase = RQ_PHASE_NEW; | |
628 | request->rq_next_phase = RQ_PHASE_UNDEFINED; | |
629 | ||
630 | request->rq_request_portal = imp->imp_client->cli_request_portal; | |
631 | request->rq_reply_portal = imp->imp_client->cli_reply_portal; | |
632 | ||
633 | ptlrpc_at_set_req_timeout(request); | |
634 | ||
635 | spin_lock_init(&request->rq_lock); | |
636 | INIT_LIST_HEAD(&request->rq_list); | |
637 | INIT_LIST_HEAD(&request->rq_timed_list); | |
638 | INIT_LIST_HEAD(&request->rq_replay_list); | |
639 | INIT_LIST_HEAD(&request->rq_ctx_chain); | |
640 | INIT_LIST_HEAD(&request->rq_set_chain); | |
641 | INIT_LIST_HEAD(&request->rq_history_list); | |
642 | INIT_LIST_HEAD(&request->rq_exp_list); | |
643 | init_waitqueue_head(&request->rq_reply_waitq); | |
644 | init_waitqueue_head(&request->rq_set_waitq); | |
645 | request->rq_xid = ptlrpc_next_xid(); | |
646 | atomic_set(&request->rq_refcount, 1); | |
647 | ||
648 | lustre_msg_set_opc(request->rq_reqmsg, opcode); | |
649 | ||
0a3bdb00 | 650 | return 0; |
d7e09d03 PT |
651 | out_ctx: |
652 | sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); | |
653 | out_free: | |
654 | class_import_put(imp); | |
655 | return rc; | |
656 | } | |
657 | ||
658 | int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, | |
659 | __u32 version, int opcode, char **bufs, | |
660 | struct ptlrpc_cli_ctx *ctx) | |
661 | { | |
662 | int count; | |
663 | ||
664 | count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); | |
665 | return __ptlrpc_request_bufs_pack(request, version, opcode, count, | |
666 | request->rq_pill.rc_area[RCL_CLIENT], | |
667 | bufs, ctx); | |
668 | } | |
669 | EXPORT_SYMBOL(ptlrpc_request_bufs_pack); | |
670 | ||
671 | /** | |
672 | * Pack request buffers for network transfer, performing necessary encryption | |
673 | * steps if necessary. | |
674 | */ | |
675 | int ptlrpc_request_pack(struct ptlrpc_request *request, | |
676 | __u32 version, int opcode) | |
677 | { | |
678 | int rc; | |
50ffcb7e | 679 | |
d7e09d03 PT |
680 | rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); |
681 | if (rc) | |
682 | return rc; | |
683 | ||
ce1c42ed SW |
684 | /* |
685 | * For some old 1.8 clients (< 1.8.7), they will LASSERT the size of | |
d7e09d03 | 686 | * ptlrpc_body sent from server equal to local ptlrpc_body size, so we |
930cef9a | 687 | * have to send old ptlrpc_body to keep interoperability with these |
d7e09d03 PT |
688 | * clients. |
689 | * | |
690 | * Only three kinds of server->client RPCs so far: | |
691 | * - LDLM_BL_CALLBACK | |
692 | * - LDLM_CP_CALLBACK | |
693 | * - LDLM_GL_CALLBACK | |
694 | * | |
930cef9a | 695 | * XXX This should be removed whenever we drop the interoperability with |
d7e09d03 PT |
696 | * the these old clients. |
697 | */ | |
698 | if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK || | |
699 | opcode == LDLM_GL_CALLBACK) | |
700 | req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY, | |
701 | sizeof(struct ptlrpc_body_v2), RCL_CLIENT); | |
702 | ||
703 | return rc; | |
704 | } | |
705 | EXPORT_SYMBOL(ptlrpc_request_pack); | |
706 | ||
707 | /** | |
708 | * Helper function to allocate new request on import \a imp | |
709 | * and possibly using existing request from pool \a pool if provided. | |
710 | * Returns allocated request structure with import field filled or | |
711 | * NULL on error. | |
712 | */ | |
713 | static inline | |
714 | struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, | |
715 | struct ptlrpc_request_pool *pool) | |
716 | { | |
b5504406 | 717 | struct ptlrpc_request *request; |
d7e09d03 | 718 | |
aefd9d71 | 719 | request = ptlrpc_request_cache_alloc(GFP_NOFS); |
d7e09d03 | 720 | |
aefd9d71 LX |
721 | if (!request && pool) |
722 | request = ptlrpc_prep_req_from_pool(pool); | |
d7e09d03 PT |
723 | |
724 | if (request) { | |
19b2056f | 725 | LASSERTF((unsigned long)imp > 0x1000, "%p\n", imp); |
d7e09d03 | 726 | LASSERT(imp != LP_POISON); |
19b2056f | 727 | LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p\n", |
30c0aa39 | 728 | imp->imp_client); |
d7e09d03 PT |
729 | LASSERT(imp->imp_client != LP_POISON); |
730 | ||
731 | request->rq_import = class_import_get(imp); | |
732 | } else { | |
733 | CERROR("request allocation out of memory\n"); | |
734 | } | |
735 | ||
736 | return request; | |
737 | } | |
738 | ||
739 | /** | |
740 | * Helper function for creating a request. | |
930cef9a | 741 | * Calls __ptlrpc_request_alloc to allocate new request structure and inits |
d7e09d03 PT |
742 | * buffer structures according to capsule template \a format. |
743 | * Returns allocated request structure pointer or NULL on error. | |
744 | */ | |
745 | static struct ptlrpc_request * | |
746 | ptlrpc_request_alloc_internal(struct obd_import *imp, | |
0028d585 | 747 | struct ptlrpc_request_pool *pool, |
d7e09d03 PT |
748 | const struct req_format *format) |
749 | { | |
750 | struct ptlrpc_request *request; | |
751 | ||
752 | request = __ptlrpc_request_alloc(imp, pool); | |
8b382089 | 753 | if (!request) |
d7e09d03 PT |
754 | return NULL; |
755 | ||
756 | req_capsule_init(&request->rq_pill, request, RCL_CLIENT); | |
757 | req_capsule_set(&request->rq_pill, format); | |
758 | return request; | |
759 | } | |
760 | ||
761 | /** | |
762 | * Allocate new request structure for import \a imp and initialize its | |
763 | * buffer structure according to capsule template \a format. | |
764 | */ | |
765 | struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, | |
766 | const struct req_format *format) | |
767 | { | |
768 | return ptlrpc_request_alloc_internal(imp, NULL, format); | |
769 | } | |
770 | EXPORT_SYMBOL(ptlrpc_request_alloc); | |
771 | ||
/**
 * Allocate a new request structure for import \a imp, with pool \a pool
 * available as the source of preallocated requests, and set up its buffer
 * structure according to capsule template \a format.
 * Returns the request or NULL on error.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
						 struct ptlrpc_request_pool *pool,
						 const struct req_format *format)
{
	struct ptlrpc_request *req;

	req = ptlrpc_request_alloc_internal(imp, pool, format);

	return req;
}
782 | EXPORT_SYMBOL(ptlrpc_request_alloc_pool); | |
783 | ||
784 | /** | |
785 | * For requests not from pool, free memory of the request structure. | |
786 | * For requests obtained from a pool earlier, return request back to pool. | |
787 | */ | |
788 | void ptlrpc_request_free(struct ptlrpc_request *request) | |
789 | { | |
790 | if (request->rq_pool) | |
791 | __ptlrpc_free_req_to_pool(request); | |
792 | else | |
35b2e1b7 | 793 | ptlrpc_request_cache_free(request); |
d7e09d03 PT |
794 | } |
795 | EXPORT_SYMBOL(ptlrpc_request_free); | |
796 | ||
797 | /** | |
930cef9a | 798 | * Allocate new request for operation \a opcode and immediately pack it for |
d7e09d03 PT |
799 | * network transfer. |
800 | * Only used for simple requests like OBD_PING where the only important | |
801 | * part of the request is operation itself. | |
802 | * Returns allocated request or NULL on error. | |
803 | */ | |
804 | struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, | |
d0bfef31 CH |
805 | const struct req_format *format, |
806 | __u32 version, int opcode) | |
d7e09d03 PT |
807 | { |
808 | struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format); | |
d0bfef31 | 809 | int rc; |
d7e09d03 PT |
810 | |
811 | if (req) { | |
812 | rc = ptlrpc_request_pack(req, version, opcode); | |
813 | if (rc) { | |
814 | ptlrpc_request_free(req); | |
815 | req = NULL; | |
816 | } | |
817 | } | |
818 | return req; | |
819 | } | |
820 | EXPORT_SYMBOL(ptlrpc_request_alloc_pack); | |
821 | ||
d7e09d03 | 822 | /** |
c5c4c6fa | 823 | * Allocate and initialize new request set structure on the current CPT. |
d7e09d03 PT |
824 | * Returns a pointer to the newly allocated set structure or NULL on error. |
825 | */ | |
826 | struct ptlrpc_request_set *ptlrpc_prep_set(void) | |
827 | { | |
828 | struct ptlrpc_request_set *set; | |
c5c4c6fa | 829 | int cpt; |
d7e09d03 | 830 | |
c5c4c6fa OW |
831 | cpt = cfs_cpt_current(cfs_cpt_table, 0); |
832 | set = kzalloc_node(sizeof(*set), GFP_NOFS, | |
833 | cfs_cpt_spread_node(cfs_cpt_table, cpt)); | |
d7e09d03 | 834 | if (!set) |
0a3bdb00 | 835 | return NULL; |
d7e09d03 PT |
836 | atomic_set(&set->set_refcount, 1); |
837 | INIT_LIST_HEAD(&set->set_requests); | |
838 | init_waitqueue_head(&set->set_waitq); | |
839 | atomic_set(&set->set_new_count, 0); | |
840 | atomic_set(&set->set_remaining, 0); | |
841 | spin_lock_init(&set->set_new_req_lock); | |
842 | INIT_LIST_HEAD(&set->set_new_requests); | |
843 | INIT_LIST_HEAD(&set->set_cblist); | |
844 | set->set_max_inflight = UINT_MAX; | |
d0bfef31 | 845 | set->set_producer = NULL; |
d7e09d03 | 846 | set->set_producer_arg = NULL; |
d0bfef31 | 847 | set->set_rc = 0; |
d7e09d03 | 848 | |
0a3bdb00 | 849 | return set; |
d7e09d03 PT |
850 | } |
851 | EXPORT_SYMBOL(ptlrpc_prep_set); | |
852 | ||
853 | /** | |
854 | * Allocate and initialize new request set structure with flow control | |
855 | * extension. This extension allows to control the number of requests in-flight | |
856 | * for the whole set. A callback function to generate requests must be provided | |
857 | * and the request set will keep the number of requests sent over the wire to | |
858 | * @max_inflight. | |
859 | * Returns a pointer to the newly allocated set structure or NULL on error. | |
860 | */ | |
861 | struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, | |
862 | void *arg) | |
863 | ||
864 | { | |
865 | struct ptlrpc_request_set *set; | |
866 | ||
867 | set = ptlrpc_prep_set(); | |
868 | if (!set) | |
0a3bdb00 | 869 | return NULL; |
d7e09d03 | 870 | |
d0bfef31 CH |
871 | set->set_max_inflight = max; |
872 | set->set_producer = func; | |
873 | set->set_producer_arg = arg; | |
d7e09d03 | 874 | |
0a3bdb00 | 875 | return set; |
d7e09d03 PT |
876 | } |
877 | EXPORT_SYMBOL(ptlrpc_prep_fcset); | |
878 | ||
879 | /** | |
880 | * Wind down and free request set structure previously allocated with | |
881 | * ptlrpc_prep_set. | |
882 | * Ensures that all requests on the set have completed and removes | |
883 | * all requests from the request list in a set. | |
884 | * If any unsent request happen to be on the list, pretends that they got | |
885 | * an error in flight and calls their completion handler. | |
886 | */ | |
887 | void ptlrpc_set_destroy(struct ptlrpc_request_set *set) | |
888 | { | |
d0bfef31 CH |
889 | struct list_head *tmp; |
890 | struct list_head *next; | |
891 | int expected_phase; | |
892 | int n = 0; | |
d7e09d03 PT |
893 | |
894 | /* Requests on the set should either all be completed, or all be new */ | |
895 | expected_phase = (atomic_read(&set->set_remaining) == 0) ? | |
896 | RQ_PHASE_COMPLETE : RQ_PHASE_NEW; | |
3949015e | 897 | list_for_each(tmp, &set->set_requests) { |
d7e09d03 | 898 | struct ptlrpc_request *req = |
30c0aa39 | 899 | list_entry(tmp, struct ptlrpc_request, rq_set_chain); |
d7e09d03 PT |
900 | |
901 | LASSERT(req->rq_phase == expected_phase); | |
902 | n++; | |
903 | } | |
904 | ||
905 | LASSERTF(atomic_read(&set->set_remaining) == 0 || | |
906 | atomic_read(&set->set_remaining) == n, "%d / %d\n", | |
907 | atomic_read(&set->set_remaining), n); | |
908 | ||
909 | list_for_each_safe(tmp, next, &set->set_requests) { | |
910 | struct ptlrpc_request *req = | |
30c0aa39 | 911 | list_entry(tmp, struct ptlrpc_request, rq_set_chain); |
d7e09d03 PT |
912 | list_del_init(&req->rq_set_chain); |
913 | ||
914 | LASSERT(req->rq_phase == expected_phase); | |
915 | ||
916 | if (req->rq_phase == RQ_PHASE_NEW) { | |
917 | ptlrpc_req_interpret(NULL, req, -EBADR); | |
918 | atomic_dec(&set->set_remaining); | |
919 | } | |
920 | ||
921 | spin_lock(&req->rq_lock); | |
922 | req->rq_set = NULL; | |
923 | req->rq_invalid_rqset = 0; | |
924 | spin_unlock(&req->rq_lock); | |
925 | ||
3949015e | 926 | ptlrpc_req_finished(req); |
d7e09d03 PT |
927 | } |
928 | ||
929 | LASSERT(atomic_read(&set->set_remaining) == 0); | |
930 | ||
931 | ptlrpc_reqset_put(set); | |
d7e09d03 PT |
932 | } |
933 | EXPORT_SYMBOL(ptlrpc_set_destroy); | |
934 | ||
d7e09d03 PT |
935 | /** |
936 | * Add a new request to the general purpose request set. | |
937 | * Assumes request reference from the caller. | |
938 | */ | |
939 | void ptlrpc_set_add_req(struct ptlrpc_request_set *set, | |
940 | struct ptlrpc_request *req) | |
941 | { | |
942 | LASSERT(list_empty(&req->rq_set_chain)); | |
943 | ||
944 | /* The set takes over the caller's request reference */ | |
945 | list_add_tail(&req->rq_set_chain, &set->set_requests); | |
946 | req->rq_set = set; | |
947 | atomic_inc(&set->set_remaining); | |
948 | req->rq_queued_time = cfs_time_current(); | |
949 | ||
8b382089 | 950 | if (req->rq_reqmsg) |
d7e09d03 PT |
951 | lustre_msg_set_jobid(req->rq_reqmsg, NULL); |
952 | ||
8b382089 | 953 | if (set->set_producer) |
ce1c42ed SW |
954 | /* |
955 | * If the request set has a producer callback, the RPC must be | |
956 | * sent straight away | |
957 | */ | |
d7e09d03 PT |
958 | ptlrpc_send_new_req(req); |
959 | } | |
960 | EXPORT_SYMBOL(ptlrpc_set_add_req); | |
961 | ||
962 | /** | |
963 | * Add a request to a request with dedicated server thread | |
964 | * and wake the thread to make any necessary processing. | |
965 | * Currently only used for ptlrpcd. | |
966 | */ | |
967 | void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, | |
d0bfef31 | 968 | struct ptlrpc_request *req) |
d7e09d03 PT |
969 | { |
970 | struct ptlrpc_request_set *set = pc->pc_set; | |
971 | int count, i; | |
972 | ||
8b382089 | 973 | LASSERT(!req->rq_set); |
d7e09d03 PT |
974 | LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0); |
975 | ||
976 | spin_lock(&set->set_new_req_lock); | |
ce1c42ed | 977 | /* The set takes over the caller's request reference. */ |
d7e09d03 PT |
978 | req->rq_set = set; |
979 | req->rq_queued_time = cfs_time_current(); | |
980 | list_add_tail(&req->rq_set_chain, &set->set_new_requests); | |
981 | count = atomic_inc_return(&set->set_new_count); | |
982 | spin_unlock(&set->set_new_req_lock); | |
983 | ||
984 | /* Only need to call wakeup once for the first entry. */ | |
985 | if (count == 1) { | |
986 | wake_up(&set->set_waitq); | |
987 | ||
ce1c42ed SW |
988 | /* |
989 | * XXX: It maybe unnecessary to wakeup all the partners. But to | |
d7e09d03 | 990 | * guarantee the async RPC can be processed ASAP, we have |
ce1c42ed SW |
991 | * no other better choice. It maybe fixed in future. |
992 | */ | |
d7e09d03 PT |
993 | for (i = 0; i < pc->pc_npartners; i++) |
994 | wake_up(&pc->pc_partners[i]->pc_set->set_waitq); | |
995 | } | |
996 | } | |
997 | EXPORT_SYMBOL(ptlrpc_set_add_new_req); | |
998 | ||
999 | /** | |
1000 | * Based on the current state of the import, determine if the request | |
1001 | * can be sent, is an error, or should be delayed. | |
1002 | * | |
1003 | * Returns true if this request should be delayed. If false, and | |
1004 | * *status is set, then the request can not be sent and *status is the | |
1005 | * error code. If false and status is 0, then request can be sent. | |
1006 | * | |
1007 | * The imp->imp_lock must be held. | |
1008 | */ | |
1009 | static int ptlrpc_import_delay_req(struct obd_import *imp, | |
1010 | struct ptlrpc_request *req, int *status) | |
1011 | { | |
1012 | int delay = 0; | |
d7e09d03 | 1013 | |
d7e09d03 PT |
1014 | *status = 0; |
1015 | ||
1016 | if (req->rq_ctx_init || req->rq_ctx_fini) { | |
1017 | /* always allow ctx init/fini rpc go through */ | |
1018 | } else if (imp->imp_state == LUSTRE_IMP_NEW) { | |
1019 | DEBUG_REQ(D_ERROR, req, "Uninitialized import."); | |
1020 | *status = -EIO; | |
1021 | } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { | |
1022 | /* pings may safely race with umount */ | |
1023 | DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? | |
1024 | D_HA : D_ERROR, req, "IMP_CLOSED "); | |
1025 | *status = -EIO; | |
1026 | } else if (ptlrpc_send_limit_expired(req)) { | |
1027 | /* probably doesn't need to be a D_ERROR after initial testing */ | |
1028 | DEBUG_REQ(D_ERROR, req, "send limit expired "); | |
1029 | *status = -EIO; | |
1030 | } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && | |
1031 | imp->imp_state == LUSTRE_IMP_CONNECTING) { | |
7b8633de | 1032 | /* allow CONNECT even if import is invalid */ |
d7e09d03 PT |
1033 | if (atomic_read(&imp->imp_inval_count) != 0) { |
1034 | DEBUG_REQ(D_ERROR, req, "invalidate in flight"); | |
1035 | *status = -EIO; | |
1036 | } | |
1037 | } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { | |
1038 | if (!imp->imp_deactive) | |
1039 | DEBUG_REQ(D_NET, req, "IMP_INVALID"); | |
1040 | *status = -ESHUTDOWN; /* bz 12940 */ | |
1041 | } else if (req->rq_import_generation != imp->imp_generation) { | |
1042 | DEBUG_REQ(D_ERROR, req, "req wrong generation:"); | |
1043 | *status = -EIO; | |
1044 | } else if (req->rq_send_state != imp->imp_state) { | |
1045 | /* invalidate in progress - any requests should be drop */ | |
1046 | if (atomic_read(&imp->imp_inval_count) != 0) { | |
1047 | DEBUG_REQ(D_ERROR, req, "invalidate in flight"); | |
1048 | *status = -EIO; | |
1049 | } else if (imp->imp_dlm_fake || req->rq_no_delay) { | |
1050 | *status = -EWOULDBLOCK; | |
1051 | } else if (req->rq_allow_replay && | |
1052 | (imp->imp_state == LUSTRE_IMP_REPLAY || | |
1053 | imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || | |
1054 | imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || | |
1055 | imp->imp_state == LUSTRE_IMP_RECOVER)) { | |
1056 | DEBUG_REQ(D_HA, req, "allow during recovery.\n"); | |
1057 | } else { | |
1058 | delay = 1; | |
1059 | } | |
1060 | } | |
1061 | ||
0a3bdb00 | 1062 | return delay; |
d7e09d03 PT |
1063 | } |
1064 | ||
1065 | /** | |
930cef9a | 1066 | * Decide if the error message regarding provided request \a req |
d7e09d03 PT |
1067 | * should be printed to the console or not. |
1068 | * Makes it's decision on request status and other properties. | |
1069 | * Returns 1 to print error on the system console or 0 if not. | |
1070 | */ | |
1071 | static int ptlrpc_console_allow(struct ptlrpc_request *req) | |
1072 | { | |
1073 | __u32 opc; | |
1074 | int err; | |
1075 | ||
8b382089 | 1076 | LASSERT(req->rq_reqmsg); |
d7e09d03 PT |
1077 | opc = lustre_msg_get_opc(req->rq_reqmsg); |
1078 | ||
ce1c42ed SW |
1079 | /* |
1080 | * Suppress particular reconnect errors which are to be expected. No | |
1081 | * errors are suppressed for the initial connection on an import | |
1082 | */ | |
d7e09d03 PT |
1083 | if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) && |
1084 | (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) { | |
1085 | ||
1086 | /* Suppress timed out reconnect requests */ | |
1087 | if (req->rq_timedout) | |
1088 | return 0; | |
1089 | ||
1090 | /* Suppress unavailable/again reconnect requests */ | |
1091 | err = lustre_msg_get_status(req->rq_repmsg); | |
1092 | if (err == -ENODEV || err == -EAGAIN) | |
1093 | return 0; | |
1094 | } | |
1095 | ||
1096 | return 1; | |
1097 | } | |
1098 | ||
1099 | /** | |
1100 | * Check request processing status. | |
1101 | * Returns the status. | |
1102 | */ | |
1103 | static int ptlrpc_check_status(struct ptlrpc_request *req) | |
1104 | { | |
1105 | int err; | |
d7e09d03 PT |
1106 | |
1107 | err = lustre_msg_get_status(req->rq_repmsg); | |
1108 | if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { | |
1109 | struct obd_import *imp = req->rq_import; | |
1110 | __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); | |
50ffcb7e | 1111 | |
d7e09d03 | 1112 | if (ptlrpc_console_allow(req)) |
2d00bd17 | 1113 | LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s, operation %s failed with %d.\n", |
d7e09d03 PT |
1114 | imp->imp_obd->obd_name, |
1115 | libcfs_nid2str( | |
2d00bd17 | 1116 | imp->imp_connection->c_peer.nid), |
d7e09d03 | 1117 | ll_opcode2str(opc), err); |
0a3bdb00 | 1118 | return err < 0 ? err : -EINVAL; |
d7e09d03 PT |
1119 | } |
1120 | ||
1b779d2e | 1121 | if (err < 0) |
d7e09d03 | 1122 | DEBUG_REQ(D_INFO, req, "status is %d", err); |
1b779d2e | 1123 | else if (err > 0) |
d7e09d03 PT |
1124 | /* XXX: translate this error from net to host */ |
1125 | DEBUG_REQ(D_INFO, req, "status is %d", err); | |
d7e09d03 | 1126 | |
0a3bdb00 | 1127 | return err; |
d7e09d03 PT |
1128 | } |
1129 | ||
1130 | /** | |
1131 | * save pre-versions of objects into request for replay. | |
1132 | * Versions are obtained from server reply. | |
1133 | * used for VBR. | |
1134 | */ | |
1135 | static void ptlrpc_save_versions(struct ptlrpc_request *req) | |
1136 | { | |
1137 | struct lustre_msg *repmsg = req->rq_repmsg; | |
1138 | struct lustre_msg *reqmsg = req->rq_reqmsg; | |
1139 | __u64 *versions = lustre_msg_get_versions(repmsg); | |
d7e09d03 PT |
1140 | |
1141 | if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) | |
1142 | return; | |
1143 | ||
1144 | LASSERT(versions); | |
1145 | lustre_msg_set_versions(reqmsg, versions); | |
55f5a824 | 1146 | CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n", |
d7e09d03 | 1147 | versions[0], versions[1]); |
d7e09d03 PT |
1148 | } |
1149 | ||
1150 | /** | |
1151 | * Callback function called when client receives RPC reply for \a req. | |
1152 | * Returns 0 on success or error code. | |
930cef9a | 1153 | * The return value would be assigned to req->rq_status by the caller |
d7e09d03 PT |
1154 | * as request processing status. |
1155 | * This function also decides if the request needs to be saved for later replay. | |
1156 | */ | |
1157 | static int after_reply(struct ptlrpc_request *req) | |
1158 | { | |
1159 | struct obd_import *imp = req->rq_import; | |
1160 | struct obd_device *obd = req->rq_import->imp_obd; | |
1161 | int rc; | |
219e6de6 | 1162 | struct timespec64 work_start; |
d7e09d03 | 1163 | long timediff; |
d7e09d03 | 1164 | |
8b382089 | 1165 | LASSERT(obd); |
d7e09d03 | 1166 | /* repbuf must be unlinked */ |
cf378ff7 | 1167 | LASSERT(!req->rq_receiving_reply && !req->rq_reply_unlink); |
d7e09d03 PT |
1168 | |
1169 | if (req->rq_reply_truncate) { | |
1170 | if (ptlrpc_no_resend(req)) { | |
2d00bd17 | 1171 | DEBUG_REQ(D_ERROR, req, "reply buffer overflow, expected: %d, actual size: %d", |
d7e09d03 | 1172 | req->rq_nob_received, req->rq_repbuf_len); |
0a3bdb00 | 1173 | return -EOVERFLOW; |
d7e09d03 PT |
1174 | } |
1175 | ||
1176 | sptlrpc_cli_free_repbuf(req); | |
ce1c42ed SW |
1177 | /* |
1178 | * Pass the required reply buffer size (include space for early | |
1179 | * reply). NB: no need to round up because alloc_repbuf will | |
1180 | * round it up | |
1181 | */ | |
d7e09d03 PT |
1182 | req->rq_replen = req->rq_nob_received; |
1183 | req->rq_nob_received = 0; | |
15c50ccc | 1184 | spin_lock(&req->rq_lock); |
d7e09d03 | 1185 | req->rq_resend = 1; |
15c50ccc | 1186 | spin_unlock(&req->rq_lock); |
0a3bdb00 | 1187 | return 0; |
d7e09d03 PT |
1188 | } |
1189 | ||
1190 | /* | |
1191 | * NB Until this point, the whole of the incoming message, | |
1192 | * including buflens, status etc is in the sender's byte order. | |
1193 | */ | |
1194 | rc = sptlrpc_cli_unwrap_reply(req); | |
1195 | if (rc) { | |
1196 | DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); | |
0a3bdb00 | 1197 | return rc; |
d7e09d03 PT |
1198 | } |
1199 | ||
ce1c42ed | 1200 | /* Security layer unwrap might ask resend this request. */ |
d7e09d03 | 1201 | if (req->rq_resend) |
0a3bdb00 | 1202 | return 0; |
d7e09d03 PT |
1203 | |
1204 | rc = unpack_reply(req); | |
1205 | if (rc) | |
0a3bdb00 | 1206 | return rc; |
d7e09d03 PT |
1207 | |
1208 | /* retry indefinitely on EINPROGRESS */ | |
1209 | if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && | |
1210 | ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { | |
219e6de6 | 1211 | time64_t now = ktime_get_real_seconds(); |
d7e09d03 PT |
1212 | |
1213 | DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); | |
d27f9b07 | 1214 | spin_lock(&req->rq_lock); |
d7e09d03 | 1215 | req->rq_resend = 1; |
d27f9b07 | 1216 | spin_unlock(&req->rq_lock); |
d7e09d03 PT |
1217 | req->rq_nr_resend++; |
1218 | ||
1219 | /* allocate new xid to avoid reply reconstruction */ | |
1220 | if (!req->rq_bulk) { | |
ce1c42ed | 1221 | /* new xid is already allocated for bulk in ptlrpc_check_set() */ |
d7e09d03 | 1222 | req->rq_xid = ptlrpc_next_xid(); |
2d00bd17 | 1223 | DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for resend on EINPROGRESS"); |
d7e09d03 PT |
1224 | } |
1225 | ||
1226 | /* Readjust the timeout for current conditions */ | |
1227 | ptlrpc_at_set_req_timeout(req); | |
ce1c42ed SW |
1228 | /* |
1229 | * delay resend to give a chance to the server to get ready. | |
d7e09d03 PT |
1230 | * The delay is increased by 1s on every resend and is capped to |
1231 | * the current request timeout (i.e. obd_timeout if AT is off, | |
ce1c42ed SW |
1232 | * or AT service time x 125% + 5s, see at_est2timeout) |
1233 | */ | |
d7e09d03 PT |
1234 | if (req->rq_nr_resend > req->rq_timeout) |
1235 | req->rq_sent = now + req->rq_timeout; | |
1236 | else | |
1237 | req->rq_sent = now + req->rq_nr_resend; | |
1238 | ||
0a3bdb00 | 1239 | return 0; |
d7e09d03 PT |
1240 | } |
1241 | ||
219e6de6 AB |
1242 | ktime_get_real_ts64(&work_start); |
1243 | timediff = (work_start.tv_sec - req->rq_arrival_time.tv_sec) * USEC_PER_SEC + | |
1244 | (work_start.tv_nsec - req->rq_arrival_time.tv_nsec) / NSEC_PER_USEC; | |
8b382089 | 1245 | if (obd->obd_svc_stats) { |
d7e09d03 PT |
1246 | lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, |
1247 | timediff); | |
1248 | ptlrpc_lprocfs_rpc_sent(req, timediff); | |
1249 | } | |
1250 | ||
1251 | if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && | |
1252 | lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { | |
1253 | DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", | |
1254 | lustre_msg_get_type(req->rq_repmsg)); | |
0a3bdb00 | 1255 | return -EPROTO; |
d7e09d03 PT |
1256 | } |
1257 | ||
1258 | if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) | |
1259 | CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); | |
1260 | ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); | |
1261 | ptlrpc_at_adj_net_latency(req, | |
1262 | lustre_msg_get_service_time(req->rq_repmsg)); | |
1263 | ||
1264 | rc = ptlrpc_check_status(req); | |
1265 | imp->imp_connect_error = rc; | |
1266 | ||
1267 | if (rc) { | |
1268 | /* | |
1269 | * Either we've been evicted, or the server has failed for | |
1270 | * some reason. Try to reconnect, and if that fails, punt to | |
1271 | * the upcall. | |
1272 | */ | |
1273 | if (ll_rpc_recoverable_error(rc)) { | |
1274 | if (req->rq_send_state != LUSTRE_IMP_FULL || | |
1275 | imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { | |
0a3bdb00 | 1276 | return rc; |
d7e09d03 PT |
1277 | } |
1278 | ptlrpc_request_handle_notconn(req); | |
0a3bdb00 | 1279 | return rc; |
d7e09d03 PT |
1280 | } |
1281 | } else { | |
1282 | /* | |
1283 | * Let's look if server sent slv. Do it only for RPC with | |
1284 | * rc == 0. | |
1285 | */ | |
1286 | ldlm_cli_update_pool(req); | |
1287 | } | |
1288 | ||
ce1c42ed | 1289 | /* Store transno in reqmsg for replay. */ |
d7e09d03 PT |
1290 | if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { |
1291 | req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); | |
1292 | lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); | |
1293 | } | |
1294 | ||
1295 | if (imp->imp_replayable) { | |
1296 | spin_lock(&imp->imp_lock); | |
1297 | /* | |
1298 | * No point in adding already-committed requests to the replay | |
1299 | * list, we will just remove them immediately. b=9829 | |
1300 | */ | |
1301 | if (req->rq_transno != 0 && | |
1302 | (req->rq_transno > | |
1303 | lustre_msg_get_last_committed(req->rq_repmsg) || | |
1304 | req->rq_replay)) { | |
ce1c42ed | 1305 | /* version recovery */ |
d7e09d03 PT |
1306 | ptlrpc_save_versions(req); |
1307 | ptlrpc_retain_replayable_request(req, imp); | |
8b382089 | 1308 | } else if (req->rq_commit_cb && |
503a1ac7 | 1309 | list_empty(&req->rq_replay_list)) { |
ce1c42ed SW |
1310 | /* |
1311 | * NB: don't call rq_commit_cb if it's already on | |
503a1ac7 | 1312 | * rq_replay_list, ptlrpc_free_committed() will call |
ce1c42ed SW |
1313 | * it later, see LU-3618 for details |
1314 | */ | |
d7e09d03 PT |
1315 | spin_unlock(&imp->imp_lock); |
1316 | req->rq_commit_cb(req); | |
1317 | spin_lock(&imp->imp_lock); | |
1318 | } | |
1319 | ||
ce1c42ed | 1320 | /* Replay-enabled imports return commit-status information. */ |
d7e09d03 PT |
1321 | if (lustre_msg_get_last_committed(req->rq_repmsg)) { |
1322 | imp->imp_peer_committed_transno = | |
1323 | lustre_msg_get_last_committed(req->rq_repmsg); | |
1324 | } | |
1325 | ||
1326 | ptlrpc_free_committed(imp); | |
1327 | ||
1328 | if (!list_empty(&imp->imp_replay_list)) { | |
1329 | struct ptlrpc_request *last; | |
1330 | ||
1331 | last = list_entry(imp->imp_replay_list.prev, | |
30c0aa39 OD |
1332 | struct ptlrpc_request, |
1333 | rq_replay_list); | |
d7e09d03 PT |
1334 | /* |
1335 | * Requests with rq_replay stay on the list even if no | |
1336 | * commit is expected. | |
1337 | */ | |
1338 | if (last->rq_transno > imp->imp_peer_committed_transno) | |
1339 | ptlrpc_pinger_commit_expected(imp); | |
1340 | } | |
1341 | ||
1342 | spin_unlock(&imp->imp_lock); | |
1343 | } | |
1344 | ||
0a3bdb00 | 1345 | return rc; |
d7e09d03 PT |
1346 | } |
1347 | ||
1348 | /** | |
1349 | * Helper function to send request \a req over the network for the first time | |
1350 | * Also adjusts request phase. | |
1351 | * Returns 0 on success or error code. | |
1352 | */ | |
1353 | static int ptlrpc_send_new_req(struct ptlrpc_request *req) | |
1354 | { | |
d0bfef31 | 1355 | struct obd_import *imp = req->rq_import; |
d7e09d03 | 1356 | int rc; |
d7e09d03 PT |
1357 | |
1358 | LASSERT(req->rq_phase == RQ_PHASE_NEW); | |
219e6de6 | 1359 | if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) && |
d7e09d03 PT |
1360 | (!req->rq_generation_set || |
1361 | req->rq_import_generation == imp->imp_generation)) | |
0a3bdb00 | 1362 | return 0; |
d7e09d03 PT |
1363 | |
1364 | ptlrpc_rqphase_move(req, RQ_PHASE_RPC); | |
1365 | ||
1366 | spin_lock(&imp->imp_lock); | |
1367 | ||
1368 | if (!req->rq_generation_set) | |
1369 | req->rq_import_generation = imp->imp_generation; | |
1370 | ||
1371 | if (ptlrpc_import_delay_req(imp, req, &rc)) { | |
1372 | spin_lock(&req->rq_lock); | |
1373 | req->rq_waiting = 1; | |
1374 | spin_unlock(&req->rq_lock); | |
1375 | ||
2d00bd17 JP |
1376 | DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: (%s != %s)", |
1377 | lustre_msg_get_status(req->rq_reqmsg), | |
d7e09d03 PT |
1378 | ptlrpc_import_state_name(req->rq_send_state), |
1379 | ptlrpc_import_state_name(imp->imp_state)); | |
1380 | LASSERT(list_empty(&req->rq_list)); | |
1381 | list_add_tail(&req->rq_list, &imp->imp_delayed_list); | |
1382 | atomic_inc(&req->rq_import->imp_inflight); | |
1383 | spin_unlock(&imp->imp_lock); | |
0a3bdb00 | 1384 | return 0; |
d7e09d03 PT |
1385 | } |
1386 | ||
1387 | if (rc != 0) { | |
1388 | spin_unlock(&imp->imp_lock); | |
1389 | req->rq_status = rc; | |
1390 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
0a3bdb00 | 1391 | return rc; |
d7e09d03 PT |
1392 | } |
1393 | ||
1394 | LASSERT(list_empty(&req->rq_list)); | |
1395 | list_add_tail(&req->rq_list, &imp->imp_sending_list); | |
1396 | atomic_inc(&req->rq_import->imp_inflight); | |
1397 | spin_unlock(&imp->imp_lock); | |
1398 | ||
1399 | lustre_msg_set_status(req->rq_reqmsg, current_pid()); | |
1400 | ||
1401 | rc = sptlrpc_req_refresh_ctx(req, -1); | |
1402 | if (rc) { | |
1403 | if (req->rq_err) { | |
1404 | req->rq_status = rc; | |
0a3bdb00 | 1405 | return 1; |
d7e09d03 | 1406 | } |
5b9359f1 MY |
1407 | spin_lock(&req->rq_lock); |
1408 | req->rq_wait_ctx = 1; | |
1409 | spin_unlock(&req->rq_lock); | |
1410 | return 0; | |
d7e09d03 PT |
1411 | } |
1412 | ||
2d00bd17 JP |
1413 | CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", |
1414 | current_comm(), | |
d7e09d03 PT |
1415 | imp->imp_obd->obd_uuid.uuid, |
1416 | lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, | |
1417 | libcfs_nid2str(imp->imp_connection->c_peer.nid), | |
1418 | lustre_msg_get_opc(req->rq_reqmsg)); | |
1419 | ||
1420 | rc = ptl_send_rpc(req, 0); | |
1421 | if (rc) { | |
1422 | DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); | |
15c50ccc | 1423 | spin_lock(&req->rq_lock); |
d7e09d03 | 1424 | req->rq_net_err = 1; |
15c50ccc | 1425 | spin_unlock(&req->rq_lock); |
0a3bdb00 | 1426 | return rc; |
d7e09d03 | 1427 | } |
0a3bdb00 | 1428 | return 0; |
d7e09d03 PT |
1429 | } |
1430 | ||
1431 | static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) | |
1432 | { | |
1433 | int remaining, rc; | |
d7e09d03 | 1434 | |
8b382089 | 1435 | LASSERT(set->set_producer); |
d7e09d03 PT |
1436 | |
1437 | remaining = atomic_read(&set->set_remaining); | |
1438 | ||
ce1c42ed SW |
1439 | /* |
1440 | * populate the ->set_requests list with requests until we | |
1441 | * reach the maximum number of RPCs in flight for this set | |
1442 | */ | |
d7e09d03 PT |
1443 | while (atomic_read(&set->set_remaining) < set->set_max_inflight) { |
1444 | rc = set->set_producer(set, set->set_producer_arg); | |
1445 | if (rc == -ENOENT) { | |
1446 | /* no more RPC to produce */ | |
1447 | set->set_producer = NULL; | |
1448 | set->set_producer_arg = NULL; | |
0a3bdb00 | 1449 | return 0; |
d7e09d03 PT |
1450 | } |
1451 | } | |
1452 | ||
0a3bdb00 | 1453 | return (atomic_read(&set->set_remaining) - remaining); |
d7e09d03 PT |
1454 | } |
1455 | ||
1456 | /** | |
1457 | * this sends any unsent RPCs in \a set and returns 1 if all are sent | |
1458 | * and no more replies are expected. | |
1459 | * (it is possible to get less replies than requests sent e.g. due to timed out | |
1460 | * requests or requests that we had trouble to send out) | |
da9e33c9 CM |
1461 | * |
1462 | * NOTE: This function contains a potential schedule point (cond_resched()). | |
d7e09d03 PT |
1463 | */ |
1464 | int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) | |
1465 | { | |
1466 | struct list_head *tmp, *next; | |
fa55c6a4 | 1467 | struct list_head comp_reqs; |
d7e09d03 | 1468 | int force_timer_recalc = 0; |
d7e09d03 PT |
1469 | |
1470 | if (atomic_read(&set->set_remaining) == 0) | |
0a3bdb00 | 1471 | return 1; |
d7e09d03 | 1472 | |
fa55c6a4 | 1473 | INIT_LIST_HEAD(&comp_reqs); |
d7e09d03 PT |
1474 | list_for_each_safe(tmp, next, &set->set_requests) { |
1475 | struct ptlrpc_request *req = | |
30c0aa39 | 1476 | list_entry(tmp, struct ptlrpc_request, rq_set_chain); |
d7e09d03 PT |
1477 | struct obd_import *imp = req->rq_import; |
1478 | int unregistered = 0; | |
1479 | int rc = 0; | |
1480 | ||
ce1c42ed SW |
1481 | /* |
1482 | * This schedule point is mainly for the ptlrpcd caller of this | |
da9e33c9 CM |
1483 | * function. Most ptlrpc sets are not long-lived and unbounded |
1484 | * in length, but at the least the set used by the ptlrpcd is. | |
1485 | * Since the processing time is unbounded, we need to insert an | |
1486 | * explicit schedule point to make the thread well-behaved. | |
1487 | */ | |
1488 | cond_resched(); | |
1489 | ||
d7e09d03 PT |
1490 | if (req->rq_phase == RQ_PHASE_NEW && |
1491 | ptlrpc_send_new_req(req)) { | |
1492 | force_timer_recalc = 1; | |
1493 | } | |
1494 | ||
1495 | /* delayed send - skip */ | |
1496 | if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) | |
1497 | continue; | |
1498 | ||
1499 | /* delayed resend - skip */ | |
1500 | if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && | |
219e6de6 | 1501 | req->rq_sent > ktime_get_real_seconds()) |
d7e09d03 PT |
1502 | continue; |
1503 | ||
1504 | if (!(req->rq_phase == RQ_PHASE_RPC || | |
1505 | req->rq_phase == RQ_PHASE_BULK || | |
1506 | req->rq_phase == RQ_PHASE_INTERPRET || | |
1507 | req->rq_phase == RQ_PHASE_UNREGISTERING || | |
1508 | req->rq_phase == RQ_PHASE_COMPLETE)) { | |
1509 | DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); | |
1510 | LBUG(); | |
1511 | } | |
1512 | ||
1513 | if (req->rq_phase == RQ_PHASE_UNREGISTERING) { | |
1514 | LASSERT(req->rq_next_phase != req->rq_phase); | |
1515 | LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); | |
1516 | ||
1517 | /* | |
1518 | * Skip processing until reply is unlinked. We | |
1519 | * can't return to pool before that and we can't | |
1520 | * call interpret before that. We need to make | |
1521 | * sure that all rdma transfers finished and will | |
1522 | * not corrupt any data. | |
1523 | */ | |
1524 | if (ptlrpc_client_recv_or_unlink(req) || | |
1525 | ptlrpc_client_bulk_active(req)) | |
1526 | continue; | |
1527 | ||
1528 | /* | |
1529 | * Turn fail_loc off to prevent it from looping | |
1530 | * forever. | |
1531 | */ | |
1532 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { | |
1533 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, | |
1534 | OBD_FAIL_ONCE); | |
1535 | } | |
1536 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { | |
1537 | OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, | |
1538 | OBD_FAIL_ONCE); | |
1539 | } | |
1540 | ||
ce1c42ed | 1541 | /* Move to next phase if reply was successfully |
d7e09d03 PT |
1542 | * unlinked. |
1543 | */ | |
1544 | ptlrpc_rqphase_move(req, req->rq_next_phase); | |
1545 | } | |
1546 | ||
fa55c6a4 LZ |
1547 | if (req->rq_phase == RQ_PHASE_COMPLETE) { |
1548 | list_move_tail(&req->rq_set_chain, &comp_reqs); | |
d7e09d03 | 1549 | continue; |
fa55c6a4 | 1550 | } |
d7e09d03 PT |
1551 | |
1552 | if (req->rq_phase == RQ_PHASE_INTERPRET) | |
a9b3e8f3 | 1553 | goto interpret; |
d7e09d03 | 1554 | |
ce1c42ed | 1555 | /* Note that this also will start async reply unlink. */ |
d7e09d03 PT |
1556 | if (req->rq_net_err && !req->rq_timedout) { |
1557 | ptlrpc_expire_one_request(req, 1); | |
1558 | ||
ce1c42ed | 1559 | /* Check if we still need to wait for unlink. */ |
d7e09d03 PT |
1560 | if (ptlrpc_client_recv_or_unlink(req) || |
1561 | ptlrpc_client_bulk_active(req)) | |
1562 | continue; | |
1563 | /* If there is no need to resend, fail it now. */ | |
1564 | if (req->rq_no_resend) { | |
1565 | if (req->rq_status == 0) | |
1566 | req->rq_status = -EIO; | |
1567 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1568 | goto interpret; |
d7e09d03 PT |
1569 | } else { |
1570 | continue; | |
1571 | } | |
1572 | } | |
1573 | ||
1574 | if (req->rq_err) { | |
1575 | spin_lock(&req->rq_lock); | |
1576 | req->rq_replied = 0; | |
1577 | spin_unlock(&req->rq_lock); | |
1578 | if (req->rq_status == 0) | |
1579 | req->rq_status = -EIO; | |
1580 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1581 | goto interpret; |
d7e09d03 PT |
1582 | } |
1583 | ||
ce1c42ed SW |
1584 | /* |
1585 | * ptlrpc_set_wait->l_wait_event sets lwi_allow_intr | |
d7e09d03 PT |
1586 | * so it sets rq_intr regardless of individual rpc |
1587 | * timeouts. The synchronous IO waiting path sets | |
1588 | * rq_intr irrespective of whether ptlrpcd | |
1589 | * has seen a timeout. Our policy is to only interpret | |
1590 | * interrupted rpcs after they have timed out, so we | |
1591 | * need to enforce that here. | |
1592 | */ | |
1593 | ||
1594 | if (req->rq_intr && (req->rq_timedout || req->rq_waiting || | |
1595 | req->rq_wait_ctx)) { | |
1596 | req->rq_status = -EINTR; | |
1597 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
a9b3e8f3 | 1598 | goto interpret; |
d7e09d03 PT |
1599 | } |
1600 | ||
1601 | if (req->rq_phase == RQ_PHASE_RPC) { | |
1602 | if (req->rq_timedout || req->rq_resend || | |
1603 | req->rq_waiting || req->rq_wait_ctx) { | |
1604 | int status; | |
1605 | ||
1606 | if (!ptlrpc_unregister_reply(req, 1)) | |
1607 | continue; | |
1608 | ||
1609 | spin_lock(&imp->imp_lock); | |
cb68dd2d KM |
1610 | if (ptlrpc_import_delay_req(imp, req, |
1611 | &status)) { | |
ce1c42ed SW |
1612 | /* |
1613 | * put on delay list - only if we wait | |
1614 | * recovery finished - before send | |
1615 | */ | |
d7e09d03 PT |
1616 | list_del_init(&req->rq_list); |
1617 | list_add_tail(&req->rq_list, | |
30c0aa39 | 1618 | &imp->imp_delayed_list); |
d7e09d03 PT |
1619 | spin_unlock(&imp->imp_lock); |
1620 | continue; | |
1621 | } | |
1622 | ||
d0bfef31 | 1623 | if (status != 0) { |
d7e09d03 PT |
1624 | req->rq_status = status; |
1625 | ptlrpc_rqphase_move(req, | |
30c0aa39 | 1626 | RQ_PHASE_INTERPRET); |
d7e09d03 | 1627 | spin_unlock(&imp->imp_lock); |
a9b3e8f3 | 1628 | goto interpret; |
d7e09d03 PT |
1629 | } |
1630 | if (ptlrpc_no_resend(req) && | |
1631 | !req->rq_wait_ctx) { | |
1632 | req->rq_status = -ENOTCONN; | |
1633 | ptlrpc_rqphase_move(req, | |
1634 | RQ_PHASE_INTERPRET); | |
1635 | spin_unlock(&imp->imp_lock); | |
a9b3e8f3 | 1636 | goto interpret; |
d7e09d03 PT |
1637 | } |
1638 | ||
1639 | list_del_init(&req->rq_list); | |
1640 | list_add_tail(&req->rq_list, | |
30c0aa39 | 1641 | &imp->imp_sending_list); |
d7e09d03 PT |
1642 | |
1643 | spin_unlock(&imp->imp_lock); | |
1644 | ||
1645 | spin_lock(&req->rq_lock); | |
1646 | req->rq_waiting = 0; | |
1647 | spin_unlock(&req->rq_lock); | |
1648 | ||
1649 | if (req->rq_timedout || req->rq_resend) { | |
ce1c42ed | 1650 | /* This is re-sending anyway, let's mark req as resend. */ |
d7e09d03 PT |
1651 | spin_lock(&req->rq_lock); |
1652 | req->rq_resend = 1; | |
1653 | spin_unlock(&req->rq_lock); | |
1654 | if (req->rq_bulk) { | |
1655 | __u64 old_xid; | |
1656 | ||
1657 | if (!ptlrpc_unregister_bulk(req, 1)) | |
1658 | continue; | |
1659 | ||
1660 | /* ensure previous bulk fails */ | |
1661 | old_xid = req->rq_xid; | |
1662 | req->rq_xid = ptlrpc_next_xid(); | |
b0f5aad5 | 1663 | CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", |
d7e09d03 PT |
1664 | old_xid, req->rq_xid); |
1665 | } | |
1666 | } | |
1667 | /* | |
1668 | * rq_wait_ctx is only touched by ptlrpcd, | |
1669 | * so no lock is needed here. | |
1670 | */ | |
1671 | status = sptlrpc_req_refresh_ctx(req, -1); | |
1672 | if (status) { | |
1673 | if (req->rq_err) { | |
1674 | req->rq_status = status; | |
1675 | spin_lock(&req->rq_lock); | |
1676 | req->rq_wait_ctx = 0; | |
1677 | spin_unlock(&req->rq_lock); | |
1678 | force_timer_recalc = 1; | |
1679 | } else { | |
1680 | spin_lock(&req->rq_lock); | |
1681 | req->rq_wait_ctx = 1; | |
1682 | spin_unlock(&req->rq_lock); | |
1683 | } | |
1684 | ||
1685 | continue; | |
1686 | } else { | |
1687 | spin_lock(&req->rq_lock); | |
1688 | req->rq_wait_ctx = 0; | |
1689 | spin_unlock(&req->rq_lock); | |
1690 | } | |
1691 | ||
1692 | rc = ptl_send_rpc(req, 0); | |
1693 | if (rc) { | |
1694 | DEBUG_REQ(D_HA, req, | |
1695 | "send failed: rc = %d", rc); | |
1696 | force_timer_recalc = 1; | |
1697 | spin_lock(&req->rq_lock); | |
1698 | req->rq_net_err = 1; | |
1699 | spin_unlock(&req->rq_lock); | |
e3bceb23 | 1700 | continue; |
d7e09d03 PT |
1701 | } |
1702 | /* need to reset the timeout */ | |
1703 | force_timer_recalc = 1; | |
1704 | } | |
1705 | ||
1706 | spin_lock(&req->rq_lock); | |
1707 | ||
1708 | if (ptlrpc_client_early(req)) { | |
1709 | ptlrpc_at_recv_early_reply(req); | |
1710 | spin_unlock(&req->rq_lock); | |
1711 | continue; | |
1712 | } | |
1713 | ||
1714 | /* Still waiting for a reply? */ | |
1715 | if (ptlrpc_client_recv(req)) { | |
1716 | spin_unlock(&req->rq_lock); | |
1717 | continue; | |
1718 | } | |
1719 | ||
1720 | /* Did we actually receive a reply? */ | |
1721 | if (!ptlrpc_client_replied(req)) { | |
1722 | spin_unlock(&req->rq_lock); | |
1723 | continue; | |
1724 | } | |
1725 | ||
1726 | spin_unlock(&req->rq_lock); | |
1727 | ||
ce1c42ed SW |
1728 | /* |
1729 | * unlink from net because we are going to | |
1730 | * swab in-place of reply buffer | |
1731 | */ | |
d7e09d03 PT |
1732 | unregistered = ptlrpc_unregister_reply(req, 1); |
1733 | if (!unregistered) | |
1734 | continue; | |
1735 | ||
1736 | req->rq_status = after_reply(req); | |
1737 | if (req->rq_resend) | |
1738 | continue; | |
1739 | ||
ce1c42ed SW |
1740 | /* |
1741 | * If there is no bulk associated with this request, | |
d7e09d03 PT |
1742 | * then we're done and should let the interpreter |
1743 | * process the reply. Similarly if the RPC returned | |
1744 | * an error, and therefore the bulk will never arrive. | |
1745 | */ | |
8b382089 | 1746 | if (!req->rq_bulk || req->rq_status < 0) { |
d7e09d03 | 1747 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); |
a9b3e8f3 | 1748 | goto interpret; |
d7e09d03 PT |
1749 | } |
1750 | ||
1751 | ptlrpc_rqphase_move(req, RQ_PHASE_BULK); | |
1752 | } | |
1753 | ||
1754 | LASSERT(req->rq_phase == RQ_PHASE_BULK); | |
1755 | if (ptlrpc_client_bulk_active(req)) | |
1756 | continue; | |
1757 | ||
1758 | if (req->rq_bulk->bd_failure) { | |
ce1c42ed SW |
1759 | /* |
1760 | * The RPC reply arrived OK, but the bulk screwed | |
d7e09d03 PT |
1761 | * up! Dead weird since the server told us the RPC |
1762 | * was good after getting the REPLY for her GET or | |
ce1c42ed SW |
1763 | * the ACK for her PUT. |
1764 | */ | |
d7e09d03 PT |
1765 | DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); |
1766 | req->rq_status = -EIO; | |
1767 | } | |
1768 | ||
1769 | ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); | |
1770 | ||
7f1d15a8 | 1771 | interpret: |
d7e09d03 PT |
1772 | LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); |
1773 | ||
ce1c42ed SW |
1774 | /* |
1775 | * This moves to "unregistering" phase we need to wait for | |
1776 | * reply unlink. | |
1777 | */ | |
d7e09d03 PT |
1778 | if (!unregistered && !ptlrpc_unregister_reply(req, 1)) { |
1779 | /* start async bulk unlink too */ | |
1780 | ptlrpc_unregister_bulk(req, 1); | |
1781 | continue; | |
1782 | } | |
1783 | ||
1784 | if (!ptlrpc_unregister_bulk(req, 1)) | |
1785 | continue; | |
1786 | ||
ce1c42ed | 1787 | /* When calling interpret receive should already be finished. */ |
d7e09d03 PT |
1788 | LASSERT(!req->rq_receiving_reply); |
1789 | ||
1790 | ptlrpc_req_interpret(env, req, req->rq_status); | |
1791 | ||
82a373ae LZ |
1792 | if (ptlrpcd_check_work(req)) { |
1793 | atomic_dec(&set->set_remaining); | |
1794 | continue; | |
1795 | } | |
d7e09d03 PT |
1796 | ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); |
1797 | ||
8b382089 | 1798 | CDEBUG(req->rq_reqmsg ? D_RPCTRACE : 0, |
2d00bd17 JP |
1799 | "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", |
1800 | current_comm(), imp->imp_obd->obd_uuid.uuid, | |
1801 | lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, | |
1802 | libcfs_nid2str(imp->imp_connection->c_peer.nid), | |
1803 | lustre_msg_get_opc(req->rq_reqmsg)); | |
d7e09d03 PT |
1804 | |
1805 | spin_lock(&imp->imp_lock); | |
ce1c42ed SW |
1806 | /* |
1807 | * Request already may be not on sending or delaying list. This | |
d7e09d03 PT |
1808 | * may happen in the case of marking it erroneous for the case |
1809 | * ptlrpc_import_delay_req(req, status) find it impossible to | |
ce1c42ed SW |
1810 | * allow sending this rpc and returns *status != 0. |
1811 | */ | |
d7e09d03 PT |
1812 | if (!list_empty(&req->rq_list)) { |
1813 | list_del_init(&req->rq_list); | |
1814 | atomic_dec(&imp->imp_inflight); | |
1815 | } | |
1816 | spin_unlock(&imp->imp_lock); | |
1817 | ||
1818 | atomic_dec(&set->set_remaining); | |
1819 | wake_up_all(&imp->imp_recovery_waitq); | |
1820 | ||
1821 | if (set->set_producer) { | |
1822 | /* produce a new request if possible */ | |
1823 | if (ptlrpc_set_producer(set) > 0) | |
1824 | force_timer_recalc = 1; | |
1825 | ||
ce1c42ed SW |
1826 | /* |
1827 | * free the request that has just been completed | |
1828 | * in order not to pollute set->set_requests | |
1829 | */ | |
d7e09d03 PT |
1830 | list_del_init(&req->rq_set_chain); |
1831 | spin_lock(&req->rq_lock); | |
1832 | req->rq_set = NULL; | |
1833 | req->rq_invalid_rqset = 0; | |
1834 | spin_unlock(&req->rq_lock); | |
1835 | ||
1836 | /* record rq_status to compute the final status later */ | |
1837 | if (req->rq_status != 0) | |
1838 | set->set_rc = req->rq_status; | |
1839 | ptlrpc_req_finished(req); | |
fa55c6a4 LZ |
1840 | } else { |
1841 | list_move_tail(&req->rq_set_chain, &comp_reqs); | |
d7e09d03 PT |
1842 | } |
1843 | } | |
1844 | ||
ce1c42ed SW |
1845 | /* |
1846 | * move completed request at the head of list so it's easier for | |
1847 | * caller to find them | |
1848 | */ | |
fa55c6a4 LZ |
1849 | list_splice(&comp_reqs, &set->set_requests); |
1850 | ||
d7e09d03 | 1851 | /* If we hit an error, we want to recover promptly. */ |
0a3bdb00 | 1852 | return atomic_read(&set->set_remaining) == 0 || force_timer_recalc; |
d7e09d03 PT |
1853 | } |
1854 | EXPORT_SYMBOL(ptlrpc_check_set); | |
1855 | ||
1856 | /** | |
1857 | * Time out request \a req. is \a async_unlink is set, that means do not wait | |
1858 | * until LNet actually confirms network buffer unlinking. | |
1859 | * Return 1 if we should give up further retrying attempts or 0 otherwise. | |
1860 | */ | |
1861 | int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) | |
1862 | { | |
1863 | struct obd_import *imp = req->rq_import; | |
1864 | int rc = 0; | |
d7e09d03 PT |
1865 | |
1866 | spin_lock(&req->rq_lock); | |
1867 | req->rq_timedout = 1; | |
1868 | spin_unlock(&req->rq_lock); | |
1869 | ||
219e6de6 | 1870 | DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent %lld/real %lld]", |
d7e09d03 PT |
1871 | req->rq_net_err ? "failed due to network error" : |
1872 | ((req->rq_real_sent == 0 || | |
219e6de6 AB |
1873 | req->rq_real_sent < req->rq_sent || |
1874 | req->rq_real_sent >= req->rq_deadline) ? | |
d7e09d03 | 1875 | "timed out for sent delay" : "timed out for slow reply"), |
219e6de6 | 1876 | (s64)req->rq_sent, (s64)req->rq_real_sent); |
d7e09d03 | 1877 | |
8b382089 | 1878 | if (imp && obd_debug_peer_on_timeout) |
71c36dd7 | 1879 | LNetDebugPeer(imp->imp_connection->c_peer); |
d7e09d03 PT |
1880 | |
1881 | ptlrpc_unregister_reply(req, async_unlink); | |
1882 | ptlrpc_unregister_bulk(req, async_unlink); | |
1883 | ||
1884 | if (obd_dump_on_timeout) | |
1885 | libcfs_debug_dumplog(); | |
1886 | ||
8b382089 | 1887 | if (!imp) { |
d7e09d03 | 1888 | DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); |
0a3bdb00 | 1889 | return 1; |
d7e09d03 PT |
1890 | } |
1891 | ||
1892 | atomic_inc(&imp->imp_timeouts); | |
1893 | ||
1894 | /* The DLM server doesn't want recovery run on its imports. */ | |
1895 | if (imp->imp_dlm_fake) | |
0a3bdb00 | 1896 | return 1; |
d7e09d03 | 1897 | |
ce1c42ed SW |
1898 | /* |
1899 | * If this request is for recovery or other primordial tasks, | |
1900 | * then error it out here. | |
1901 | */ | |
d7e09d03 PT |
1902 | if (req->rq_ctx_init || req->rq_ctx_fini || |
1903 | req->rq_send_state != LUSTRE_IMP_FULL || | |
1904 | imp->imp_obd->obd_no_recov) { | |
1905 | DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", | |
1906 | ptlrpc_import_state_name(req->rq_send_state), | |
1907 | ptlrpc_import_state_name(imp->imp_state)); | |
1908 | spin_lock(&req->rq_lock); | |
1909 | req->rq_status = -ETIMEDOUT; | |
1910 | req->rq_err = 1; | |
1911 | spin_unlock(&req->rq_lock); | |
0a3bdb00 | 1912 | return 1; |
d7e09d03 PT |
1913 | } |
1914 | ||
ce1c42ed SW |
1915 | /* |
1916 | * if a request can't be resent we can't wait for an answer after | |
1917 | * the timeout | |
1918 | */ | |
d7e09d03 PT |
1919 | if (ptlrpc_no_resend(req)) { |
1920 | DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); | |
1921 | rc = 1; | |
1922 | } | |
1923 | ||
1924 | ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); | |
1925 | ||
0a3bdb00 | 1926 | return rc; |
d7e09d03 PT |
1927 | } |
1928 | ||
1929 | /** | |
1930 | * Time out all uncompleted requests in request set pointed by \a data | |
1931 | * Callback used when waiting on sets with l_wait_event. | |
1932 | * Always returns 1. | |
1933 | */ | |
1934 | int ptlrpc_expired_set(void *data) | |
1935 | { | |
1936 | struct ptlrpc_request_set *set = data; | |
d0bfef31 | 1937 | struct list_head *tmp; |
219e6de6 | 1938 | time64_t now = ktime_get_real_seconds(); |
d7e09d03 | 1939 | |
ce1c42ed | 1940 | /* A timeout expired. See which reqs it applies to... */ |
3949015e | 1941 | list_for_each(tmp, &set->set_requests) { |
d7e09d03 | 1942 | struct ptlrpc_request *req = |
30c0aa39 | 1943 | list_entry(tmp, struct ptlrpc_request, rq_set_chain); |
d7e09d03 PT |
1944 | |
1945 | /* don't expire request waiting for context */ | |
1946 | if (req->rq_wait_ctx) | |
1947 | continue; | |
1948 | ||
1949 | /* Request in-flight? */ | |
1950 | if (!((req->rq_phase == RQ_PHASE_RPC && | |
1951 | !req->rq_waiting && !req->rq_resend) || | |
1952 | (req->rq_phase == RQ_PHASE_BULK))) | |
1953 | continue; | |
1954 | ||
1955 | if (req->rq_timedout || /* already dealt with */ | |
1956 | req->rq_deadline > now) /* not expired */ | |
1957 | continue; | |
1958 | ||
ce1c42ed SW |
1959 | /* |
1960 | * Deal with this guy. Do it asynchronously to not block | |
1961 | * ptlrpcd thread. | |
1962 | */ | |
d7e09d03 PT |
1963 | ptlrpc_expire_one_request(req, 1); |
1964 | } | |
1965 | ||
1966 | /* | |
1967 | * When waiting for a whole set, we always break out of the | |
1968 | * sleep so we can recalculate the timeout, or enable interrupts | |
1969 | * if everyone's timed out. | |
1970 | */ | |
0a3bdb00 | 1971 | return 1; |
d7e09d03 PT |
1972 | } |
1973 | EXPORT_SYMBOL(ptlrpc_expired_set); | |
1974 | ||
1975 | /** | |
1976 | * Sets rq_intr flag in \a req under spinlock. | |
1977 | */ | |
1978 | void ptlrpc_mark_interrupted(struct ptlrpc_request *req) | |
1979 | { | |
1980 | spin_lock(&req->rq_lock); | |
1981 | req->rq_intr = 1; | |
1982 | spin_unlock(&req->rq_lock); | |
1983 | } | |
1984 | EXPORT_SYMBOL(ptlrpc_mark_interrupted); | |
1985 | ||
1986 | /** | |
1987 | * Interrupts (sets interrupted flag) all uncompleted requests in | |
1988 | * a set \a data. Callback for l_wait_event for interruptible waits. | |
1989 | */ | |
1990 | void ptlrpc_interrupted_set(void *data) | |
1991 | { | |
1992 | struct ptlrpc_request_set *set = data; | |
1993 | struct list_head *tmp; | |
1994 | ||
d7e09d03 PT |
1995 | CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); |
1996 | ||
1997 | list_for_each(tmp, &set->set_requests) { | |
1998 | struct ptlrpc_request *req = | |
30c0aa39 | 1999 | list_entry(tmp, struct ptlrpc_request, rq_set_chain); |
d7e09d03 PT |
2000 | |
2001 | if (req->rq_phase != RQ_PHASE_RPC && | |
2002 | req->rq_phase != RQ_PHASE_UNREGISTERING) | |
2003 | continue; | |
2004 | ||
2005 | ptlrpc_mark_interrupted(req); | |
2006 | } | |
2007 | } | |
2008 | EXPORT_SYMBOL(ptlrpc_interrupted_set); | |
2009 | ||
2010 | /** | |
2011 | * Get the smallest timeout in the set; this does NOT set a timeout. | |
2012 | */ | |
2013 | int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) | |
2014 | { | |
d0bfef31 | 2015 | struct list_head *tmp; |
219e6de6 | 2016 | time64_t now = ktime_get_real_seconds(); |
d0bfef31 | 2017 | int timeout = 0; |
d7e09d03 | 2018 | struct ptlrpc_request *req; |
219e6de6 | 2019 | time64_t deadline; |
d7e09d03 | 2020 | |
d7e09d03 PT |
2021 | list_for_each(tmp, &set->set_requests) { |
2022 | req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); | |
2023 | ||
ce1c42ed | 2024 | /* Request in-flight? */ |
d7e09d03 PT |
2025 | if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || |
2026 | (req->rq_phase == RQ_PHASE_BULK) || | |
2027 | (req->rq_phase == RQ_PHASE_NEW))) | |
2028 | continue; | |
2029 | ||
ce1c42ed | 2030 | /* Already timed out. */ |
d7e09d03 PT |
2031 | if (req->rq_timedout) |
2032 | continue; | |
2033 | ||
ce1c42ed | 2034 | /* Waiting for ctx. */ |
d7e09d03 PT |
2035 | if (req->rq_wait_ctx) |
2036 | continue; | |
2037 | ||
2038 | if (req->rq_phase == RQ_PHASE_NEW) | |
2039 | deadline = req->rq_sent; | |
2040 | else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) | |
2041 | deadline = req->rq_sent; | |
2042 | else | |
2043 | deadline = req->rq_sent + req->rq_timeout; | |
2044 | ||
2045 | if (deadline <= now) /* actually expired already */ | |
2046 | timeout = 1; /* ASAP */ | |
2047 | else if (timeout == 0 || timeout > deadline - now) | |
2048 | timeout = deadline - now; | |
2049 | } | |
0a3bdb00 | 2050 | return timeout; |
d7e09d03 PT |
2051 | } |
2052 | EXPORT_SYMBOL(ptlrpc_set_next_timeout); | |
2053 | ||
2054 | /** | |
930cef9a | 2055 | * Send all unset request from the set and then wait until all |
d7e09d03 PT |
2056 | * requests in the set complete (either get a reply, timeout, get an |
2057 | * error or otherwise be interrupted). | |
2058 | * Returns 0 on success or error code otherwise. | |
2059 | */ | |
2060 | int ptlrpc_set_wait(struct ptlrpc_request_set *set) | |
2061 | { | |
d0bfef31 | 2062 | struct list_head *tmp; |
d7e09d03 | 2063 | struct ptlrpc_request *req; |
d0bfef31 CH |
2064 | struct l_wait_info lwi; |
2065 | int rc, timeout; | |
d7e09d03 PT |
2066 | |
2067 | if (set->set_producer) | |
2068 | (void)ptlrpc_set_producer(set); | |
2069 | else | |
2070 | list_for_each(tmp, &set->set_requests) { | |
2071 | req = list_entry(tmp, struct ptlrpc_request, | |
30c0aa39 | 2072 | rq_set_chain); |
d7e09d03 PT |
2073 | if (req->rq_phase == RQ_PHASE_NEW) |
2074 | (void)ptlrpc_send_new_req(req); | |
2075 | } | |
2076 | ||
2077 | if (list_empty(&set->set_requests)) | |
0a3bdb00 | 2078 | return 0; |
d7e09d03 PT |
2079 | |
2080 | do { | |
2081 | timeout = ptlrpc_set_next_timeout(set); | |
2082 | ||
ce1c42ed SW |
2083 | /* |
2084 | * wait until all complete, interrupted, or an in-flight | |
2085 | * req times out | |
2086 | */ | |
d7e09d03 PT |
2087 | CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n", |
2088 | set, timeout); | |
2089 | ||
2090 | if (timeout == 0 && !cfs_signal_pending()) | |
2091 | /* | |
2092 | * No requests are in-flight (ether timed out | |
2093 | * or delayed), so we can allow interrupts. | |
2094 | * We still want to block for a limited time, | |
2095 | * so we allow interrupts during the timeout. | |
2096 | */ | |
2097 | lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1), | |
2098 | ptlrpc_expired_set, | |
2099 | ptlrpc_interrupted_set, set); | |
2100 | else | |
2101 | /* | |
2102 | * At least one request is in flight, so no | |
2103 | * interrupts are allowed. Wait until all | |
2104 | * complete, or an in-flight req times out. | |
2105 | */ | |
0ae015be | 2106 | lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), |
d7e09d03 PT |
2107 | ptlrpc_expired_set, set); |
2108 | ||
2109 | rc = l_wait_event(set->set_waitq, ptlrpc_check_set(NULL, set), &lwi); | |
2110 | ||
ce1c42ed SW |
2111 | /* |
2112 | * LU-769 - if we ignored the signal because it was already | |
d7e09d03 | 2113 | * pending when we started, we need to handle it now or we risk |
ce1c42ed SW |
2114 | * it being ignored forever |
2115 | */ | |
d7e09d03 PT |
2116 | if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr && |
2117 | cfs_signal_pending()) { | |
2118 | sigset_t blocked_sigs = | |
2119 | cfs_block_sigsinv(LUSTRE_FATAL_SIGS); | |
2120 | ||
ce1c42ed SW |
2121 | /* |
2122 | * In fact we only interrupt for the "fatal" signals | |
d7e09d03 PT |
2123 | * like SIGINT or SIGKILL. We still ignore less |
2124 | * important signals since ptlrpc set is not easily | |
ce1c42ed SW |
2125 | * reentrant from userspace again |
2126 | */ | |
d7e09d03 PT |
2127 | if (cfs_signal_pending()) |
2128 | ptlrpc_interrupted_set(set); | |
2129 | cfs_restore_sigs(blocked_sigs); | |
2130 | } | |
2131 | ||
2132 | LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); | |
2133 | ||
ce1c42ed SW |
2134 | /* |
2135 | * -EINTR => all requests have been flagged rq_intr so next | |
d7e09d03 PT |
2136 | * check completes. |
2137 | * -ETIMEDOUT => someone timed out. When all reqs have | |
2138 | * timed out, signals are enabled allowing completion with | |
2139 | * EINTR. | |
2140 | * I don't really care if we go once more round the loop in | |
ce1c42ed SW |
2141 | * the error cases -eeb. |
2142 | */ | |
d7e09d03 PT |
2143 | if (rc == 0 && atomic_read(&set->set_remaining) == 0) { |
2144 | list_for_each(tmp, &set->set_requests) { | |
2145 | req = list_entry(tmp, struct ptlrpc_request, | |
30c0aa39 | 2146 | rq_set_chain); |
d7e09d03 PT |
2147 | spin_lock(&req->rq_lock); |
2148 | req->rq_invalid_rqset = 1; | |
2149 | spin_unlock(&req->rq_lock); | |
2150 | } | |
2151 | } | |
2152 | } while (rc != 0 || atomic_read(&set->set_remaining) != 0); | |
2153 | ||
2154 | LASSERT(atomic_read(&set->set_remaining) == 0); | |
2155 | ||
2156 | rc = set->set_rc; /* rq_status of already freed requests if any */ | |
2157 | list_for_each(tmp, &set->set_requests) { | |
2158 | req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); | |
2159 | ||
2160 | LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); | |
2161 | if (req->rq_status != 0) | |
2162 | rc = req->rq_status; | |
2163 | } | |
2164 | ||
8b382089 | 2165 | if (set->set_interpret) { |
0ae015be | 2166 | int (*interpreter)(struct ptlrpc_request_set *set, void *, int) = |
d7e09d03 | 2167 | set->set_interpret; |
295ce74f | 2168 | rc = interpreter(set, set->set_arg, rc); |
d7e09d03 PT |
2169 | } else { |
2170 | struct ptlrpc_set_cbdata *cbdata, *n; | |
2171 | int err; | |
2172 | ||
2173 | list_for_each_entry_safe(cbdata, n, | |
2174 | &set->set_cblist, psc_item) { | |
2175 | list_del_init(&cbdata->psc_item); | |
2176 | err = cbdata->psc_interpret(set, cbdata->psc_data, rc); | |
2177 | if (err && !rc) | |
2178 | rc = err; | |
9ae10597 | 2179 | kfree(cbdata); |
d7e09d03 PT |
2180 | } |
2181 | } | |
2182 | ||
0a3bdb00 | 2183 | return rc; |
d7e09d03 PT |
2184 | } |
2185 | EXPORT_SYMBOL(ptlrpc_set_wait); | |
2186 | ||
2187 | /** | |
930cef9a | 2188 | * Helper function for request freeing. |
d7e09d03 PT |
2189 | * Called when request count reached zero and request needs to be freed. |
2190 | * Removes request from all sorts of sending/replay lists it might be on, | |
2191 | * frees network buffers if any are present. | |
2192 | * If \a locked is set, that means caller is already holding import imp_lock | |
2193 | * and so we no longer need to reobtain it (for certain lists manipulations) | |
2194 | */ | |
2195 | static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) | |
2196 | { | |
8b382089 | 2197 | if (!request) |
d7e09d03 | 2198 | return; |
d7e09d03 | 2199 | LASSERTF(!request->rq_receiving_reply, "req %p\n", request); |
8b382089 | 2200 | LASSERTF(!request->rq_rqbd, "req %p\n", request);/* client-side */ |
d7e09d03 PT |
2201 | LASSERTF(list_empty(&request->rq_list), "req %p\n", request); |
2202 | LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); | |
2203 | LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request); | |
2204 | LASSERTF(!request->rq_replay, "req %p\n", request); | |
2205 | ||
2206 | req_capsule_fini(&request->rq_pill); | |
2207 | ||
ce1c42ed SW |
2208 | /* |
2209 | * We must take it off the imp_replay_list first. Otherwise, we'll set | |
2210 | * request->rq_reqmsg to NULL while osc_close is dereferencing it. | |
2211 | */ | |
8b382089 | 2212 | if (request->rq_import) { |
d7e09d03 PT |
2213 | if (!locked) |
2214 | spin_lock(&request->rq_import->imp_lock); | |
2215 | list_del_init(&request->rq_replay_list); | |
2216 | if (!locked) | |
2217 | spin_unlock(&request->rq_import->imp_lock); | |
2218 | } | |
2219 | LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); | |
2220 | ||
2221 | if (atomic_read(&request->rq_refcount) != 0) { | |
2222 | DEBUG_REQ(D_ERROR, request, | |
2223 | "freeing request with nonzero refcount"); | |
2224 | LBUG(); | |
2225 | } | |
2226 | ||
8b382089 | 2227 | if (request->rq_repbuf) |
d7e09d03 | 2228 | sptlrpc_cli_free_repbuf(request); |
8b382089 | 2229 | if (request->rq_export) { |
d7e09d03 PT |
2230 | class_export_put(request->rq_export); |
2231 | request->rq_export = NULL; | |
2232 | } | |
8b382089 | 2233 | if (request->rq_import) { |
d7e09d03 PT |
2234 | class_import_put(request->rq_import); |
2235 | request->rq_import = NULL; | |
2236 | } | |
8b382089 | 2237 | if (request->rq_bulk) |
d7e09d03 PT |
2238 | ptlrpc_free_bulk_pin(request->rq_bulk); |
2239 | ||
8b382089 | 2240 | if (request->rq_reqbuf || request->rq_clrbuf) |
d7e09d03 PT |
2241 | sptlrpc_cli_free_reqbuf(request); |
2242 | ||
2243 | if (request->rq_cli_ctx) | |
2244 | sptlrpc_req_put_ctx(request, !locked); | |
2245 | ||
2246 | if (request->rq_pool) | |
2247 | __ptlrpc_free_req_to_pool(request); | |
2248 | else | |
35b2e1b7 | 2249 | ptlrpc_request_cache_free(request); |
d7e09d03 PT |
2250 | } |
2251 | ||
d7e09d03 PT |
2252 | /** |
2253 | * Helper function | |
2254 | * Drops one reference count for request \a request. | |
2255 | * \a locked set indicates that caller holds import imp_lock. | |
930cef9a | 2256 | * Frees the request when reference count reaches zero. |
d7e09d03 PT |
2257 | */ |
2258 | static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) | |
2259 | { | |
8b382089 | 2260 | if (!request) |
0a3bdb00 | 2261 | return 1; |
d7e09d03 PT |
2262 | |
2263 | if (request == LP_POISON || | |
2264 | request->rq_reqmsg == LP_POISON) { | |
2265 | CERROR("dereferencing freed request (bug 575)\n"); | |
2266 | LBUG(); | |
0a3bdb00 | 2267 | return 1; |
d7e09d03 PT |
2268 | } |
2269 | ||
2270 | DEBUG_REQ(D_INFO, request, "refcount now %u", | |
2271 | atomic_read(&request->rq_refcount) - 1); | |
2272 | ||
2273 | if (atomic_dec_and_test(&request->rq_refcount)) { | |
2274 | __ptlrpc_free_req(request, locked); | |
0a3bdb00 | 2275 | return 1; |
d7e09d03 PT |
2276 | } |
2277 | ||
0a3bdb00 | 2278 | return 0; |
d7e09d03 PT |
2279 | } |
2280 | ||
/**
 * Drops one reference on \a request, for callers that do not hold the
 * import's imp_lock (passes locked == 0 to __ptlrpc_req_finished()).
 */
void ptlrpc_req_finished(struct ptlrpc_request *request)
{
	__ptlrpc_req_finished(request, 0);
}
2288 | EXPORT_SYMBOL(ptlrpc_req_finished); | |
2289 | ||
2290 | /** | |
2291 | * Returns xid of a \a request | |
2292 | */ | |
2293 | __u64 ptlrpc_req_xid(struct ptlrpc_request *request) | |
2294 | { | |
2295 | return request->rq_xid; | |
2296 | } | |
2297 | EXPORT_SYMBOL(ptlrpc_req_xid); | |
2298 | ||
2299 | /** | |
2300 | * Disengage the client's reply buffer from the network | |
2301 | * NB does _NOT_ unregister any client-side bulk. | |
2302 | * IDEMPOTENT, but _not_ safe against concurrent callers. | |
2303 | * The request owner (i.e. the thread doing the I/O) must call... | |
2304 | * Returns 0 on success or 1 if unregistering cannot be made. | |
2305 | */ | |
2306 | int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) | |
2307 | { | |
d0bfef31 CH |
2308 | int rc; |
2309 | wait_queue_head_t *wq; | |
d7e09d03 PT |
2310 | struct l_wait_info lwi; |
2311 | ||
ce1c42ed | 2312 | /* Might sleep. */ |
d7e09d03 PT |
2313 | LASSERT(!in_interrupt()); |
2314 | ||
ce1c42ed | 2315 | /* Let's setup deadline for reply unlink. */ |
d7e09d03 PT |
2316 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && |
2317 | async && request->rq_reply_deadline == 0) | |
219e6de6 | 2318 | request->rq_reply_deadline = ktime_get_real_seconds()+LONG_UNLINK; |
d7e09d03 | 2319 | |
ce1c42ed | 2320 | /* Nothing left to do. */ |
d7e09d03 | 2321 | if (!ptlrpc_client_recv_or_unlink(request)) |
0a3bdb00 | 2322 | return 1; |
d7e09d03 PT |
2323 | |
2324 | LNetMDUnlink(request->rq_reply_md_h); | |
2325 | ||
ce1c42ed | 2326 | /* Let's check it once again. */ |
d7e09d03 | 2327 | if (!ptlrpc_client_recv_or_unlink(request)) |
0a3bdb00 | 2328 | return 1; |
d7e09d03 | 2329 | |
ce1c42ed | 2330 | /* Move to "Unregistering" phase as reply was not unlinked yet. */ |
d7e09d03 PT |
2331 | ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING); |
2332 | ||
ce1c42ed | 2333 | /* Do not wait for unlink to finish. */ |
d7e09d03 | 2334 | if (async) |
0a3bdb00 | 2335 | return 0; |
d7e09d03 PT |
2336 | |
2337 | /* | |
2338 | * We have to l_wait_event() whatever the result, to give liblustre | |
2339 | * a chance to run reply_in_callback(), and to make sure we've | |
2340 | * unlinked before returning a req to the pool. | |
2341 | */ | |
8b382089 | 2342 | if (request->rq_set) |
d7e09d03 PT |
2343 | wq = &request->rq_set->set_waitq; |
2344 | else | |
2345 | wq = &request->rq_reply_waitq; | |
2346 | ||
2347 | for (;;) { | |
ce1c42ed SW |
2348 | /* |
2349 | * Network access will complete in finite time but the HUGE | |
2350 | * timeout lets us CWARN for visibility of sluggish NALs | |
2351 | */ | |
d7e09d03 PT |
2352 | lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), |
2353 | cfs_time_seconds(1), NULL, NULL); | |
2354 | rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), | |
2355 | &lwi); | |
2356 | if (rc == 0) { | |
2357 | ptlrpc_rqphase_move(request, request->rq_next_phase); | |
0a3bdb00 | 2358 | return 1; |
d7e09d03 PT |
2359 | } |
2360 | ||
2361 | LASSERT(rc == -ETIMEDOUT); | |
cf378ff7 AL |
2362 | DEBUG_REQ(D_WARNING, request, |
2363 | "Unexpectedly long timeout rvcng=%d unlnk=%d/%d", | |
2364 | request->rq_receiving_reply, | |
2365 | request->rq_req_unlink, request->rq_reply_unlink); | |
d7e09d03 | 2366 | } |
0a3bdb00 | 2367 | return 0; |
d7e09d03 PT |
2368 | } |
2369 | EXPORT_SYMBOL(ptlrpc_unregister_reply); | |
2370 | ||
63d42578 HZ |
2371 | static void ptlrpc_free_request(struct ptlrpc_request *req) |
2372 | { | |
2373 | spin_lock(&req->rq_lock); | |
2374 | req->rq_replay = 0; | |
2375 | spin_unlock(&req->rq_lock); | |
2376 | ||
8b382089 | 2377 | if (req->rq_commit_cb) |
63d42578 HZ |
2378 | req->rq_commit_cb(req); |
2379 | list_del_init(&req->rq_replay_list); | |
2380 | ||
2381 | __ptlrpc_req_finished(req, 1); | |
2382 | } | |
2383 | ||
2384 | /** | |
2385 | * the request is committed and dropped from the replay list of its import | |
2386 | */ | |
2387 | void ptlrpc_request_committed(struct ptlrpc_request *req, int force) | |
2388 | { | |
2389 | struct obd_import *imp = req->rq_import; | |
2390 | ||
2391 | spin_lock(&imp->imp_lock); | |
2392 | if (list_empty(&req->rq_replay_list)) { | |
2393 | spin_unlock(&imp->imp_lock); | |
2394 | return; | |
2395 | } | |
2396 | ||
2397 | if (force || req->rq_transno <= imp->imp_peer_committed_transno) | |
2398 | ptlrpc_free_request(req); | |
2399 | ||
2400 | spin_unlock(&imp->imp_lock); | |
2401 | } | |
2402 | EXPORT_SYMBOL(ptlrpc_request_committed); | |
2403 | ||
d7e09d03 PT |
2404 | /** |
2405 | * Iterates through replay_list on import and prunes | |
2406 | * all requests have transno smaller than last_committed for the | |
2407 | * import and don't have rq_replay set. | |
930cef9a | 2408 | * Since requests are sorted in transno order, stops when meeting first |
d7e09d03 PT |
2409 | * transno bigger than last_committed. |
2410 | * caller must hold imp->imp_lock | |
2411 | */ | |
2412 | void ptlrpc_free_committed(struct obd_import *imp) | |
2413 | { | |
63d42578 | 2414 | struct ptlrpc_request *req, *saved; |
d7e09d03 | 2415 | struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ |
d0bfef31 | 2416 | bool skip_committed_list = true; |
d7e09d03 | 2417 | |
5e42bc9d | 2418 | assert_spin_locked(&imp->imp_lock); |
d7e09d03 PT |
2419 | |
2420 | if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && | |
2421 | imp->imp_generation == imp->imp_last_generation_checked) { | |
b0f5aad5 | 2422 | CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", |
d7e09d03 | 2423 | imp->imp_obd->obd_name, imp->imp_peer_committed_transno); |
d7e09d03 PT |
2424 | return; |
2425 | } | |
b0f5aad5 | 2426 | CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", |
d7e09d03 PT |
2427 | imp->imp_obd->obd_name, imp->imp_peer_committed_transno, |
2428 | imp->imp_generation); | |
63d42578 HZ |
2429 | |
2430 | if (imp->imp_generation != imp->imp_last_generation_checked) | |
2431 | skip_committed_list = false; | |
2432 | ||
d7e09d03 PT |
2433 | imp->imp_last_transno_checked = imp->imp_peer_committed_transno; |
2434 | imp->imp_last_generation_checked = imp->imp_generation; | |
2435 | ||
63d42578 HZ |
2436 | list_for_each_entry_safe(req, saved, &imp->imp_replay_list, |
2437 | rq_replay_list) { | |
d7e09d03 PT |
2438 | /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ |
2439 | LASSERT(req != last_req); | |
2440 | last_req = req; | |
2441 | ||
2442 | if (req->rq_transno == 0) { | |
2443 | DEBUG_REQ(D_EMERG, req, "zero transno during replay"); | |
2444 | LBUG(); | |
2445 | } | |
2446 | if (req->rq_import_generation < imp->imp_generation) { | |
2447 | DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); | |
a9b3e8f3 | 2448 | goto free_req; |
d7e09d03 PT |
2449 | } |
2450 | ||
d7e09d03 PT |
2451 | /* not yet committed */ |
2452 | if (req->rq_transno > imp->imp_peer_committed_transno) { | |
2453 | DEBUG_REQ(D_RPCTRACE, req, "stopping search"); | |
2454 | break; | |
2455 | } | |
2456 | ||
63d42578 HZ |
2457 | if (req->rq_replay) { |
2458 | DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); | |
2459 | list_move_tail(&req->rq_replay_list, | |
2460 | &imp->imp_committed_list); | |
2461 | continue; | |
2462 | } | |
2463 | ||
b0f5aad5 | 2464 | DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", |
d7e09d03 PT |
2465 | imp->imp_peer_committed_transno); |
2466 | free_req: | |
63d42578 HZ |
2467 | ptlrpc_free_request(req); |
2468 | } | |
2469 | if (skip_committed_list) | |
2470 | return; | |
2471 | ||
2472 | list_for_each_entry_safe(req, saved, &imp->imp_committed_list, | |
2473 | rq_replay_list) { | |
2474 | LASSERT(req->rq_transno != 0); | |
2475 | if (req->rq_import_generation < imp->imp_generation) { | |
2476 | DEBUG_REQ(D_RPCTRACE, req, "free stale open request"); | |
2477 | ptlrpc_free_request(req); | |
2478 | } | |
d7e09d03 | 2479 | } |
d7e09d03 PT |
2480 | } |
2481 | ||
d7e09d03 PT |
2482 | /** |
2483 | * Schedule previously sent request for resend. | |
2484 | * For bulk requests we assign new xid (to avoid problems with | |
2485 | * lost replies and therefore several transfers landing into same buffer | |
2486 | * from different sending attempts). | |
2487 | */ | |
2488 | void ptlrpc_resend_req(struct ptlrpc_request *req) | |
2489 | { | |
2490 | DEBUG_REQ(D_HA, req, "going to resend"); | |
5c689e68 AB |
2491 | spin_lock(&req->rq_lock); |
2492 | ||
ce1c42ed SW |
2493 | /* |
2494 | * Request got reply but linked to the import list still. | |
2495 | * Let ptlrpc_check_set() to process it. | |
2496 | */ | |
5c689e68 AB |
2497 | if (ptlrpc_client_replied(req)) { |
2498 | spin_unlock(&req->rq_lock); | |
2499 | DEBUG_REQ(D_HA, req, "it has reply, so skip it"); | |
2500 | return; | |
2501 | } | |
2502 | ||
d7e09d03 PT |
2503 | lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); |
2504 | req->rq_status = -EAGAIN; | |
2505 | ||
d7e09d03 PT |
2506 | req->rq_resend = 1; |
2507 | req->rq_net_err = 0; | |
2508 | req->rq_timedout = 0; | |
2509 | if (req->rq_bulk) { | |
2510 | __u64 old_xid = req->rq_xid; | |
2511 | ||
2512 | /* ensure previous bulk fails */ | |
2513 | req->rq_xid = ptlrpc_next_xid(); | |
b0f5aad5 | 2514 | CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n", |
d7e09d03 PT |
2515 | old_xid, req->rq_xid); |
2516 | } | |
2517 | ptlrpc_client_wake_req(req); | |
2518 | spin_unlock(&req->rq_lock); | |
2519 | } | |
2520 | EXPORT_SYMBOL(ptlrpc_resend_req); | |
2521 | ||
d7e09d03 PT |
2522 | /** |
2523 | * Grab additional reference on a request \a req | |
2524 | */ | |
2525 | struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) | |
2526 | { | |
d7e09d03 | 2527 | atomic_inc(&req->rq_refcount); |
0a3bdb00 | 2528 | return req; |
d7e09d03 PT |
2529 | } |
2530 | EXPORT_SYMBOL(ptlrpc_request_addref); | |
2531 | ||
2532 | /** | |
2533 | * Add a request to import replay_list. | |
2534 | * Must be called under imp_lock | |
2535 | */ | |
2536 | void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, | |
2537 | struct obd_import *imp) | |
2538 | { | |
2539 | struct list_head *tmp; | |
2540 | ||
5e42bc9d | 2541 | assert_spin_locked(&imp->imp_lock); |
d7e09d03 PT |
2542 | |
2543 | if (req->rq_transno == 0) { | |
2544 | DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); | |
2545 | LBUG(); | |
2546 | } | |
2547 | ||
ce1c42ed SW |
2548 | /* |
2549 | * clear this for new requests that were resent as well | |
2550 | * as resent replayed requests. | |
2551 | */ | |
d7e09d03 PT |
2552 | lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); |
2553 | ||
2554 | /* don't re-add requests that have been replayed */ | |
2555 | if (!list_empty(&req->rq_replay_list)) | |
2556 | return; | |
2557 | ||
2558 | lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); | |
2559 | ||
2560 | LASSERT(imp->imp_replayable); | |
2561 | /* Balanced in ptlrpc_free_committed, usually. */ | |
2562 | ptlrpc_request_addref(req); | |
2563 | list_for_each_prev(tmp, &imp->imp_replay_list) { | |
2564 | struct ptlrpc_request *iter = | |
30c0aa39 | 2565 | list_entry(tmp, struct ptlrpc_request, rq_replay_list); |
d7e09d03 | 2566 | |
ce1c42ed SW |
2567 | /* |
2568 | * We may have duplicate transnos if we create and then | |
d7e09d03 PT |
2569 | * open a file, or for closes retained if to match creating |
2570 | * opens, so use req->rq_xid as a secondary key. | |
2571 | * (See bugs 684, 685, and 428.) | |
2572 | * XXX no longer needed, but all opens need transnos! | |
2573 | */ | |
2574 | if (iter->rq_transno > req->rq_transno) | |
2575 | continue; | |
2576 | ||
2577 | if (iter->rq_transno == req->rq_transno) { | |
2578 | LASSERT(iter->rq_xid != req->rq_xid); | |
2579 | if (iter->rq_xid > req->rq_xid) | |
2580 | continue; | |
2581 | } | |
2582 | ||
2583 | list_add(&req->rq_replay_list, &iter->rq_replay_list); | |
2584 | return; | |
2585 | } | |
2586 | ||
2587 | list_add(&req->rq_replay_list, &imp->imp_replay_list); | |
2588 | } | |
2589 | EXPORT_SYMBOL(ptlrpc_retain_replayable_request); | |
2590 | ||
2591 | /** | |
2592 | * Send request and wait until it completes. | |
2593 | * Returns request processing status. | |
2594 | */ | |
2595 | int ptlrpc_queue_wait(struct ptlrpc_request *req) | |
2596 | { | |
2597 | struct ptlrpc_request_set *set; | |
2598 | int rc; | |
d7e09d03 | 2599 | |
8b382089 | 2600 | LASSERT(!req->rq_set); |
d7e09d03 PT |
2601 | LASSERT(!req->rq_receiving_reply); |
2602 | ||
2603 | set = ptlrpc_prep_set(); | |
8b382089 | 2604 | if (!set) { |
19b2056f | 2605 | CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM); |
0a3bdb00 | 2606 | return -ENOMEM; |
d7e09d03 PT |
2607 | } |
2608 | ||
2609 | /* for distributed debugging */ | |
2610 | lustre_msg_set_status(req->rq_reqmsg, current_pid()); | |
2611 | ||
2612 | /* add a ref for the set (see comment in ptlrpc_set_add_req) */ | |
2613 | ptlrpc_request_addref(req); | |
2614 | ptlrpc_set_add_req(set, req); | |
2615 | rc = ptlrpc_set_wait(set); | |
2616 | ptlrpc_set_destroy(set); | |
2617 | ||
0a3bdb00 | 2618 | return rc; |
d7e09d03 PT |
2619 | } |
2620 | EXPORT_SYMBOL(ptlrpc_queue_wait); | |
2621 | ||
/**
 * Per-request state saved by ptlrpc_replay_req() and consumed by
 * ptlrpc_replay_interpret().
 */
struct ptlrpc_replay_async_args {
	/* rq_send_state of the request before it was queued for replay */
	int praa_old_state;
	/* status of the reply message before replay (0 if there was none) */
	int praa_old_status;
};
2626 | ||
2627 | /** | |
2628 | * Callback used for replayed requests reply processing. | |
930cef9a | 2629 | * In case of successful reply calls registered request replay callback. |
d7e09d03 PT |
2630 | * In case of error restart replay process. |
2631 | */ | |
2632 | static int ptlrpc_replay_interpret(const struct lu_env *env, | |
2633 | struct ptlrpc_request *req, | |
0028d585 | 2634 | void *data, int rc) |
d7e09d03 PT |
2635 | { |
2636 | struct ptlrpc_replay_async_args *aa = data; | |
2637 | struct obd_import *imp = req->rq_import; | |
2638 | ||
d7e09d03 PT |
2639 | atomic_dec(&imp->imp_replay_inflight); |
2640 | ||
2641 | if (!ptlrpc_client_replied(req)) { | |
2642 | CERROR("request replay timed out, restarting recovery\n"); | |
a9b3e8f3 JL |
2643 | rc = -ETIMEDOUT; |
2644 | goto out; | |
d7e09d03 PT |
2645 | } |
2646 | ||
2647 | if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && | |
2648 | (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || | |
a9b3e8f3 JL |
2649 | lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) { |
2650 | rc = lustre_msg_get_status(req->rq_repmsg); | |
2651 | goto out; | |
2652 | } | |
d7e09d03 PT |
2653 | |
2654 | /** VBR: check version failure */ | |
2655 | if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { | |
2656 | /** replay was failed due to version mismatch */ | |
2657 | DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); | |
2658 | spin_lock(&imp->imp_lock); | |
2659 | imp->imp_vbr_failed = 1; | |
2660 | imp->imp_no_lock_replay = 1; | |
2661 | spin_unlock(&imp->imp_lock); | |
2662 | lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); | |
2663 | } else { | |
2664 | /** The transno had better not change over replay. */ | |
2665 | LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == | |
2666 | lustre_msg_get_transno(req->rq_repmsg) || | |
2667 | lustre_msg_get_transno(req->rq_repmsg) == 0, | |
55f5a824 | 2668 | "%#llx/%#llx\n", |
d7e09d03 PT |
2669 | lustre_msg_get_transno(req->rq_reqmsg), |
2670 | lustre_msg_get_transno(req->rq_repmsg)); | |
2671 | } | |
2672 | ||
2673 | spin_lock(&imp->imp_lock); | |
2674 | /** if replays by version then gap occur on server, no trust to locks */ | |
2675 | if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) | |
2676 | imp->imp_no_lock_replay = 1; | |
2677 | imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); | |
2678 | spin_unlock(&imp->imp_lock); | |
2679 | LASSERT(imp->imp_last_replay_transno); | |
2680 | ||
2681 | /* transaction number shouldn't be bigger than the latest replayed */ | |
2682 | if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { | |
2683 | DEBUG_REQ(D_ERROR, req, | |
b0f5aad5 GKH |
2684 | "Reported transno %llu is bigger than the replayed one: %llu", |
2685 | req->rq_transno, | |
d7e09d03 | 2686 | lustre_msg_get_transno(req->rq_reqmsg)); |
a9b3e8f3 JL |
2687 | rc = -EINVAL; |
2688 | goto out; | |
d7e09d03 PT |
2689 | } |
2690 | ||
2691 | DEBUG_REQ(D_HA, req, "got rep"); | |
2692 | ||
2693 | /* let the callback do fixups, possibly including in the request */ | |
2694 | if (req->rq_replay_cb) | |
2695 | req->rq_replay_cb(req); | |
2696 | ||
2697 | if (ptlrpc_client_replied(req) && | |
2698 | lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { | |
2699 | DEBUG_REQ(D_ERROR, req, "status %d, old was %d", | |
2700 | lustre_msg_get_status(req->rq_repmsg), | |
2701 | aa->praa_old_status); | |
2702 | } else { | |
2703 | /* Put it back for re-replay. */ | |
2704 | lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); | |
2705 | } | |
2706 | ||
2707 | /* | |
2708 | * Errors while replay can set transno to 0, but | |
2709 | * imp_last_replay_transno shouldn't be set to 0 anyway | |
2710 | */ | |
2711 | if (req->rq_transno == 0) | |
2712 | CERROR("Transno is 0 during replay!\n"); | |
2713 | ||
2714 | /* continue with recovery */ | |
2715 | rc = ptlrpc_import_recovery_state_machine(imp); | |
2716 | out: | |
2717 | req->rq_send_state = aa->praa_old_state; | |
2718 | ||
2719 | if (rc != 0) | |
2720 | /* this replay failed, so restart recovery */ | |
2721 | ptlrpc_connect_import(imp); | |
2722 | ||
0a3bdb00 | 2723 | return rc; |
d7e09d03 PT |
2724 | } |
2725 | ||
2726 | /** | |
2727 | * Prepares and queues request for replay. | |
2728 | * Adds it to ptlrpcd queue for actual sending. | |
2729 | * Returns 0 on success. | |
2730 | */ | |
2731 | int ptlrpc_replay_req(struct ptlrpc_request *req) | |
2732 | { | |
2733 | struct ptlrpc_replay_async_args *aa; | |
d7e09d03 PT |
2734 | |
2735 | LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); | |
2736 | ||
3949015e | 2737 | LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); |
d7e09d03 | 2738 | aa = ptlrpc_req_async_args(req); |
ec83e611 | 2739 | memset(aa, 0, sizeof(*aa)); |
d7e09d03 PT |
2740 | |
2741 | /* Prepare request to be resent with ptlrpcd */ | |
2742 | aa->praa_old_state = req->rq_send_state; | |
2743 | req->rq_send_state = LUSTRE_IMP_REPLAY; | |
2744 | req->rq_phase = RQ_PHASE_NEW; | |
2745 | req->rq_next_phase = RQ_PHASE_UNDEFINED; | |
2746 | if (req->rq_repmsg) | |
2747 | aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); | |
2748 | req->rq_status = 0; | |
2749 | req->rq_interpret_reply = ptlrpc_replay_interpret; | |
2750 | /* Readjust the timeout for current conditions */ | |
2751 | ptlrpc_at_set_req_timeout(req); | |
2752 | ||
ce1c42ed SW |
2753 | /* |
2754 | * Tell server the net_latency, so the server can calculate how long | |
2755 | * it should wait for next replay | |
2756 | */ | |
d7e09d03 PT |
2757 | lustre_msg_set_service_time(req->rq_reqmsg, |
2758 | ptlrpc_at_get_net_latency(req)); | |
2759 | DEBUG_REQ(D_HA, req, "REPLAY"); | |
2760 | ||
2761 | atomic_inc(&req->rq_import->imp_replay_inflight); | |
2762 | ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ | |
2763 | ||
c5c4c6fa | 2764 | ptlrpcd_add_req(req); |
0a3bdb00 | 2765 | return 0; |
d7e09d03 PT |
2766 | } |
2767 | EXPORT_SYMBOL(ptlrpc_replay_req); | |
2768 | ||
2769 | /** | |
2770 | * Aborts all in-flight request on import \a imp sending and delayed lists | |
2771 | */ | |
2772 | void ptlrpc_abort_inflight(struct obd_import *imp) | |
2773 | { | |
2774 | struct list_head *tmp, *n; | |
d7e09d03 | 2775 | |
ce1c42ed SW |
2776 | /* |
2777 | * Make sure that no new requests get processed for this import. | |
d7e09d03 PT |
2778 | * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing |
2779 | * this flag and then putting requests on sending_list or delayed_list. | |
2780 | */ | |
2781 | spin_lock(&imp->imp_lock); | |
2782 | ||
ce1c42ed SW |
2783 | /* |
2784 | * XXX locking? Maybe we should remove each request with the list | |
d7e09d03 PT |
2785 | * locked? Also, how do we know if the requests on the list are |
2786 | * being freed at this time? | |
2787 | */ | |
2788 | list_for_each_safe(tmp, n, &imp->imp_sending_list) { | |
2789 | struct ptlrpc_request *req = | |
2790 | list_entry(tmp, struct ptlrpc_request, rq_list); | |
2791 | ||
2792 | DEBUG_REQ(D_RPCTRACE, req, "inflight"); | |
2793 | ||
2794 | spin_lock(&req->rq_lock); | |
2795 | if (req->rq_import_generation < imp->imp_generation) { | |
2796 | req->rq_err = 1; | |
2797 | req->rq_status = -EIO; | |
2798 | ptlrpc_client_wake_req(req); | |
2799 | } | |
2800 | spin_unlock(&req->rq_lock); | |
2801 | } | |
2802 | ||
2803 | list_for_each_safe(tmp, n, &imp->imp_delayed_list) { | |
2804 | struct ptlrpc_request *req = | |
2805 | list_entry(tmp, struct ptlrpc_request, rq_list); | |
2806 | ||
2807 | DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); | |
2808 | ||
2809 | spin_lock(&req->rq_lock); | |
2810 | if (req->rq_import_generation < imp->imp_generation) { | |
2811 | req->rq_err = 1; | |
2812 | req->rq_status = -EIO; | |
2813 | ptlrpc_client_wake_req(req); | |
2814 | } | |
2815 | spin_unlock(&req->rq_lock); | |
2816 | } | |
2817 | ||
ce1c42ed SW |
2818 | /* |
2819 | * Last chance to free reqs left on the replay list, but we | |
2820 | * will still leak reqs that haven't committed. | |
2821 | */ | |
d7e09d03 PT |
2822 | if (imp->imp_replayable) |
2823 | ptlrpc_free_committed(imp); | |
2824 | ||
2825 | spin_unlock(&imp->imp_lock); | |
d7e09d03 PT |
2826 | } |
2827 | EXPORT_SYMBOL(ptlrpc_abort_inflight); | |
2828 | ||
2829 | /** | |
2830 | * Abort all uncompleted requests in request set \a set | |
2831 | */ | |
2832 | void ptlrpc_abort_set(struct ptlrpc_request_set *set) | |
2833 | { | |
2834 | struct list_head *tmp, *pos; | |
2835 | ||
d7e09d03 PT |
2836 | list_for_each_safe(pos, tmp, &set->set_requests) { |
2837 | struct ptlrpc_request *req = | |
30c0aa39 | 2838 | list_entry(pos, struct ptlrpc_request, rq_set_chain); |
d7e09d03 PT |
2839 | |
2840 | spin_lock(&req->rq_lock); | |
2841 | if (req->rq_phase != RQ_PHASE_RPC) { | |
2842 | spin_unlock(&req->rq_lock); | |
2843 | continue; | |
2844 | } | |
2845 | ||
2846 | req->rq_err = 1; | |
2847 | req->rq_status = -EINTR; | |
2848 | ptlrpc_client_wake_req(req); | |
2849 | spin_unlock(&req->rq_lock); | |
2850 | } | |
2851 | } | |
2852 | ||
2853 | static __u64 ptlrpc_last_xid; | |
2854 | static spinlock_t ptlrpc_last_xid_lock; | |
2855 | ||
2856 | /** | |
2857 | * Initialize the XID for the node. This is common among all requests on | |
2858 | * this node, and only requires the property that it is monotonically | |
2859 | * increasing. It does not need to be sequential. Since this is also used | |
2860 | * as the RDMA match bits, it is important that a single client NOT have | |
2861 | * the same match bits for two different in-flight requests, hence we do | |
2862 | * NOT want to have an XID per target or similar. | |
2863 | * | |
2864 | * To avoid an unlikely collision between match bits after a client reboot | |
2865 | * (which would deliver old data into the wrong RDMA buffer) initialize | |
2866 | * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. | |
2867 | * If the time is clearly incorrect, we instead use a 62-bit random number. | |
2868 | * In the worst case the random number will overflow 1M RPCs per second in | |
2869 | * 9133 years, or permutations thereof. | |
2870 | */ | |
2871 | #define YEAR_2004 (1ULL << 30) | |
2872 | void ptlrpc_init_xid(void) | |
2873 | { | |
219e6de6 | 2874 | time64_t now = ktime_get_real_seconds(); |
d7e09d03 PT |
2875 | |
2876 | spin_lock_init(&ptlrpc_last_xid_lock); | |
2877 | if (now < YEAR_2004) { | |
2878 | cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); | |
2879 | ptlrpc_last_xid >>= 2; | |
2880 | ptlrpc_last_xid |= (1ULL << 61); | |
2881 | } else { | |
2882 | ptlrpc_last_xid = (__u64)now << 20; | |
2883 | } | |
2884 | ||
930cef9a | 2885 | /* Always need to be aligned to a power-of-two for multi-bulk BRW */ |
2b241d31 | 2886 | CLASSERT(((PTLRPC_BULK_OPS_COUNT - 1) & PTLRPC_BULK_OPS_COUNT) == 0); |
d7e09d03 PT |
2887 | ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; |
2888 | } | |
2889 | ||
2890 | /** | |
2891 | * Increase xid and returns resulting new value to the caller. | |
2892 | * | |
2893 | * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting | |
2894 | * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC | |
2895 | * itself uses the last bulk xid needed, so the server can determine the | |
2896 | * the number of bulk transfers from the RPC XID and a bitmask. The starting | |
2897 | * xid must align to a power-of-two value. | |
2898 | * | |
2899 | * This is assumed to be true due to the initial ptlrpc_last_xid | |
2900 | * value also being initialized to a power-of-two value. LU-1431 | |
2901 | */ | |
2902 | __u64 ptlrpc_next_xid(void) | |
2903 | { | |
2904 | __u64 next; | |
2905 | ||
2906 | spin_lock(&ptlrpc_last_xid_lock); | |
2907 | next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; | |
2908 | ptlrpc_last_xid = next; | |
2909 | spin_unlock(&ptlrpc_last_xid_lock); | |
2910 | ||
2911 | return next; | |
2912 | } | |
2913 | EXPORT_SYMBOL(ptlrpc_next_xid); | |
2914 | ||
2915 | /** | |
2916 | * Get a glimpse at what next xid value might have been. | |
2917 | * Returns possible next xid. | |
2918 | */ | |
2919 | __u64 ptlrpc_sample_next_xid(void) | |
2920 | { | |
2921 | #if BITS_PER_LONG == 32 | |
2922 | /* need to avoid possible word tearing on 32-bit systems */ | |
2923 | __u64 next; | |
2924 | ||
2925 | spin_lock(&ptlrpc_last_xid_lock); | |
2926 | next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; | |
2927 | spin_unlock(&ptlrpc_last_xid_lock); | |
2928 | ||
2929 | return next; | |
2930 | #else | |
2931 | /* No need to lock, since returned value is racy anyways */ | |
2932 | return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; | |
2933 | #endif | |
2934 | } | |
2935 | EXPORT_SYMBOL(ptlrpc_sample_next_xid); | |
2936 | ||
/**
 * Functions for operating ptlrpc workers.
 *
 * A ptlrpc work is a function that runs inside ptlrpcd context.
 * The callback must not sleep, otherwise it blocks that ptlrpcd thread.
 *
 * 1. Once created, a work can be queued any number of times:
 *      handler = ptlrpcd_alloc_work();
 *      ptlrpcd_queue_work();
 *
 *    queue it again when necessary:
 *      ptlrpcd_queue_work();
 *      ptlrpcd_destroy_work();
 * 2. ptlrpcd_queue_work() may be called by several processes at the same
 *    time, but the work is only queued once at any given time. As the name
 *    implies, there may be a delay before the ptlrpcd thread actually runs
 *    it.
 */
struct ptlrpc_work_async_args {
	/* callback invoked by work_interpreter() */
	int (*cb)(const struct lu_env *, void *);
	/* opaque argument passed as the second parameter of cb */
	void *cbdata;
};
2958 | ||
82a373ae LZ |
2959 | static void ptlrpcd_add_work_req(struct ptlrpc_request *req) |
2960 | { | |
2961 | /* re-initialize the req */ | |
2962 | req->rq_timeout = obd_timeout; | |
219e6de6 | 2963 | req->rq_sent = ktime_get_real_seconds(); |
82a373ae LZ |
2964 | req->rq_deadline = req->rq_sent + req->rq_timeout; |
2965 | req->rq_reply_deadline = req->rq_deadline; | |
2966 | req->rq_phase = RQ_PHASE_INTERPRET; | |
2967 | req->rq_next_phase = RQ_PHASE_COMPLETE; | |
2968 | req->rq_xid = ptlrpc_next_xid(); | |
2969 | req->rq_import_generation = req->rq_import->imp_generation; | |
2970 | ||
c5c4c6fa | 2971 | ptlrpcd_add_req(req); |
82a373ae | 2972 | } |
d7e09d03 PT |
2973 | |
2974 | static int work_interpreter(const struct lu_env *env, | |
2975 | struct ptlrpc_request *req, void *data, int rc) | |
2976 | { | |
2977 | struct ptlrpc_work_async_args *arg = data; | |
2978 | ||
82a373ae | 2979 | LASSERT(ptlrpcd_check_work(req)); |
d7e09d03 | 2980 | |
82a373ae LZ |
2981 | rc = arg->cb(env, arg->cbdata); |
2982 | ||
2983 | list_del_init(&req->rq_set_chain); | |
2984 | req->rq_set = NULL; | |
2985 | ||
2986 | if (atomic_dec_return(&req->rq_refcount) > 1) { | |
2987 | atomic_set(&req->rq_refcount, 2); | |
2988 | ptlrpcd_add_work_req(req); | |
2989 | } | |
2990 | return rc; | |
2991 | } | |
2992 | ||
2993 | static int worker_format; | |
2994 | ||
2995 | static int ptlrpcd_check_work(struct ptlrpc_request *req) | |
2996 | { | |
2997 | return req->rq_pill.rc_fmt == (void *)&worker_format; | |
d7e09d03 PT |
2998 | } |
2999 | ||
3000 | /** | |
3001 | * Create a work for ptlrpc. | |
3002 | */ | |
3003 | void *ptlrpcd_alloc_work(struct obd_import *imp, | |
3004 | int (*cb)(const struct lu_env *, void *), void *cbdata) | |
3005 | { | |
3006 | struct ptlrpc_request *req = NULL; | |
3007 | struct ptlrpc_work_async_args *args; | |
d7e09d03 PT |
3008 | |
3009 | might_sleep(); | |
3010 | ||
8b382089 | 3011 | if (!cb) |
0a3bdb00 | 3012 | return ERR_PTR(-EINVAL); |
d7e09d03 PT |
3013 | |
3014 | /* copy some code from deprecated fakereq. */ | |
0be19afa | 3015 | req = ptlrpc_request_cache_alloc(GFP_NOFS); |
8b382089 | 3016 | if (!req) { |
d7e09d03 | 3017 | CERROR("ptlrpc: run out of memory!\n"); |
0a3bdb00 | 3018 | return ERR_PTR(-ENOMEM); |
d7e09d03 PT |
3019 | } |
3020 | ||
3021 | req->rq_send_state = LUSTRE_IMP_FULL; | |
3022 | req->rq_type = PTL_RPC_MSG_REQUEST; | |
3023 | req->rq_import = class_import_get(imp); | |
3024 | req->rq_export = NULL; | |
3025 | req->rq_interpret_reply = work_interpreter; | |
3026 | /* don't want reply */ | |
3027 | req->rq_receiving_reply = 0; | |
cf378ff7 | 3028 | req->rq_req_unlink = req->rq_reply_unlink = 0; |
d7e09d03 | 3029 | req->rq_no_delay = req->rq_no_resend = 1; |
82a373ae | 3030 | req->rq_pill.rc_fmt = (void *)&worker_format; |
d7e09d03 PT |
3031 | |
3032 | spin_lock_init(&req->rq_lock); | |
3033 | INIT_LIST_HEAD(&req->rq_list); | |
3034 | INIT_LIST_HEAD(&req->rq_replay_list); | |
3035 | INIT_LIST_HEAD(&req->rq_set_chain); | |
3036 | INIT_LIST_HEAD(&req->rq_history_list); | |
3037 | INIT_LIST_HEAD(&req->rq_exp_list); | |
3038 | init_waitqueue_head(&req->rq_reply_waitq); | |
3039 | init_waitqueue_head(&req->rq_set_waitq); | |
3040 | atomic_set(&req->rq_refcount, 1); | |
3041 | ||
3949015e | 3042 | CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args)); |
d7e09d03 | 3043 | args = ptlrpc_req_async_args(req); |
d0bfef31 | 3044 | args->cb = cb; |
d7e09d03 PT |
3045 | args->cbdata = cbdata; |
3046 | ||
0a3bdb00 | 3047 | return req; |
d7e09d03 PT |
3048 | } |
3049 | EXPORT_SYMBOL(ptlrpcd_alloc_work); | |
3050 | ||
/**
 * Release a work handle obtained from ptlrpcd_alloc_work().
 * A NULL \a handler is ignored.
 */
void ptlrpcd_destroy_work(void *handler)
{
	struct ptlrpc_request *req = handler;

	if (!req)
		return;

	ptlrpc_req_finished(req);
}
EXPORT_SYMBOL(ptlrpcd_destroy_work);
3059 | ||
3060 | int ptlrpcd_queue_work(void *handler) | |
3061 | { | |
3062 | struct ptlrpc_request *req = handler; | |
3063 | ||
3064 | /* | |
3065 | * Check if the req is already being queued. | |
3066 | * | |
3067 | * Here comes a trick: it lacks a way of checking if a req is being | |
3068 | * processed reliably in ptlrpc. Here I have to use refcount of req | |
3069 | * for this purpose. This is okay because the caller should use this | |
3070 | * req as opaque data. - Jinshan | |
3071 | */ | |
3072 | LASSERT(atomic_read(&req->rq_refcount) > 0); | |
82a373ae LZ |
3073 | if (atomic_inc_return(&req->rq_refcount) == 2) |
3074 | ptlrpcd_add_work_req(req); | |
d7e09d03 PT |
3075 | return 0; |
3076 | } | |
3077 | EXPORT_SYMBOL(ptlrpcd_queue_work); |