]>
Commit | Line | Data |
---|---|---|
c14dd9d5 | 1 | // SPDX-License-Identifier: GPL-2.0 |
d7e09d03 PT |
2 | /* |
3 | * GPL HEADER START | |
4 | * | |
5 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License version 2 only, | |
9 | * as published by the Free Software Foundation. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, but | |
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * General Public License version 2 for more details (a copy is included | |
15 | * in the LICENSE file that accompanied this code). | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * version 2 along with this program; If not, see | |
6a5b99a4 | 19 | * http://www.gnu.org/licenses/gpl-2.0.html |
d7e09d03 | 20 | * |
d7e09d03 PT |
21 | * GPL HEADER END |
22 | */ | |
23 | /* | |
24 | * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. | |
25 | * Use is subject to license terms. | |
26 | * | |
1dc563a6 | 27 | * Copyright (c) 2010, 2015, Intel Corporation. |
d7e09d03 PT |
28 | */ |
29 | /* | |
30 | * This file is part of Lustre, http://www.lustre.org/ | |
31 | * Lustre is a trademark of Sun Microsystems, Inc. | |
32 | */ | |
33 | ||
34 | #define DEBUG_SUBSYSTEM S_RPC | |
ea6882bf JS |
35 | |
36 | #include <obd_support.h> | |
37 | #include <obd_class.h> | |
38 | #include <lustre_net.h> | |
39 | #include <lu_object.h> | |
40 | #include <uapi/linux/lnet/lnet-types.h> | |
d7e09d03 PT |
41 | #include "ptlrpc_internal.h" |
42 | ||
/* The following are visible and mutable through /sys/module/ptlrpc */
int test_req_buffer_pressure;
module_param(test_req_buffer_pressure, int, 0444);
MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools");
/* NOTE(review): the at_* variables below are declared elsewhere (shared
 * adaptive-timeout tunables); only their module-parameter hookup lives here.
 */
module_param(at_min, int, 0644);
MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)");
module_param(at_max, int, 0644);
MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)");
module_param(at_history, int, 0644);
MODULE_PARM_DESC(at_history,
		 "Adaptive timeouts remember the slowest event that took place within this period (sec)");
module_param(at_early_margin, int, 0644);
MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply");
module_param(at_extra, int, 0644);
MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply");
d7e09d03 | 58 | |
d7e09d03 PT |
/* forward refs — defined later in this file */
static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);

/** Holds a list of all PTLRPC services */
LIST_HEAD(ptlrpc_all_services);
/** Used to protect the \e ptlrpc_all_services list */
struct mutex ptlrpc_all_services_mutex;
68 | ||
a96389d9 | 69 | static struct ptlrpc_request_buffer_desc * |
d7e09d03 PT |
70 | ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) |
71 | { | |
d0bfef31 | 72 | struct ptlrpc_service *svc = svcpt->scp_service; |
d7e09d03 PT |
73 | struct ptlrpc_request_buffer_desc *rqbd; |
74 | ||
bae97e81 JL |
75 | rqbd = kzalloc_node(sizeof(*rqbd), GFP_NOFS, |
76 | cfs_cpt_spread_node(svc->srv_cptable, | |
77 | svcpt->scp_cpt)); | |
8b382089 | 78 | if (!rqbd) |
d7e09d03 PT |
79 | return NULL; |
80 | ||
81 | rqbd->rqbd_svcpt = svcpt; | |
82 | rqbd->rqbd_refcount = 0; | |
83 | rqbd->rqbd_cbid.cbid_fn = request_in_callback; | |
84 | rqbd->rqbd_cbid.cbid_arg = rqbd; | |
85 | INIT_LIST_HEAD(&rqbd->rqbd_reqs); | |
6fd57333 OD |
86 | rqbd->rqbd_buffer = libcfs_kvzalloc_cpt(svc->srv_cptable, |
87 | svcpt->scp_cpt, | |
88 | svc->srv_buf_size, | |
89 | GFP_KERNEL); | |
8b382089 | 90 | if (!rqbd->rqbd_buffer) { |
9ae10597 | 91 | kfree(rqbd); |
d7e09d03 PT |
92 | return NULL; |
93 | } | |
94 | ||
95 | spin_lock(&svcpt->scp_lock); | |
96 | list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); | |
97 | svcpt->scp_nrqbds_total++; | |
98 | spin_unlock(&svcpt->scp_lock); | |
99 | ||
100 | return rqbd; | |
101 | } | |
102 | ||
a96389d9 | 103 | static void |
d7e09d03 PT |
104 | ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) |
105 | { | |
106 | struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; | |
107 | ||
108 | LASSERT(rqbd->rqbd_refcount == 0); | |
109 | LASSERT(list_empty(&rqbd->rqbd_reqs)); | |
110 | ||
111 | spin_lock(&svcpt->scp_lock); | |
112 | list_del(&rqbd->rqbd_list); | |
113 | svcpt->scp_nrqbds_total--; | |
114 | spin_unlock(&svcpt->scp_lock); | |
115 | ||
ee0ec194 | 116 | kvfree(rqbd->rqbd_buffer); |
9ae10597 | 117 | kfree(rqbd); |
d7e09d03 PT |
118 | } |
119 | ||
/**
 * Grow the pool of request buffers for \a svcpt up to
 * srv_nbuf_per_group, then (if \a post is set) post any idle buffers.
 *
 * Only one thread is allowed to allocate at a time (scp_rqbd_allocating
 * acts as a flag, checked once lock-free and once under scp_lock);
 * racing callers skip straight to posting.
 *
 * \retval 0 or the result of ptlrpc_server_post_idle_rqbds() on success
 * \retval -ENOMEM if a buffer allocation failed
 */
static int
ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
{
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct ptlrpc_request_buffer_desc *rqbd;
	int rc = 0;
	int i;

	/* cheap lock-free check: someone else is already allocating */
	if (svcpt->scp_rqbd_allocating)
		goto try_post;

	spin_lock(&svcpt->scp_lock);
	/* check again with lock */
	if (svcpt->scp_rqbd_allocating) {
		/* NB: we might allow more than one thread in the future */
		LASSERT(svcpt->scp_rqbd_allocating == 1);
		spin_unlock(&svcpt->scp_lock);
		goto try_post;
	}

	/* claim the allocator role */
	svcpt->scp_rqbd_allocating++;
	spin_unlock(&svcpt->scp_lock);

	for (i = 0; i < svc->srv_nbuf_per_group; i++) {
		/* NB: another thread might have recycled enough rqbds, we
		 * need to make sure it wouldn't over-allocate, see LU-1212.
		 */
		if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group)
			break;

		rqbd = ptlrpc_alloc_rqbd(svcpt);

		if (!rqbd) {
			CERROR("%s: Can't allocate request buffer\n",
			       svc->srv_name);
			rc = -ENOMEM;
			break;
		}
	}

	spin_lock(&svcpt->scp_lock);

	/* release the allocator role */
	LASSERT(svcpt->scp_rqbd_allocating == 1);
	svcpt->scp_rqbd_allocating--;

	spin_unlock(&svcpt->scp_lock);

	CDEBUG(D_RPCTRACE,
	       "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
	       svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
	       svcpt->scp_nrqbds_total, rc);

try_post:
	if (post && rc == 0)
		rc = ptlrpc_server_post_idle_rqbds(svcpt);

	return rc;
}
178 | ||
d7e09d03 PT |
struct ptlrpc_hr_partition;

/* One reply-handling ("HR") thread */
struct ptlrpc_hr_thread {
	int hrt_id; /* thread ID */
	spinlock_t hrt_lock; /* protects hrt_queue (see dispatch below) */
	wait_queue_head_t hrt_waitq; /* thread sleeps here waiting for work */
	struct list_head hrt_queue; /* RS queue */
	struct ptlrpc_hr_partition *hrt_partition; /* owning partition */
};

/* Per-CPT group of reply-handling threads */
struct ptlrpc_hr_partition {
	/* # of started threads */
	atomic_t hrp_nstarted;
	/* # of stopped threads */
	atomic_t hrp_nstopped;
	/* cpu partition id */
	int hrp_cpt;
	/* round-robin rotor for choosing thread */
	int hrp_rotor;
	/* total number of threads on this partition */
	int hrp_nthrs;
	/* threads table */
	struct ptlrpc_hr_thread *hrp_thrs;
};

#define HRT_RUNNING 0
#define HRT_STOPPING 1

struct ptlrpc_hr_service {
	/* CPU partition table, it's just cfs_cpt_table for now */
	struct cfs_cpt_table *hr_cpt_table;
	/** controller sleep waitq */
	wait_queue_head_t hr_waitq;
	/* nonzero while shutting down — set elsewhere; TODO confirm */
	unsigned int hr_stopping;
	/** roundrobin rotor for non-affinity service */
	unsigned int hr_rotor;
	/* partition data */
	struct ptlrpc_hr_partition **hr_partitions;
};

/** reply handling service. */
static struct ptlrpc_hr_service ptlrpc_hr;
221 | ||
d7e09d03 PT |
222 | /** |
223 | * Choose an hr thread to dispatch requests to. | |
224 | */ | |
225 | static struct ptlrpc_hr_thread * | |
226 | ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) | |
227 | { | |
d0bfef31 CH |
228 | struct ptlrpc_hr_partition *hrp; |
229 | unsigned int rotor; | |
d7e09d03 PT |
230 | |
231 | if (svcpt->scp_cpt >= 0 && | |
232 | svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { | |
233 | /* directly match partition */ | |
234 | hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; | |
235 | ||
236 | } else { | |
237 | rotor = ptlrpc_hr.hr_rotor++; | |
238 | rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); | |
239 | ||
240 | hrp = ptlrpc_hr.hr_partitions[rotor]; | |
241 | } | |
242 | ||
243 | rotor = hrp->hrp_rotor++; | |
244 | return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; | |
245 | } | |
246 | ||
d7e09d03 PT |
247 | /** |
248 | * Put reply state into a queue for processing because we received | |
249 | * ACK from the client | |
250 | */ | |
251 | void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) | |
252 | { | |
253 | struct ptlrpc_hr_thread *hrt; | |
d7e09d03 PT |
254 | |
255 | LASSERT(list_empty(&rs->rs_list)); | |
256 | ||
257 | hrt = ptlrpc_hr_select(rs->rs_svcpt); | |
258 | ||
259 | spin_lock(&hrt->hrt_lock); | |
260 | list_add_tail(&rs->rs_list, &hrt->hrt_queue); | |
261 | spin_unlock(&hrt->hrt_lock); | |
262 | ||
263 | wake_up(&hrt->hrt_waitq); | |
d7e09d03 PT |
264 | } |
265 | ||
266 | void | |
267 | ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) | |
268 | { | |
5e42bc9d LX |
269 | assert_spin_locked(&rs->rs_svcpt->scp_rep_lock); |
270 | assert_spin_locked(&rs->rs_lock); | |
3949015e | 271 | LASSERT(rs->rs_difficult); |
d7e09d03 PT |
272 | rs->rs_scheduled_ever = 1; /* flag any notification attempt */ |
273 | ||
274 | if (rs->rs_scheduled) { /* being set up or already notified */ | |
d7e09d03 PT |
275 | return; |
276 | } | |
277 | ||
278 | rs->rs_scheduled = 1; | |
279 | list_del_init(&rs->rs_list); | |
280 | ptlrpc_dispatch_difficult_reply(rs); | |
d7e09d03 PT |
281 | } |
282 | EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); | |
283 | ||
d7e09d03 PT |
/**
 * Post every idle request buffer on \a svcpt to the network.
 *
 * Each buffer is speculatively moved to the posted list (with
 * scp_nrqbds_posted bumped) before registration, and rolled back to the
 * idle list if ptlrpc_register_rqbd() fails.
 *
 * \retval 1  at least one buffer was posted
 * \retval 0  no idle buffers were available
 * \retval -1 a posting attempt failed
 */
static int
ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
{
	struct ptlrpc_request_buffer_desc *rqbd;
	int rc;
	int posted = 0;

	for (;;) {
		spin_lock(&svcpt->scp_lock);

		if (list_empty(&svcpt->scp_rqbd_idle)) {
			spin_unlock(&svcpt->scp_lock);
			return posted;
		}

		rqbd = list_entry(svcpt->scp_rqbd_idle.next,
				  struct ptlrpc_request_buffer_desc,
				  rqbd_list);
		list_del(&rqbd->rqbd_list);

		/* assume we will post successfully */
		svcpt->scp_nrqbds_posted++;
		list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);

		spin_unlock(&svcpt->scp_lock);

		/* registration happens outside the lock */
		rc = ptlrpc_register_rqbd(rqbd);
		if (rc != 0)
			break;

		posted = 1;
	}

	/* posting failed: undo the speculative accounting for this rqbd */
	spin_lock(&svcpt->scp_lock);

	svcpt->scp_nrqbds_posted--;
	list_del(&rqbd->rqbd_list);
	list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);

	/* Don't complain if no request buffers are posted right now; LNET
	 * won't drop requests because we set the portal lazy!
	 */

	spin_unlock(&svcpt->scp_lock);

	return -1;
}
331 | ||
e99e88a9 | 332 | static void ptlrpc_at_timer(struct timer_list *t) |
d7e09d03 PT |
333 | { |
334 | struct ptlrpc_service_part *svcpt; | |
335 | ||
e99e88a9 | 336 | svcpt = from_timer(svcpt, t, scp_at_timer); |
d7e09d03 PT |
337 | |
338 | svcpt->scp_at_check = 1; | |
339 | svcpt->scp_at_checktime = cfs_time_current(); | |
340 | wake_up(&svcpt->scp_waitq); | |
341 | } | |
342 | ||
/**
 * Estimate and validate the per-CPT thread limits for service \a svc
 * from configuration \a conf, storing the results in
 * svc->srv_nthrs_cpt_limit and svc->srv_nthrs_cpt_init.
 */
static void
ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
			     struct ptlrpc_service_conf *conf)
{
	struct ptlrpc_service_thr_conf *tc = &conf->psc_thr;
	unsigned int init;
	unsigned int total;
	unsigned int nthrs;
	int weight;

	/*
	 * Common code for estimating & validating threads number.
	 * CPT affinity service could have percpt thread-pool instead
	 * of a global thread-pool, which means user might not always
	 * get the threads number they give it in conf::tc_nthrs_user
	 * even they did set. It's because we need to validate threads
	 * number for each CPT to guarantee each pool will have enough
	 * threads to keep the service healthy.
	 */
	init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
	init = max_t(int, init, tc->tc_nthrs_init);

	/* NB: please see comments in lustre_lnet.h for definition
	 * details of these members
	 */
	LASSERT(tc->tc_nthrs_max != 0);

	if (tc->tc_nthrs_user != 0) {
		/* In case there is a reason to test a service with many
		 * threads, we give a less strict check here, it can
		 * be up to 8 * nthrs_max
		 */
		total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
		nthrs = total / svc->srv_ncpts;
		init = max(init, nthrs);
		goto out;
	}

	total = tc->tc_nthrs_max;
	if (tc->tc_nthrs_base == 0) {
		/* don't care about base threads number per partition,
		 * this is most for non-affinity service
		 */
		nthrs = total / svc->srv_ncpts;
		goto out;
	}

	nthrs = tc->tc_nthrs_base;
	if (svc->srv_ncpts == 1) {
		int i;

		/* NB: Increase the base number if it's single partition
		 * and total number of cores/HTs is larger or equal to 4.
		 * result will always < 2 * nthrs_base
		 */
		weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
		for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
			    (tc->tc_nthrs_base >> i) != 0; i++)
			nthrs += tc->tc_nthrs_base >> i;
	}

	if (tc->tc_thr_factor != 0) {
		int factor = tc->tc_thr_factor;
		const int fade = 4;

		/*
		 * User wants to increase number of threads with for
		 * each CPU core/HT, most likely the factor is larger then
		 * one thread/core because service threads are supposed to
		 * be blocked by lock or wait for IO.
		 */
		/*
		 * Amdahl's law says that adding processors wouldn't give
		 * a linear increasing of parallelism, so it's nonsense to
		 * have too many threads no matter how many cores/HTs
		 * there are.
		 */
		/* weight is # of HTs */
		if (cpumask_weight(topology_sibling_cpumask(0)) > 1) {
			/* depress thread factor for hyper-thread */
			factor = factor - (factor >> 1) + (factor >> 3);
		}

		weight = cfs_cpt_weight(svc->srv_cptable, 0);
		LASSERT(weight > 0);

		/* each extra factor contributes fewer threads as weight fades */
		for (; factor > 0 && weight > 0; factor--, weight -= fade)
			nthrs += min(weight, fade) * factor;
	}

	/* clamp so the total across CPTs stays near the configured max */
	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
		nthrs = max(tc->tc_nthrs_base,
			    tc->tc_nthrs_max / svc->srv_ncpts);
	}
out:
	nthrs = max(nthrs, tc->tc_nthrs_init);
	svc->srv_nthrs_cpt_limit = nthrs;
	svc->srv_nthrs_cpt_init = init;

	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
		CDEBUG(D_OTHER, "%s: This service may have more threads (%d) than the given soft limit (%d)\n",
		       svc->srv_name, nthrs * svc->srv_ncpts,
		       tc->tc_nthrs_max);
	}
}
448 | ||
/**
 * Initialize percpt data for a service.
 *
 * Sets up the partition's locks, lists and waitqueues, allocates the
 * adaptive-timeout (AT) arrays sized by at_est2timeout(at_max), arms the
 * AT timer, and pre-allocates (but does not post) the request buffers.
 *
 * \param svc	owning service
 * \param svcpt	partition to initialize
 * \param cpt	CPU partition id used for NUMA-aware allocations
 *
 * \retval 0 on success, -ENOMEM on any allocation failure (partial
 * allocations are freed before returning)
 */
static int
ptlrpc_service_part_init(struct ptlrpc_service *svc,
			 struct ptlrpc_service_part *svcpt, int cpt)
{
	struct ptlrpc_at_array *array;
	int size;
	int index;
	int rc;

	svcpt->scp_cpt = cpt;
	INIT_LIST_HEAD(&svcpt->scp_threads);

	/* rqbd and incoming request queue */
	spin_lock_init(&svcpt->scp_lock);
	INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
	INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
	INIT_LIST_HEAD(&svcpt->scp_req_incoming);
	init_waitqueue_head(&svcpt->scp_waitq);
	/* history request & rqbd list */
	INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
	INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);

	/* active requests and hp requests */
	spin_lock_init(&svcpt->scp_req_lock);

	/* reply states */
	spin_lock_init(&svcpt->scp_rep_lock);
	INIT_LIST_HEAD(&svcpt->scp_rep_active);
	INIT_LIST_HEAD(&svcpt->scp_rep_idle);
	init_waitqueue_head(&svcpt->scp_rep_waitq);
	atomic_set(&svcpt->scp_nreps_difficult, 0);

	/* adaptive timeout */
	spin_lock_init(&svcpt->scp_at_lock);
	array = &svcpt->scp_at_array;

	size = at_est2timeout(at_max);
	array->paa_size = size;
	array->paa_count = 0;
	array->paa_deadline = -1;

	/* allocate memory for scp_at_array (ptlrpc_at_array) */
	array->paa_reqs_array =
		kzalloc_node(sizeof(struct list_head) * size, GFP_NOFS,
			     cfs_cpt_spread_node(svc->srv_cptable, cpt));
	if (!array->paa_reqs_array)
		return -ENOMEM;

	for (index = 0; index < size; index++)
		INIT_LIST_HEAD(&array->paa_reqs_array[index]);

	array->paa_reqs_count =
		kzalloc_node(sizeof(__u32) * size, GFP_NOFS,
			     cfs_cpt_spread_node(svc->srv_cptable, cpt));
	if (!array->paa_reqs_count)
		goto free_reqs_array;

	timer_setup(&svcpt->scp_at_timer, ptlrpc_at_timer, 0);

	/* At SOW, service time should be quick; 10s seems generous. If client
	 * timeout is less than this, we'll be sending an early reply.
	 */
	at_init(&svcpt->scp_at_estimate, 10, 0);

	/* assign this before call ptlrpc_grow_req_bufs */
	svcpt->scp_service = svc;
	/* Now allocate the request buffers, but don't post them now */
	rc = ptlrpc_grow_req_bufs(svcpt, 0);
	/* We shouldn't be under memory pressure at startup, so
	 * fail if we can't allocate all our buffers at this time.
	 */
	if (rc != 0)
		goto free_reqs_count;

	return 0;

free_reqs_count:
	kfree(array->paa_reqs_count);
	array->paa_reqs_count = NULL;
free_reqs_array:
	kfree(array->paa_reqs_array);
	array->paa_reqs_array = NULL;

	return -ENOMEM;
}
537 | ||
/**
 * Initialize service on a given portal.
 * This includes starting serving threads , allocating and posting rqbds and
 * so on.
 *
 * \param conf		service configuration (buffers, threads, CPT layout)
 * \param parent	sysfs kset to register under (may be NULL)
 * \param debugfs_entry	debugfs parent (skipped if NULL or error pointer)
 *
 * \retval the new service, or an ERR_PTR on failure.  On failure after
 * the service struct exists, ptlrpc_unregister_service() tears down
 * whatever was set up.
 */
struct ptlrpc_service *
ptlrpc_register_service(struct ptlrpc_service_conf *conf,
			struct kset *parent,
			struct dentry *debugfs_entry)
{
	struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt;
	struct ptlrpc_service *service;
	struct ptlrpc_service_part *svcpt;
	struct cfs_cpt_table *cptable;
	__u32 *cpts = NULL;
	int ncpts;
	int cpt;
	int rc;
	int i;

	LASSERT(conf->psc_buf.bc_nbufs > 0);
	LASSERT(conf->psc_buf.bc_buf_size >=
		conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD);
	LASSERT(conf->psc_thr.tc_ctx_tags != 0);

	cptable = cconf->cc_cptable;
	if (!cptable)
		cptable = cfs_cpt_table;

	if (!conf->psc_thr.tc_cpu_affinity) {
		/* non-affinity service runs as a single partition */
		ncpts = 1;
	} else {
		ncpts = cfs_cpt_number(cptable);
		if (cconf->cc_pattern) {
			/* restrict to the CPTs named by the pattern string */
			struct cfs_expr_list *el;

			rc = cfs_expr_list_parse(cconf->cc_pattern,
						 strlen(cconf->cc_pattern),
						 0, ncpts - 1, &el);
			if (rc != 0) {
				CERROR("%s: invalid CPT pattern string: %s",
				       conf->psc_name, cconf->cc_pattern);
				return ERR_PTR(-EINVAL);
			}

			rc = cfs_expr_list_values(el, ncpts, &cpts);
			cfs_expr_list_free(el);
			if (rc <= 0) {
				CERROR("%s: failed to parse CPT array %s: %d\n",
				       conf->psc_name, cconf->cc_pattern, rc);
				kfree(cpts);
				return ERR_PTR(rc < 0 ? rc : -EINVAL);
			}
			ncpts = rc;
		}
	}

	service = kzalloc(offsetof(struct ptlrpc_service, srv_parts[ncpts]),
			  GFP_NOFS);
	if (!service) {
		kfree(cpts);
		return ERR_PTR(-ENOMEM);
	}

	/* service now owns cpts; freed with the service from here on */
	service->srv_cptable = cptable;
	service->srv_cpts = cpts;
	service->srv_ncpts = ncpts;

	service->srv_cpt_bits = 0; /* it's zero already, easy to read... */
	while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable))
		service->srv_cpt_bits++;

	/* public members */
	spin_lock_init(&service->srv_lock);
	service->srv_name = conf->psc_name;
	service->srv_watchdog_factor = conf->psc_watchdog_factor;
	INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */

	/* buffer configuration */
	service->srv_nbuf_per_group = test_req_buffer_pressure ?
				      1 : conf->psc_buf.bc_nbufs;
	service->srv_max_req_size = conf->psc_buf.bc_req_max_size +
				    SPTLRPC_MAX_PAYLOAD;
	service->srv_buf_size = conf->psc_buf.bc_buf_size;
	service->srv_rep_portal = conf->psc_buf.bc_rep_portal;
	service->srv_req_portal = conf->psc_buf.bc_req_portal;

	/* Increase max reply size to next power of two */
	service->srv_max_reply_size = 1;
	while (service->srv_max_reply_size <
	       conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD)
		service->srv_max_reply_size <<= 1;

	service->srv_thread_name = conf->psc_thr.tc_thr_name;
	service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags;
	service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO;
	service->srv_ops = conf->psc_ops;

	for (i = 0; i < ncpts; i++) {
		if (!conf->psc_thr.tc_cpu_affinity)
			cpt = CFS_CPT_ANY;
		else
			cpt = cpts ? cpts[i] : i;

		svcpt = kzalloc_node(sizeof(*svcpt), GFP_NOFS,
				     cfs_cpt_spread_node(cptable, cpt));
		if (!svcpt) {
			rc = -ENOMEM;
			goto failed;
		}

		service->srv_parts[i] = svcpt;
		rc = ptlrpc_service_part_init(service, svcpt, cpt);
		if (rc != 0)
			goto failed;
	}

	ptlrpc_server_nthreads_check(service, conf);

	rc = LNetSetLazyPortal(service->srv_req_portal);
	LASSERT(rc == 0);

	mutex_lock(&ptlrpc_all_services_mutex);
	list_add(&service->srv_list, &ptlrpc_all_services);
	mutex_unlock(&ptlrpc_all_services_mutex);

	if (parent) {
		rc = ptlrpc_sysfs_register_service(parent, service);
		if (rc)
			goto failed;
	}

	if (!IS_ERR_OR_NULL(debugfs_entry))
		ptlrpc_ldebugfs_register_service(debugfs_entry, service);

	rc = ptlrpc_service_nrs_setup(service);
	if (rc != 0)
		goto failed;

	CDEBUG(D_NET, "%s: Started, listening on portal %d\n",
	       service->srv_name, service->srv_req_portal);

	rc = ptlrpc_start_threads(service);
	if (rc != 0) {
		CERROR("Failed to start threads for service %s: %d\n",
		       service->srv_name, rc);
		goto failed;
	}

	return service;
failed:
	/* unregister tears down everything set up so far */
	ptlrpc_unregister_service(service);
	return ERR_PTR(rc);
}
EXPORT_SYMBOL(ptlrpc_register_service);
693 | ||
694 | /** | |
695 | * to actually free the request, must be called without holding svc_lock. | |
696 | * note it's caller's responsibility to unlink req->rq_list. | |
697 | */ | |
698 | static void ptlrpc_server_free_request(struct ptlrpc_request *req) | |
699 | { | |
700 | LASSERT(atomic_read(&req->rq_refcount) == 0); | |
701 | LASSERT(list_empty(&req->rq_timed_list)); | |
702 | ||
703 | /* DEBUG_REQ() assumes the reply state of a request with a valid | |
dadfcdab OD |
704 | * ref will not be destroyed until that reference is dropped. |
705 | */ | |
d7e09d03 PT |
706 | ptlrpc_req_drop_rs(req); |
707 | ||
708 | sptlrpc_svc_ctx_decref(req); | |
709 | ||
710 | if (req != &req->rq_rqbd->rqbd_req) { | |
711 | /* NB request buffers use an embedded | |
712 | * req if the incoming req unlinked the | |
dadfcdab OD |
713 | * MD; this isn't one of them! |
714 | */ | |
35b2e1b7 | 715 | ptlrpc_request_cache_free(req); |
d7e09d03 PT |
716 | } |
717 | } | |
718 | ||
/**
 * drop a reference count of the request. if it reaches 0, we either
 * put it into history list, or free it immediately.
 *
 * When the owning rqbd's refcount also drops to zero, the whole buffer
 * moves to the history list, and excess history rqbds (beyond
 * srv_hist_nrqbds_cpt_max) are culled and recycled onto the idle list.
 */
static void ptlrpc_server_drop_request(struct ptlrpc_request *req)
{
	struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
	struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
	struct ptlrpc_service *svc = svcpt->scp_service;
	int refcount;
	struct list_head *tmp;
	struct list_head *nxt;

	if (!atomic_dec_and_test(&req->rq_refcount))
		return;

	if (req->rq_at_linked) {
		spin_lock(&svcpt->scp_at_lock);
		/* recheck with lock, in case it's unlinked by
		 * ptlrpc_at_check_timed()
		 */
		if (likely(req->rq_at_linked))
			ptlrpc_at_remove_timed(req);
		spin_unlock(&svcpt->scp_at_lock);
	}

	LASSERT(list_empty(&req->rq_timed_list));

	/* finalize request */
	if (req->rq_export) {
		class_export_put(req->rq_export);
		req->rq_export = NULL;
	}

	spin_lock(&svcpt->scp_lock);

	/* park the request on its buffer's request list */
	list_add(&req->rq_list, &rqbd->rqbd_reqs);

	refcount = --(rqbd->rqbd_refcount);
	if (refcount == 0) {
		/* request buffer is now idle: add to history */
		list_del(&rqbd->rqbd_list);

		list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
		svcpt->scp_hist_nrqbds++;

		/* cull some history?
		 * I expect only about 1 or 2 rqbds need to be recycled here
		 */
		while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) {
			/* NB: rqbd and req are reused below to walk the
			 * oldest history buffer being culled.
			 */
			rqbd = list_entry(svcpt->scp_hist_rqbds.next,
					  struct ptlrpc_request_buffer_desc,
					  rqbd_list);

			list_del(&rqbd->rqbd_list);
			svcpt->scp_hist_nrqbds--;

			/* remove rqbd's reqs from svc's req history while
			 * I've got the service lock
			 */
			list_for_each(tmp, &rqbd->rqbd_reqs) {
				req = list_entry(tmp, struct ptlrpc_request,
						 rq_list);
				/* Track the highest culled req seq */
				if (req->rq_history_seq >
				    svcpt->scp_hist_seq_culled) {
					svcpt->scp_hist_seq_culled =
						req->rq_history_seq;
				}
				list_del(&req->rq_history_list);
			}

			/* free the requests outside the lock */
			spin_unlock(&svcpt->scp_lock);

			list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) {
				req = list_entry(rqbd->rqbd_reqs.next,
						 struct ptlrpc_request,
						 rq_list);
				list_del(&req->rq_list);
				ptlrpc_server_free_request(req);
			}

			spin_lock(&svcpt->scp_lock);
			/*
			 * now all reqs including the embedded req has been
			 * disposed, schedule request buffer for re-use.
			 */
			LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) ==
				0);
			list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
		}

		spin_unlock(&svcpt->scp_lock);
	} else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
		/* If we are low on memory, we are not interested in history */
		list_del(&req->rq_list);
		list_del_init(&req->rq_history_list);

		/* Track the highest culled req seq */
		if (req->rq_history_seq > svcpt->scp_hist_seq_culled)
			svcpt->scp_hist_seq_culled = req->rq_history_seq;

		spin_unlock(&svcpt->scp_lock);

		ptlrpc_server_free_request(req);
	} else {
		spin_unlock(&svcpt->scp_lock);
	}
}
828 | ||
d7e09d03 PT |
829 | /** |
830 | * to finish a request: stop sending more early replies, and release | |
831 | * the request. | |
832 | */ | |
833 | static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, | |
834 | struct ptlrpc_request *req) | |
835 | { | |
836 | ptlrpc_server_hpreq_fini(req); | |
837 | ||
70187506 MP |
838 | if (req->rq_session.lc_thread) { |
839 | lu_context_exit(&req->rq_session); | |
840 | lu_context_fini(&req->rq_session); | |
841 | } | |
842 | ||
d7e09d03 PT |
843 | ptlrpc_server_drop_request(req); |
844 | } | |
845 | ||
/**
 * to finish a active request: stop sending more early replies, and release
 * the request. should be called after we finished handling the request.
 *
 * Retires the request from the partition's active (and high-priority)
 * accounting under scp_req_lock, finalizes its NRS state, drops the
 * export's RPC count, then hands off to ptlrpc_server_finish_request().
 */
static void ptlrpc_server_finish_active_request(
					struct ptlrpc_service_part *svcpt,
					struct ptlrpc_request *req)
{
	spin_lock(&svcpt->scp_req_lock);
	ptlrpc_nrs_req_stop_nolock(req);
	svcpt->scp_nreqs_active--;
	if (req->rq_hp)
		svcpt->scp_nhreqs_active--;
	spin_unlock(&svcpt->scp_req_lock);

	ptlrpc_nrs_req_finalize(req);

	if (req->rq_export)
		class_export_rpc_dec(req->rq_export);

	ptlrpc_server_finish_request(svcpt, req);
}
868 | ||
d7e09d03 PT |
869 | /** |
870 | * Sanity check request \a req. | |
871 | * Return 0 if all is ok, error code otherwise. | |
872 | */ | |
873 | static int ptlrpc_check_req(struct ptlrpc_request *req) | |
874 | { | |
f60d7c39 | 875 | struct obd_device *obd = req->rq_export->exp_obd; |
d7e09d03 PT |
876 | int rc = 0; |
877 | ||
878 | if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < | |
879 | req->rq_export->exp_conn_cnt)) { | |
880 | DEBUG_REQ(D_RPCTRACE, req, | |
881 | "DROPPING req from old connection %d < %d", | |
882 | lustre_msg_get_conn_cnt(req->rq_reqmsg), | |
883 | req->rq_export->exp_conn_cnt); | |
884 | return -EEXIST; | |
885 | } | |
8b382089 | 886 | if (unlikely(!obd || obd->obd_fail)) { |
532118c0 KM |
887 | /* |
888 | * Failing over, don't handle any more reqs, send | |
889 | * error response instead. | |
890 | */ | |
d7e09d03 | 891 | CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", |
8b382089 | 892 | req, obd ? obd->obd_name : "unknown"); |
d7e09d03 PT |
893 | rc = -ENODEV; |
894 | } else if (lustre_msg_get_flags(req->rq_reqmsg) & | |
af3ec53b OD |
895 | (MSG_REPLAY | MSG_REQ_REPLAY_DONE)) { |
896 | DEBUG_REQ(D_ERROR, req, "Invalid replay without recovery"); | |
897 | class_fail_export(req->rq_export); | |
898 | rc = -ENODEV; | |
899 | } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0) { | |
900 | DEBUG_REQ(D_ERROR, req, | |
901 | "Invalid req with transno %llu without recovery", | |
902 | lustre_msg_get_transno(req->rq_reqmsg)); | |
903 | class_fail_export(req->rq_export); | |
904 | rc = -ENODEV; | |
d7e09d03 PT |
905 | } |
906 | ||
907 | if (unlikely(rc < 0)) { | |
908 | req->rq_status = rc; | |
909 | ptlrpc_error(req); | |
910 | } | |
911 | return rc; | |
912 | } | |
913 | ||
914 | static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) | |
915 | { | |
916 | struct ptlrpc_at_array *array = &svcpt->scp_at_array; | |
917 | __s32 next; | |
918 | ||
919 | if (array->paa_count == 0) { | |
922da0c5 | 920 | del_timer(&svcpt->scp_at_timer); |
d7e09d03 PT |
921 | return; |
922 | } | |
923 | ||
924 | /* Set timer for closest deadline */ | |
219e6de6 | 925 | next = (__s32)(array->paa_deadline - ktime_get_real_seconds() - |
d7e09d03 PT |
926 | at_early_margin); |
927 | if (next <= 0) { | |
e99e88a9 | 928 | ptlrpc_at_timer(&svcpt->scp_at_timer); |
d7e09d03 | 929 | } else { |
922da0c5 | 930 | mod_timer(&svcpt->scp_at_timer, cfs_time_shift(next)); |
d7e09d03 PT |
931 | CDEBUG(D_INFO, "armed %s at %+ds\n", |
932 | svcpt->scp_service->srv_name, next); | |
933 | } | |
934 | } | |
935 | ||
936 | /* Add rpc to early reply check list */ | |
937 | static int ptlrpc_at_add_timed(struct ptlrpc_request *req) | |
938 | { | |
939 | struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; | |
940 | struct ptlrpc_at_array *array = &svcpt->scp_at_array; | |
941 | struct ptlrpc_request *rq = NULL; | |
942 | __u32 index; | |
943 | ||
944 | if (AT_OFF) | |
fbe7c6c7 | 945 | return 0; |
d7e09d03 PT |
946 | |
947 | if (req->rq_no_reply) | |
948 | return 0; | |
949 | ||
950 | if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) | |
fbe7c6c7 | 951 | return -ENOSYS; |
d7e09d03 PT |
952 | |
953 | spin_lock(&svcpt->scp_at_lock); | |
954 | LASSERT(list_empty(&req->rq_timed_list)); | |
955 | ||
219e6de6 | 956 | div_u64_rem(req->rq_deadline, array->paa_size, &index); |
d7e09d03 PT |
957 | if (array->paa_reqs_count[index] > 0) { |
958 | /* latest rpcs will have the latest deadlines in the list, | |
dadfcdab OD |
959 | * so search backward. |
960 | */ | |
30c0aa39 OD |
961 | list_for_each_entry_reverse(rq, &array->paa_reqs_array[index], |
962 | rq_timed_list) { | |
d7e09d03 PT |
963 | if (req->rq_deadline >= rq->rq_deadline) { |
964 | list_add(&req->rq_timed_list, | |
30c0aa39 | 965 | &rq->rq_timed_list); |
d7e09d03 PT |
966 | break; |
967 | } | |
968 | } | |
969 | } | |
970 | ||
971 | /* Add the request at the head of the list */ | |
972 | if (list_empty(&req->rq_timed_list)) | |
30c0aa39 | 973 | list_add(&req->rq_timed_list, &array->paa_reqs_array[index]); |
d7e09d03 PT |
974 | |
975 | spin_lock(&req->rq_lock); | |
976 | req->rq_at_linked = 1; | |
977 | spin_unlock(&req->rq_lock); | |
978 | req->rq_at_index = index; | |
979 | array->paa_reqs_count[index]++; | |
980 | array->paa_count++; | |
981 | if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { | |
982 | array->paa_deadline = req->rq_deadline; | |
983 | ptlrpc_at_set_timer(svcpt); | |
984 | } | |
985 | spin_unlock(&svcpt->scp_at_lock); | |
986 | ||
987 | return 0; | |
988 | } | |
989 | ||
990 | static void | |
991 | ptlrpc_at_remove_timed(struct ptlrpc_request *req) | |
992 | { | |
993 | struct ptlrpc_at_array *array; | |
994 | ||
995 | array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; | |
996 | ||
997 | /* NB: must call with hold svcpt::scp_at_lock */ | |
998 | LASSERT(!list_empty(&req->rq_timed_list)); | |
999 | list_del_init(&req->rq_timed_list); | |
1000 | ||
1001 | spin_lock(&req->rq_lock); | |
1002 | req->rq_at_linked = 0; | |
1003 | spin_unlock(&req->rq_lock); | |
1004 | ||
1005 | array->paa_reqs_count[req->rq_at_index]--; | |
1006 | array->paa_count--; | |
1007 | } | |
1008 | ||
31c5e95e CH |
1009 | /* |
1010 | * Attempt to extend the request deadline by sending an early reply to the | |
1011 | * client. | |
1012 | */ | |
d7e09d03 PT |
1013 | static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) |
1014 | { | |
1015 | struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; | |
1016 | struct ptlrpc_request *reqcopy; | |
1017 | struct lustre_msg *reqmsg; | |
219e6de6 | 1018 | long olddl = req->rq_deadline - ktime_get_real_seconds(); |
be247981 | 1019 | time64_t newdl; |
d7e09d03 | 1020 | int rc; |
d7e09d03 PT |
1021 | |
1022 | /* deadline is when the client expects us to reply, margin is the | |
dadfcdab OD |
1023 | * difference between clients' and servers' expectations |
1024 | */ | |
d7e09d03 | 1025 | DEBUG_REQ(D_ADAPTTO, req, |
2d00bd17 JP |
1026 | "%ssending early reply (deadline %+lds, margin %+lds) for %d+%d", |
1027 | AT_OFF ? "AT off - not " : "", | |
d7e09d03 PT |
1028 | olddl, olddl - at_get(&svcpt->scp_at_estimate), |
1029 | at_get(&svcpt->scp_at_estimate), at_extra); | |
1030 | ||
1031 | if (AT_OFF) | |
0a3bdb00 | 1032 | return 0; |
d7e09d03 PT |
1033 | |
1034 | if (olddl < 0) { | |
2d00bd17 JP |
1035 | DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), not sending early reply. Consider increasing at_early_margin (%d)?", |
1036 | olddl, at_early_margin); | |
d7e09d03 PT |
1037 | |
1038 | /* Return an error so we're not re-added to the timed list. */ | |
0a3bdb00 | 1039 | return -ETIMEDOUT; |
d7e09d03 PT |
1040 | } |
1041 | ||
cb68dd2d | 1042 | if (!(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { |
2d00bd17 | 1043 | DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, but no AT support"); |
0a3bdb00 | 1044 | return -ENOSYS; |
d7e09d03 PT |
1045 | } |
1046 | ||
31c5e95e CH |
1047 | /* |
1048 | * We want to extend the request deadline by at_extra seconds, | |
1049 | * so we set our service estimate to reflect how much time has | |
1050 | * passed since this request arrived plus an additional | |
1051 | * at_extra seconds. The client will calculate the new deadline | |
1052 | * based on this service estimate (plus some additional time to | |
1053 | * account for network latency). See ptlrpc_at_recv_early_reply | |
dadfcdab | 1054 | */ |
af3ec53b OD |
1055 | at_measured(&svcpt->scp_at_estimate, at_extra + |
1056 | ktime_get_real_seconds() - req->rq_arrival_time.tv_sec); | |
be247981 | 1057 | newdl = req->rq_arrival_time.tv_sec + at_get(&svcpt->scp_at_estimate); |
af3ec53b OD |
1058 | |
1059 | /* Check to see if we've actually increased the deadline - | |
dadfcdab OD |
1060 | * we may be past adaptive_max |
1061 | */ | |
be247981 | 1062 | if (req->rq_deadline >= newdl) { |
af3ec53b | 1063 | DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%ld/%lld), not sending early reply\n", |
be247981 | 1064 | olddl, newdl - ktime_get_real_seconds()); |
af3ec53b | 1065 | return -ETIMEDOUT; |
d7e09d03 | 1066 | } |
d7e09d03 | 1067 | |
0be19afa | 1068 | reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); |
8b382089 | 1069 | if (!reqcopy) |
0a3bdb00 | 1070 | return -ENOMEM; |
ee0ec194 | 1071 | reqmsg = libcfs_kvzalloc(req->rq_reqlen, GFP_NOFS); |
a9b3e8f3 JL |
1072 | if (!reqmsg) { |
1073 | rc = -ENOMEM; | |
1074 | goto out_free; | |
1075 | } | |
d7e09d03 PT |
1076 | |
1077 | *reqcopy = *req; | |
1078 | reqcopy->rq_reply_state = NULL; | |
1079 | reqcopy->rq_rep_swab_mask = 0; | |
1080 | reqcopy->rq_pack_bulk = 0; | |
1081 | reqcopy->rq_pack_udesc = 0; | |
1082 | reqcopy->rq_packed_final = 0; | |
1083 | sptlrpc_svc_ctx_addref(reqcopy); | |
1084 | /* We only need the reqmsg for the magic */ | |
1085 | reqcopy->rq_reqmsg = reqmsg; | |
1086 | memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); | |
1087 | ||
1088 | LASSERT(atomic_read(&req->rq_refcount)); | |
1089 | /** if it is last refcount then early reply isn't needed */ | |
1090 | if (atomic_read(&req->rq_refcount) == 1) { | |
2d00bd17 | 1091 | DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, abort sending early reply\n"); |
a9b3e8f3 JL |
1092 | rc = -EINVAL; |
1093 | goto out; | |
d7e09d03 PT |
1094 | } |
1095 | ||
1096 | /* Connection ref */ | |
1097 | reqcopy->rq_export = class_conn2export( | |
1098 | lustre_msg_get_handle(reqcopy->rq_reqmsg)); | |
8b382089 | 1099 | if (!reqcopy->rq_export) { |
a9b3e8f3 JL |
1100 | rc = -ENODEV; |
1101 | goto out; | |
1102 | } | |
d7e09d03 PT |
1103 | |
1104 | /* RPC ref */ | |
1105 | class_export_rpc_inc(reqcopy->rq_export); | |
1106 | if (reqcopy->rq_export->exp_obd && | |
a9b3e8f3 JL |
1107 | reqcopy->rq_export->exp_obd->obd_fail) { |
1108 | rc = -ENODEV; | |
1109 | goto out_put; | |
1110 | } | |
d7e09d03 PT |
1111 | |
1112 | rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); | |
1113 | if (rc) | |
a9b3e8f3 | 1114 | goto out_put; |
d7e09d03 PT |
1115 | |
1116 | rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); | |
1117 | ||
1118 | if (!rc) { | |
1119 | /* Adjust our own deadline to what we told the client */ | |
be247981 | 1120 | req->rq_deadline = newdl; |
d7e09d03 PT |
1121 | req->rq_early_count++; /* number sent, server side */ |
1122 | } else { | |
1123 | DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); | |
1124 | } | |
1125 | ||
1126 | /* Free the (early) reply state from lustre_pack_reply. | |
dadfcdab OD |
1127 | * (ptlrpc_send_reply takes it's own rs ref, so this is safe here) |
1128 | */ | |
d7e09d03 PT |
1129 | ptlrpc_req_drop_rs(reqcopy); |
1130 | ||
1131 | out_put: | |
1132 | class_export_rpc_dec(reqcopy->rq_export); | |
1133 | class_export_put(reqcopy->rq_export); | |
1134 | out: | |
1135 | sptlrpc_svc_ctx_decref(reqcopy); | |
ee0ec194 | 1136 | kvfree(reqmsg); |
35b2e1b7 AS |
1137 | out_free: |
1138 | ptlrpc_request_cache_free(reqcopy); | |
0a3bdb00 | 1139 | return rc; |
d7e09d03 PT |
1140 | } |
1141 | ||
1142 | /* Send early replies to everybody expiring within at_early_margin | |
dadfcdab OD |
1143 | * asking for at_extra time |
1144 | */ | |
80b6f295 | 1145 | static void ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) |
d7e09d03 PT |
1146 | { |
1147 | struct ptlrpc_at_array *array = &svcpt->scp_at_array; | |
1148 | struct ptlrpc_request *rq, *n; | |
1149 | struct list_head work_list; | |
d0bfef31 | 1150 | __u32 index, count; |
219e6de6 AB |
1151 | time64_t deadline; |
1152 | time64_t now = ktime_get_real_seconds(); | |
b2d201bd | 1153 | long delay; |
d7e09d03 | 1154 | int first, counter = 0; |
d7e09d03 PT |
1155 | |
1156 | spin_lock(&svcpt->scp_at_lock); | |
1157 | if (svcpt->scp_at_check == 0) { | |
1158 | spin_unlock(&svcpt->scp_at_lock); | |
80b6f295 | 1159 | return; |
d7e09d03 PT |
1160 | } |
1161 | delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime); | |
1162 | svcpt->scp_at_check = 0; | |
1163 | ||
1164 | if (array->paa_count == 0) { | |
1165 | spin_unlock(&svcpt->scp_at_lock); | |
80b6f295 | 1166 | return; |
d7e09d03 PT |
1167 | } |
1168 | ||
1169 | /* The timer went off, but maybe the nearest rpc already completed. */ | |
1170 | first = array->paa_deadline - now; | |
1171 | if (first > at_early_margin) { | |
1172 | /* We've still got plenty of time. Reset the timer. */ | |
1173 | ptlrpc_at_set_timer(svcpt); | |
1174 | spin_unlock(&svcpt->scp_at_lock); | |
80b6f295 | 1175 | return; |
d7e09d03 PT |
1176 | } |
1177 | ||
1178 | /* We're close to a timeout, and we don't know how much longer the | |
dadfcdab OD |
1179 | * server will take. Send early replies to everyone expiring soon. |
1180 | */ | |
d7e09d03 PT |
1181 | INIT_LIST_HEAD(&work_list); |
1182 | deadline = -1; | |
219e6de6 | 1183 | div_u64_rem(array->paa_deadline, array->paa_size, &index); |
d7e09d03 PT |
1184 | count = array->paa_count; |
1185 | while (count > 0) { | |
1186 | count -= array->paa_reqs_count[index]; | |
30c0aa39 OD |
1187 | list_for_each_entry_safe(rq, n, &array->paa_reqs_array[index], |
1188 | rq_timed_list) { | |
d7e09d03 PT |
1189 | if (rq->rq_deadline > now + at_early_margin) { |
1190 | /* update the earliest deadline */ | |
1191 | if (deadline == -1 || | |
1192 | rq->rq_deadline < deadline) | |
1193 | deadline = rq->rq_deadline; | |
1194 | break; | |
1195 | } | |
1196 | ||
1197 | ptlrpc_at_remove_timed(rq); | |
1198 | /** | |
1199 | * ptlrpc_server_drop_request() may drop | |
1200 | * refcount to 0 already. Let's check this and | |
1201 | * don't add entry to work_list | |
1202 | */ | |
1203 | if (likely(atomic_inc_not_zero(&rq->rq_refcount))) | |
1204 | list_add(&rq->rq_timed_list, &work_list); | |
1205 | counter++; | |
1206 | } | |
1207 | ||
1208 | if (++index >= array->paa_size) | |
1209 | index = 0; | |
1210 | } | |
1211 | array->paa_deadline = deadline; | |
1212 | /* we have a new earliest deadline, restart the timer */ | |
1213 | ptlrpc_at_set_timer(svcpt); | |
1214 | ||
1215 | spin_unlock(&svcpt->scp_at_lock); | |
1216 | ||
2d00bd17 JP |
1217 | CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early replies\n", |
1218 | first, at_extra, counter); | |
d7e09d03 PT |
1219 | if (first < 0) { |
1220 | /* We're already past request deadlines before we even get a | |
dadfcdab OD |
1221 | * chance to send early replies |
1222 | */ | |
2d00bd17 | 1223 | LCONSOLE_WARN("%s: This server is not able to keep up with request traffic (cpu-bound).\n", |
d7e09d03 | 1224 | svcpt->scp_service->srv_name); |
219e6de6 | 1225 | CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%ld(jiff)\n", |
d7e09d03 PT |
1226 | counter, svcpt->scp_nreqs_incoming, |
1227 | svcpt->scp_nreqs_active, | |
1228 | at_get(&svcpt->scp_at_estimate), delay); | |
1229 | } | |
1230 | ||
1231 | /* we took additional refcount so entries can't be deleted from list, no | |
dadfcdab OD |
1232 | * locking is needed |
1233 | */ | |
d7e09d03 PT |
1234 | while (!list_empty(&work_list)) { |
1235 | rq = list_entry(work_list.next, struct ptlrpc_request, | |
30c0aa39 | 1236 | rq_timed_list); |
d7e09d03 PT |
1237 | list_del_init(&rq->rq_timed_list); |
1238 | ||
1239 | if (ptlrpc_at_send_early_reply(rq) == 0) | |
1240 | ptlrpc_at_add_timed(rq); | |
1241 | ||
1242 | ptlrpc_server_drop_request(rq); | |
1243 | } | |
d7e09d03 PT |
1244 | } |
1245 | ||
1246 | /** | |
1247 | * Put the request to the export list if the request may become | |
1248 | * a high priority one. | |
1249 | */ | |
1250 | static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, | |
1251 | struct ptlrpc_request *req) | |
1252 | { | |
1253 | int rc = 0; | |
d7e09d03 PT |
1254 | |
1255 | if (svcpt->scp_service->srv_ops.so_hpreq_handler) { | |
1256 | rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); | |
1257 | if (rc < 0) | |
0a3bdb00 | 1258 | return rc; |
d7e09d03 PT |
1259 | LASSERT(rc == 0); |
1260 | } | |
1261 | if (req->rq_export && req->rq_ops) { | |
1262 | /* Perform request specific check. We should do this check | |
1263 | * before the request is added into exp_hp_rpcs list otherwise | |
dadfcdab OD |
1264 | * it may hit swab race at LU-1044. |
1265 | */ | |
d7e09d03 PT |
1266 | if (req->rq_ops->hpreq_check) { |
1267 | rc = req->rq_ops->hpreq_check(req); | |
4c43c27d AB |
1268 | if (rc == -ESTALE) { |
1269 | req->rq_status = rc; | |
1270 | ptlrpc_error(req); | |
1271 | } | |
1272 | /** can only return error, | |
1273 | * 0 for normal request, | |
1274 | * or 1 for high priority request | |
d7e09d03 | 1275 | */ |
4c43c27d | 1276 | LASSERT(rc <= 1); |
d7e09d03 PT |
1277 | } |
1278 | ||
1279 | spin_lock_bh(&req->rq_export->exp_rpc_lock); | |
30c0aa39 | 1280 | list_add(&req->rq_exp_list, &req->rq_export->exp_hp_rpcs); |
d7e09d03 PT |
1281 | spin_unlock_bh(&req->rq_export->exp_rpc_lock); |
1282 | } | |
1283 | ||
1284 | ptlrpc_nrs_req_initialize(svcpt, req, rc); | |
1285 | ||
0a3bdb00 | 1286 | return rc; |
d7e09d03 PT |
1287 | } |
1288 | ||
1289 | /** Remove the request from the export list. */ | |
1290 | static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) | |
1291 | { | |
d7e09d03 PT |
1292 | if (req->rq_export && req->rq_ops) { |
1293 | /* refresh lock timeout again so that client has more | |
dadfcdab OD |
1294 | * room to send lock cancel RPC. |
1295 | */ | |
d7e09d03 PT |
1296 | if (req->rq_ops->hpreq_fini) |
1297 | req->rq_ops->hpreq_fini(req); | |
1298 | ||
1299 | spin_lock_bh(&req->rq_export->exp_rpc_lock); | |
1300 | list_del_init(&req->rq_exp_list); | |
1301 | spin_unlock_bh(&req->rq_export->exp_rpc_lock); | |
1302 | } | |
d7e09d03 PT |
1303 | } |
1304 | ||
d7e09d03 PT |
1305 | static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, |
1306 | struct ptlrpc_request *req) | |
1307 | { | |
1308 | int rc; | |
d7e09d03 PT |
1309 | |
1310 | rc = ptlrpc_server_hpreq_init(svcpt, req); | |
1311 | if (rc < 0) | |
0a3bdb00 | 1312 | return rc; |
d7e09d03 PT |
1313 | |
1314 | ptlrpc_nrs_req_add(svcpt, req, !!rc); | |
1315 | ||
0a3bdb00 | 1316 | return 0; |
d7e09d03 PT |
1317 | } |
1318 | ||
1319 | /** | |
1320 | * Allow to handle high priority request | |
1321 | * User can call it w/o any lock but need to hold | |
1322 | * ptlrpc_service_part::scp_req_lock to get reliable result | |
1323 | */ | |
1324 | static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, | |
1325 | bool force) | |
1326 | { | |
1327 | int running = svcpt->scp_nthrs_running; | |
1328 | ||
1329 | if (!nrs_svcpt_has_hp(svcpt)) | |
1330 | return false; | |
1331 | ||
1332 | if (force) | |
1333 | return true; | |
1334 | ||
1335 | if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && | |
1336 | CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { | |
1337 | /* leave just 1 thread for normal RPCs */ | |
1338 | running = PTLRPC_NTHRS_INIT; | |
8b382089 | 1339 | if (svcpt->scp_service->srv_ops.so_hpreq_handler) |
d7e09d03 PT |
1340 | running += 1; |
1341 | } | |
1342 | ||
1343 | if (svcpt->scp_nreqs_active >= running - 1) | |
1344 | return false; | |
1345 | ||
1346 | if (svcpt->scp_nhreqs_active == 0) | |
1347 | return true; | |
1348 | ||
1349 | return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || | |
1350 | svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; | |
1351 | } | |
1352 | ||
1353 | static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, | |
1354 | bool force) | |
1355 | { | |
1356 | return ptlrpc_server_allow_high(svcpt, force) && | |
1357 | ptlrpc_nrs_req_pending_nolock(svcpt, true); | |
1358 | } | |
1359 | ||
1360 | /** | |
1361 | * Only allow normal priority requests on a service that has a high-priority | |
1362 | * queue if forced (i.e. cleanup), if there are other high priority requests | |
1363 | * already being processed (i.e. those threads can service more high-priority | |
1364 | * requests), or if there are enough idle threads that a later thread can do | |
1365 | * a high priority request. | |
1366 | * User can call it w/o any lock but need to hold | |
1367 | * ptlrpc_service_part::scp_req_lock to get reliable result | |
1368 | */ | |
1369 | static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, | |
1370 | bool force) | |
1371 | { | |
1372 | int running = svcpt->scp_nthrs_running; | |
50ffcb7e | 1373 | |
d7e09d03 PT |
1374 | if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && |
1375 | CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { | |
1376 | /* leave just 1 thread for normal RPCs */ | |
1377 | running = PTLRPC_NTHRS_INIT; | |
8b382089 | 1378 | if (svcpt->scp_service->srv_ops.so_hpreq_handler) |
d7e09d03 PT |
1379 | running += 1; |
1380 | } | |
1381 | ||
1382 | if (force || | |
1383 | svcpt->scp_nreqs_active < running - 2) | |
1384 | return true; | |
1385 | ||
1386 | if (svcpt->scp_nreqs_active >= running - 1) | |
1387 | return false; | |
1388 | ||
1389 | return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); | |
1390 | } | |
1391 | ||
1392 | static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, | |
1393 | bool force) | |
1394 | { | |
1395 | return ptlrpc_server_allow_normal(svcpt, force) && | |
1396 | ptlrpc_nrs_req_pending_nolock(svcpt, false); | |
1397 | } | |
1398 | ||
1399 | /** | |
1400 | * Returns true if there are requests available in incoming | |
1401 | * request queue for processing and it is allowed to fetch them. | |
1402 | * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock | |
1403 | * to get reliable result | |
1404 | * \see ptlrpc_server_allow_normal | |
1405 | * \see ptlrpc_server_allow high | |
1406 | */ | |
1407 | static inline bool | |
1408 | ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) | |
1409 | { | |
1410 | return ptlrpc_server_high_pending(svcpt, force) || | |
1411 | ptlrpc_server_normal_pending(svcpt, force); | |
1412 | } | |
1413 | ||
1414 | /** | |
1415 | * Fetch a request for processing from queue of unprocessed requests. | |
1416 | * Favors high-priority requests. | |
1417 | * Returns a pointer to fetched request. | |
1418 | */ | |
1419 | static struct ptlrpc_request * | |
1420 | ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) | |
1421 | { | |
1422 | struct ptlrpc_request *req = NULL; | |
d7e09d03 PT |
1423 | |
1424 | spin_lock(&svcpt->scp_req_lock); | |
1425 | ||
1426 | if (ptlrpc_server_high_pending(svcpt, force)) { | |
1427 | req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); | |
8b382089 | 1428 | if (req) { |
d7e09d03 PT |
1429 | svcpt->scp_hreq_count++; |
1430 | goto got_request; | |
1431 | } | |
1432 | } | |
1433 | ||
1434 | if (ptlrpc_server_normal_pending(svcpt, force)) { | |
1435 | req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); | |
8b382089 | 1436 | if (req) { |
d7e09d03 PT |
1437 | svcpt->scp_hreq_count = 0; |
1438 | goto got_request; | |
1439 | } | |
1440 | } | |
1441 | ||
1442 | spin_unlock(&svcpt->scp_req_lock); | |
0a3bdb00 | 1443 | return NULL; |
d7e09d03 PT |
1444 | |
1445 | got_request: | |
1446 | svcpt->scp_nreqs_active++; | |
1447 | if (req->rq_hp) | |
1448 | svcpt->scp_nhreqs_active++; | |
1449 | ||
1450 | spin_unlock(&svcpt->scp_req_lock); | |
1451 | ||
1452 | if (likely(req->rq_export)) | |
1453 | class_export_rpc_inc(req->rq_export); | |
1454 | ||
0a3bdb00 | 1455 | return req; |
d7e09d03 PT |
1456 | } |
1457 | ||
1458 | /** | |
1459 | * Handle freshly incoming reqs, add to timed early reply list, | |
1460 | * pass on to regular request queue. | |
1461 | * All incoming requests pass through here before getting into | |
1462 | * ptlrpc_server_handle_req later on. | |
1463 | */ | |
1464 | static int | |
1465 | ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, | |
1466 | struct ptlrpc_thread *thread) | |
1467 | { | |
d0bfef31 CH |
1468 | struct ptlrpc_service *svc = svcpt->scp_service; |
1469 | struct ptlrpc_request *req; | |
1470 | __u32 deadline; | |
1471 | int rc; | |
d7e09d03 PT |
1472 | |
1473 | spin_lock(&svcpt->scp_lock); | |
1474 | if (list_empty(&svcpt->scp_req_incoming)) { | |
1475 | spin_unlock(&svcpt->scp_lock); | |
0a3bdb00 | 1476 | return 0; |
d7e09d03 PT |
1477 | } |
1478 | ||
1479 | req = list_entry(svcpt->scp_req_incoming.next, | |
30c0aa39 | 1480 | struct ptlrpc_request, rq_list); |
d7e09d03 PT |
1481 | list_del_init(&req->rq_list); |
1482 | svcpt->scp_nreqs_incoming--; | |
1483 | /* Consider this still a "queued" request as far as stats are | |
dadfcdab OD |
1484 | * concerned |
1485 | */ | |
d7e09d03 PT |
1486 | spin_unlock(&svcpt->scp_lock); |
1487 | ||
1488 | /* go through security check/transform */ | |
1489 | rc = sptlrpc_svc_unwrap_request(req); | |
1490 | switch (rc) { | |
1491 | case SECSVC_OK: | |
1492 | break; | |
1493 | case SECSVC_COMPLETE: | |
1494 | target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); | |
1495 | goto err_req; | |
1496 | case SECSVC_DROP: | |
1497 | goto err_req; | |
1498 | default: | |
1499 | LBUG(); | |
1500 | } | |
1501 | ||
1502 | /* | |
1503 | * for null-flavored rpc, msg has been unpacked by sptlrpc, although | |
1504 | * redo it wouldn't be harmful. | |
1505 | */ | |
1506 | if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { | |
1507 | rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); | |
1508 | if (rc != 0) { | |
b0f5aad5 GKH |
1509 | CERROR("error unpacking request: ptl %d from %s x%llu\n", |
1510 | svc->srv_req_portal, libcfs_id2str(req->rq_peer), | |
1511 | req->rq_xid); | |
d7e09d03 PT |
1512 | goto err_req; |
1513 | } | |
1514 | } | |
1515 | ||
1516 | rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); | |
1517 | if (rc) { | |
b0f5aad5 GKH |
1518 | CERROR("error unpacking ptlrpc body: ptl %d from %s x%llu\n", |
1519 | svc->srv_req_portal, libcfs_id2str(req->rq_peer), | |
1520 | req->rq_xid); | |
d7e09d03 PT |
1521 | goto err_req; |
1522 | } | |
1523 | ||
1524 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && | |
1525 | lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) { | |
b0f5aad5 | 1526 | CERROR("drop incoming rpc opc %u, x%llu\n", |
d7e09d03 PT |
1527 | cfs_fail_val, req->rq_xid); |
1528 | goto err_req; | |
1529 | } | |
1530 | ||
1531 | rc = -EINVAL; | |
1532 | if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { | |
1533 | CERROR("wrong packet type received (type=%u) from %s\n", | |
1534 | lustre_msg_get_type(req->rq_reqmsg), | |
1535 | libcfs_id2str(req->rq_peer)); | |
1536 | goto err_req; | |
1537 | } | |
1538 | ||
3949015e | 1539 | switch (lustre_msg_get_opc(req->rq_reqmsg)) { |
d7e09d03 PT |
1540 | case MDS_WRITEPAGE: |
1541 | case OST_WRITE: | |
1542 | req->rq_bulk_write = 1; | |
1543 | break; | |
1544 | case MDS_READPAGE: | |
1545 | case OST_READ: | |
1546 | case MGS_CONFIG_READ: | |
1547 | req->rq_bulk_read = 1; | |
1548 | break; | |
1549 | } | |
1550 | ||
b0f5aad5 | 1551 | CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid); |
d7e09d03 PT |
1552 | |
1553 | req->rq_export = class_conn2export( | |
1554 | lustre_msg_get_handle(req->rq_reqmsg)); | |
1555 | if (req->rq_export) { | |
1556 | rc = ptlrpc_check_req(req); | |
1557 | if (rc == 0) { | |
1558 | rc = sptlrpc_target_export_check(req->rq_export, req); | |
1559 | if (rc) | |
2d00bd17 | 1560 | DEBUG_REQ(D_ERROR, req, "DROPPING req with illegal security flavor,"); |
d7e09d03 PT |
1561 | } |
1562 | ||
1563 | if (rc) | |
1564 | goto err_req; | |
d7e09d03 PT |
1565 | } |
1566 | ||
1567 | /* req_in handling should/must be fast */ | |
219e6de6 | 1568 | if (ktime_get_real_seconds() - req->rq_arrival_time.tv_sec > 5) |
75e3863b JS |
1569 | DEBUG_REQ(D_WARNING, req, "Slow req_in handling %llds", |
1570 | (s64)(ktime_get_real_seconds() - | |
1571 | req->rq_arrival_time.tv_sec)); | |
d7e09d03 PT |
1572 | |
1573 | /* Set rpc server deadline and add it to the timed list */ | |
1574 | deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & | |
1575 | MSGHDR_AT_SUPPORT) ? | |
1576 | /* The max time the client expects us to take */ | |
1577 | lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; | |
1578 | req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; | |
1579 | if (unlikely(deadline == 0)) { | |
1580 | DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); | |
1581 | goto err_req; | |
1582 | } | |
1583 | ||
1584 | req->rq_svc_thread = thread; | |
70187506 MP |
1585 | if (thread) { |
1586 | /* initialize request session, it is needed for request | |
1587 | * processing by target | |
1588 | */ | |
1589 | rc = lu_context_init(&req->rq_session, | |
1590 | LCT_SERVER_SESSION | LCT_NOREF); | |
1591 | if (rc) { | |
1592 | CERROR("%s: failure to initialize session: rc = %d\n", | |
1593 | thread->t_name, rc); | |
1594 | goto err_req; | |
1595 | } | |
1596 | req->rq_session.lc_thread = thread; | |
1597 | lu_context_enter(&req->rq_session); | |
1598 | req->rq_svc_thread->t_env->le_ses = &req->rq_session; | |
1599 | } | |
d7e09d03 PT |
1600 | |
1601 | ptlrpc_at_add_timed(req); | |
1602 | ||
1603 | /* Move it over to the request processing queue */ | |
1604 | rc = ptlrpc_server_request_add(svcpt, req); | |
1605 | if (rc) | |
a9b3e8f3 | 1606 | goto err_req; |
d7e09d03 PT |
1607 | |
1608 | wake_up(&svcpt->scp_waitq); | |
0a3bdb00 | 1609 | return 1; |
d7e09d03 PT |
1610 | |
1611 | err_req: | |
1612 | ptlrpc_server_finish_request(svcpt, req); | |
1613 | ||
0a3bdb00 | 1614 | return 1; |
d7e09d03 PT |
1615 | } |
1616 | ||
1617 | /** | |
1618 | * Main incoming request handling logic. | |
1619 | * Calls handler function from service to do actual processing. | |
1620 | */ | |
static int
ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
			     struct ptlrpc_thread *thread)
{
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct ptlrpc_request *request;
	struct timespec64 work_start;
	struct timespec64 work_end;
	struct timespec64 timediff;
	struct timespec64 arrived;
	unsigned long timediff_usecs;
	unsigned long arrived_usecs;
	int fail_opc = 0;

	/* Pull the next (possibly high-priority) request off the service
	 * partition queues; returns NULL when there is nothing to do.
	 */
	request = ptlrpc_server_request_get(svcpt, false);
	if (!request)
		return 0;

	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
		fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
	else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
		fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT;

	/* Fault injection: only delay requests that have HP ops attached */
	if (unlikely(fail_opc)) {
		if (request->rq_export && request->rq_ops)
			OBD_FAIL_TIMEOUT(fail_opc, 4);
	}

	ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET);

	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG))
		libcfs_debug_dumplog();

	/* Account how long the request sat queued before a thread got it */
	ktime_get_real_ts64(&work_start);
	timediff = timespec64_sub(work_start, request->rq_arrival_time);
	timediff_usecs = timediff.tv_sec * USEC_PER_SEC +
			 timediff.tv_nsec / NSEC_PER_USEC;
	if (likely(svc->srv_stats)) {
		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR,
				    timediff_usecs);
		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR,
				    svcpt->scp_nreqs_incoming);
		lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR,
				    svcpt->scp_nreqs_active);
		lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT,
				    at_get(&svcpt->scp_at_estimate));
	}

	/* Drop requests whose export fails sanity checking */
	if (likely(request->rq_export)) {
		if (unlikely(ptlrpc_check_req(request)))
			goto put_conn;
	}

	/* Discard requests queued for longer than the deadline.
	 * The deadline is increased if we send an early reply.
	 */
	if (ktime_get_real_seconds() > request->rq_deadline) {
		DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline %lld:%llds ago\n",
			  libcfs_id2str(request->rq_peer),
			  request->rq_deadline -
			  request->rq_arrival_time.tv_sec,
			  ktime_get_real_seconds() - request->rq_deadline);
		goto put_conn;
	}

	CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d\n",
	       current_comm(),
	       (request->rq_export ?
		(char *)request->rq_export->exp_client_uuid.uuid : "0"),
	       (request->rq_export ?
		atomic_read(&request->rq_export->exp_refcount) : -99),
	       lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
	       libcfs_id2str(request->rq_peer),
	       lustre_msg_get_opc(request->rq_reqmsg));

	/* Fault injection pause point; pings are exempt so the connection
	 * is still seen as alive while other RPCs are being delayed.
	 */
	if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
		CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);

	CDEBUG(D_NET, "got req %llu\n", request->rq_xid);

	/* re-assign request and sesson thread to the current one */
	request->rq_svc_thread = thread;
	if (thread) {
		LASSERT(request->rq_session.lc_thread);
		request->rq_session.lc_thread = thread;
		request->rq_session.lc_cookie = 0x55;
		thread->t_env->le_ses = &request->rq_session;
	}

	/* Hand off to the service-specific request handler */
	svc->srv_ops.so_req_handler(request);

	ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE);

put_conn:
	/* Warn if handling overran the (possibly extended) deadline */
	if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) {
		DEBUG_REQ(D_WARNING, request,
			  "Request took longer than estimated (%lld:%llds); "
			  "client may timeout.",
			  (s64)request->rq_deadline -
			  request->rq_arrival_time.tv_sec,
			  (s64)ktime_get_real_seconds() - request->rq_deadline);
	}

	/* Record per-request service time (handler only, and total
	 * arrival-to-completion) for the trace line and the stats below.
	 */
	ktime_get_real_ts64(&work_end);
	timediff = timespec64_sub(work_end, work_start);
	timediff_usecs = timediff.tv_sec * USEC_PER_SEC +
			 timediff.tv_nsec / NSEC_PER_USEC;
	arrived = timespec64_sub(work_end, request->rq_arrival_time);
	arrived_usecs = arrived.tv_sec * USEC_PER_SEC +
			arrived.tv_nsec / NSEC_PER_USEC;
	CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request processed in %ldus (%ldus total) trans %llu rc %d/%d\n",
	       current_comm(),
	       (request->rq_export ?
		(char *)request->rq_export->exp_client_uuid.uuid : "0"),
	       (request->rq_export ?
		atomic_read(&request->rq_export->exp_refcount) : -99),
	       lustre_msg_get_status(request->rq_reqmsg),
	       request->rq_xid,
	       libcfs_id2str(request->rq_peer),
	       lustre_msg_get_opc(request->rq_reqmsg),
	       timediff_usecs,
	       arrived_usecs,
	       (request->rq_repmsg ?
		lustre_msg_get_transno(request->rq_repmsg) :
		request->rq_transno),
	       request->rq_status,
	       (request->rq_repmsg ?
		lustre_msg_get_status(request->rq_repmsg) : -999));
	if (likely(svc->srv_stats && request->rq_reqmsg)) {
		__u32 op = lustre_msg_get_opc(request->rq_reqmsg);
		int opc = opcode_offset(op);

		/* LDLM_ENQUEUE/MDS_REINT are presumably counted elsewhere
		 * with finer granularity — TODO confirm against the stats
		 * setup code.
		 */
		if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {
			LASSERT(opc < LUSTRE_MAX_OPCODES);
			lprocfs_counter_add(svc->srv_stats,
					    opc + EXTRA_MAX_OPCODES,
					    timediff_usecs);
		}
	}
	if (unlikely(request->rq_early_count)) {
		DEBUG_REQ(D_ADAPTTO, request,
			  "sent %d early replies before finishing in %llds",
			  request->rq_early_count,
			  (s64)work_end.tv_sec -
			  request->rq_arrival_time.tv_sec);
	}

	ptlrpc_server_finish_active_request(svcpt, request);

	return 1;
}
1771 | ||
/**
 * An internal function to process a single reply state object.
 *
 * Removes \a rs from its export's lists, releases the locks "stolen"
 * into the reply state, and drops the reply-state/export references
 * once the reply is both committed to disk and off the network.
 * Always returns 1 (one reply state handled).
 */
static int
ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
{
	struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct obd_export *exp;
	int nlocks;
	int been_handled;

	exp = rs->rs_export;

	LASSERT(rs->rs_difficult);
	LASSERT(rs->rs_scheduled);
	LASSERT(list_empty(&rs->rs_list));

	spin_lock(&exp->exp_lock);
	/* Noop if removed already */
	list_del_init(&rs->rs_exp_list);
	spin_unlock(&exp->exp_lock);

	/* The disk commit callback holds exp_uncommitted_replies_lock while it
	 * iterates over newly committed replies, removing them from
	 * exp_uncommitted_replies. It then drops this lock and schedules the
	 * replies it found for handling here.
	 *
	 * We can avoid contention for exp_uncommitted_replies_lock between the
	 * HRT threads and further commit callbacks by checking rs_committed
	 * which is set in the commit callback while it holds both
	 * rs_lock and exp_uncommitted_reples.
	 *
	 * If we see rs_committed clear, the commit callback _may_ not have
	 * handled this reply yet and we race with it to grab
	 * exp_uncommitted_replies_lock before removing the reply from
	 * exp_uncommitted_replies. Note that if we lose the race and the
	 * reply has already been removed, list_del_init() is a noop.
	 *
	 * If we see rs_committed set, we know the commit callback is handling,
	 * or has handled this reply since store reordering might allow us to
	 * see rs_committed set out of sequence. But since this is done
	 * holding rs_lock, we can be sure it has all completed once we hold
	 * rs_lock, which we do right next.
	 */
	if (!rs->rs_committed) {
		spin_lock(&exp->exp_uncommitted_replies_lock);
		list_del_init(&rs->rs_obd_list);
		spin_unlock(&exp->exp_uncommitted_replies_lock);
	}

	spin_lock(&rs->rs_lock);

	been_handled = rs->rs_handled;
	rs->rs_handled = 1;

	nlocks = rs->rs_nlocks;	/* atomic "steal", but */
	rs->rs_nlocks = 0;	/* locks still on rs_locks! */

	if (nlocks == 0 && !been_handled) {
		/* If we see this, we should already have seen the warning
		 * in mds_steal_ack_locks()
		 */
		CDEBUG(D_HA, "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n",
		       rs,
		       rs->rs_xid, rs->rs_transno, rs->rs_opc,
		       libcfs_nid2str(exp->exp_connection->c_peer.nid));
	}

	/* Unlinking the reply MD and dropping DLM locks can block, so do
	 * them with rs_lock dropped, then retake it to re-check rs_on_net.
	 */
	if ((!been_handled && rs->rs_on_net) || nlocks > 0) {
		spin_unlock(&rs->rs_lock);

		if (!been_handled && rs->rs_on_net) {
			LNetMDUnlink(rs->rs_md_h);
			/* Ignore return code; we're racing with completion */
		}

		while (nlocks-- > 0)
			ldlm_lock_decref(&rs->rs_locks[nlocks],
					 rs->rs_modes[nlocks]);

		spin_lock(&rs->rs_lock);
	}

	rs->rs_scheduled = 0;

	if (!rs->rs_on_net) {
		/* Off the net */
		spin_unlock(&rs->rs_lock);

		class_export_put(exp);
		rs->rs_export = NULL;
		ptlrpc_rs_decref(rs);
		/* Wake shutdown waiters once the last difficult reply on
		 * this partition is gone.
		 */
		if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) &&
		    svc->srv_is_stopping)
			wake_up_all(&svcpt->scp_waitq);
		return 1;
	}

	/* still on the net; callback will schedule */
	spin_unlock(&rs->rs_lock);
	return 1;
}
1875 | ||
d7e09d03 PT |
1876 | static void |
1877 | ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) | |
1878 | { | |
1879 | int avail = svcpt->scp_nrqbds_posted; | |
1880 | int low_water = test_req_buffer_pressure ? 0 : | |
1881 | svcpt->scp_service->srv_nbuf_per_group / 2; | |
1882 | ||
1883 | /* NB I'm not locking; just looking. */ | |
1884 | ||
1885 | /* CAVEAT EMPTOR: We might be allocating buffers here because we've | |
1886 | * allowed the request history to grow out of control. We could put a | |
1887 | * sanity check on that here and cull some history if we need the | |
dadfcdab OD |
1888 | * space. |
1889 | */ | |
d7e09d03 PT |
1890 | |
1891 | if (avail <= low_water) | |
1892 | ptlrpc_grow_req_bufs(svcpt, 1); | |
1893 | ||
1894 | if (svcpt->scp_service->srv_stats) { | |
1895 | lprocfs_counter_add(svcpt->scp_service->srv_stats, | |
1896 | PTLRPC_REQBUF_AVAIL_CNTR, avail); | |
1897 | } | |
1898 | } | |
1899 | ||
1900 | static int | |
1901 | ptlrpc_retry_rqbds(void *arg) | |
1902 | { | |
864ef621 | 1903 | struct ptlrpc_service_part *svcpt = arg; |
d7e09d03 PT |
1904 | |
1905 | svcpt->scp_rqbd_timeout = 0; | |
1906 | return -ETIMEDOUT; | |
1907 | } | |
1908 | ||
1909 | static inline int | |
1910 | ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) | |
1911 | { | |
1912 | return svcpt->scp_nreqs_active < | |
1913 | svcpt->scp_nthrs_running - 1 - | |
1914 | (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); | |
1915 | } | |
1916 | ||
1917 | /** | |
1918 | * allowed to create more threads | |
1919 | * user can call it w/o any lock but need to hold | |
1920 | * ptlrpc_service_part::scp_lock to get reliable result | |
1921 | */ | |
1922 | static inline int | |
1923 | ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) | |
1924 | { | |
1925 | return svcpt->scp_nthrs_running + | |
1926 | svcpt->scp_nthrs_starting < | |
1927 | svcpt->scp_service->srv_nthrs_cpt_limit; | |
1928 | } | |
1929 | ||
/**
 * too many requests and allowed to create more threads
 */
static inline int
ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt)
{
	if (ptlrpc_threads_enough(svcpt))
		return 0;

	return ptlrpc_threads_increasable(svcpt);
}
1939 | ||
1940 | static inline int | |
1941 | ptlrpc_thread_stopping(struct ptlrpc_thread *thread) | |
1942 | { | |
1943 | return thread_is_stopping(thread) || | |
1944 | thread->t_svcpt->scp_service->srv_is_stopping; | |
1945 | } | |
1946 | ||
1947 | static inline int | |
1948 | ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) | |
1949 | { | |
1950 | return !list_empty(&svcpt->scp_rqbd_idle) && | |
1951 | svcpt->scp_rqbd_timeout == 0; | |
1952 | } | |
1953 | ||
/*
 * True when the adaptive-timeout (AT) state of the partition needs
 * checking; the service thread then calls ptlrpc_at_check_timed().
 */
static inline int
ptlrpc_at_check(struct ptlrpc_service_part *svcpt)
{
	return svcpt->scp_at_check;
}
1959 | ||
1960 | /** | |
1961 | * requests wait on preprocessing | |
1962 | * user can call it w/o any lock but need to hold | |
1963 | * ptlrpc_service_part::scp_lock to get reliable result | |
1964 | */ | |
1965 | static inline int | |
1966 | ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) | |
1967 | { | |
1968 | return !list_empty(&svcpt->scp_req_incoming); | |
1969 | } | |
1970 | ||
/*
 * Sleep until the service thread has something to do: stop request,
 * incoming or queued requests, idle rqbds to repost, or an AT check.
 * Returns -EINTR when the thread should stop, 0 otherwise.  The wait is
 * bounded by scp_rqbd_timeout (ptlrpc_retry_rqbds clears it on expiry).
 */
static __attribute__((__noinline__)) int
ptlrpc_wait_event(struct ptlrpc_service_part *svcpt,
		  struct ptlrpc_thread *thread)
{
	/* Don't exit while there are replies to be handled */
	struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout,
					     ptlrpc_retry_rqbds, svcpt);

	/* XXX: Add this back when libcfs watchdog is merged upstream
	lc_watchdog_disable(thread->t_watchdog);
	*/

	cond_resched();

	/* Exclusive wait: only one sleeping thread is woken per event */
	l_wait_event_exclusive_head(svcpt->scp_waitq,
				    ptlrpc_thread_stopping(thread) ||
				    ptlrpc_server_request_incoming(svcpt) ||
				    ptlrpc_server_request_pending(svcpt,
								  false) ||
				    ptlrpc_rqbd_pending(svcpt) ||
				    ptlrpc_at_check(svcpt), &lwi);

	if (ptlrpc_thread_stopping(thread))
		return -EINTR;

	/*
	lc_watchdog_touch(thread->t_watchdog,
			  ptlrpc_server_get_timeout(svcpt));
	*/
	return 0;
}
2002 | ||
/**
 * Main thread body for service threads.
 * Waits in a loop waiting for new requests to process to appear.
 * Every time an incoming requests is added to its queue, a waitq
 * is woken up and one of the threads will handle it.
 *
 * Returns the final rc, which is also stored in thread->t_id before
 * SVC_STOPPED is set so the creator can observe a failed start.
 */
static int ptlrpc_main(void *arg)
{
	struct ptlrpc_thread *thread = arg;
	struct ptlrpc_service_part *svcpt = thread->t_svcpt;
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct ptlrpc_reply_state *rs;
	struct group_info *ginfo = NULL;
	struct lu_env *env;
	int counter = 0, rc = 0;

	thread->t_pid = current_pid();
	unshare_fs_struct();

	/* NB: we will call cfs_cpt_bind() for all threads, because we
	 * might want to run lustre server only on a subset of system CPUs,
	 * in that case ->scp_cpt is CFS_CPT_ANY
	 */
	rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt);
	if (rc != 0) {
		/* binding failure is non-fatal; the thread just runs
		 * unbound
		 */
		CWARN("%s: failed to bind %s on CPT %d\n",
		      svc->srv_name, thread->t_name, svcpt->scp_cpt);
	}

	/* Drop supplementary groups for this kernel thread */
	ginfo = groups_alloc(0);
	if (!ginfo) {
		rc = -ENOMEM;
		goto out;
	}

	set_current_groups(ginfo);
	put_group_info(ginfo);

	/* Service-specific per-thread setup hook */
	if (svc->srv_ops.so_thr_init) {
		rc = svc->srv_ops.so_thr_init(thread);
		if (rc)
			goto out;
	}

	env = kzalloc(sizeof(*env), GFP_NOFS);
	if (!env) {
		rc = -ENOMEM;
		goto out_srv_fini;
	}

	rc = lu_context_init(&env->le_ctx,
			     svc->srv_ctx_tags | LCT_REMEMBER | LCT_NOREF);
	if (rc)
		goto out_srv_fini;

	thread->t_env = env;
	env->le_ctx.lc_thread = thread;
	env->le_ctx.lc_cookie = 0x6;

	/* Post any request buffers that are sitting idle before we start */
	while (!list_empty(&svcpt->scp_rqbd_idle)) {
		rc = ptlrpc_server_post_idle_rqbds(svcpt);
		if (rc >= 0)
			continue;

		CERROR("Failed to post rqbd for %s on CPT %d: %d\n",
		       svc->srv_name, svcpt->scp_cpt, rc);
		goto out_srv_fini;
	}

	/* Alloc reply state structure for this one */
	rs = libcfs_kvzalloc(svc->srv_max_reply_size, GFP_NOFS);
	if (!rs) {
		rc = -ENOMEM;
		goto out_srv_fini;
	}

	spin_lock(&svcpt->scp_lock);

	LASSERT(thread_is_starting(thread));
	thread_clear_flags(thread, SVC_STARTING);

	/* thread starts are serialized, so only one can be in flight */
	LASSERT(svcpt->scp_nthrs_starting == 1);
	svcpt->scp_nthrs_starting--;

	/* SVC_STOPPING may already be set here if someone else is trying
	 * to stop the service while this new thread has been dynamically
	 * forked. We still set SVC_RUNNING to let our creator know that
	 * we are now running, however we will exit as soon as possible
	 */
	thread_add_flags(thread, SVC_RUNNING);
	svcpt->scp_nthrs_running++;
	spin_unlock(&svcpt->scp_lock);

	/* wake up our creator in case he's still waiting. */
	wake_up(&thread->t_ctl_waitq);

	/*
	thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt),
					     NULL, NULL);
	*/

	/* Donate our reply state to the partition's idle pool */
	spin_lock(&svcpt->scp_rep_lock);
	list_add(&rs->rs_list, &svcpt->scp_rep_idle);
	wake_up(&svcpt->scp_rep_waitq);
	spin_unlock(&svcpt->scp_rep_lock);

	CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id,
	       svcpt->scp_nthrs_running);

	/* XXX maintain a list of all managed devices: insert here */
	while (!ptlrpc_thread_stopping(thread)) {
		if (ptlrpc_wait_event(svcpt, thread))
			break;

		ptlrpc_check_rqbd_pool(svcpt);

		if (ptlrpc_threads_need_create(svcpt)) {
			/* Ignore return code - we tried... */
			ptlrpc_start_thread(svcpt, 0);
		}

		/* Process all incoming reqs before handling any */
		if (ptlrpc_server_request_incoming(svcpt)) {
			lu_context_enter(&env->le_ctx);
			env->le_ses = NULL;
			ptlrpc_server_handle_req_in(svcpt, thread);
			lu_context_exit(&env->le_ctx);

			/* but limit ourselves in case of flood */
			if (counter++ < 100)
				continue;
			counter = 0;
		}

		if (ptlrpc_at_check(svcpt))
			ptlrpc_at_check_timed(svcpt);

		if (ptlrpc_server_request_pending(svcpt, false)) {
			lu_context_enter(&env->le_ctx);
			ptlrpc_server_handle_request(svcpt, thread);
			lu_context_exit(&env->le_ctx);
		}

		if (ptlrpc_rqbd_pending(svcpt) &&
		    ptlrpc_server_post_idle_rqbds(svcpt) < 0) {
			/* I just failed to repost request buffers.
			 * Wait for a timeout (unless something else
			 * happens) before I try again
			 */
			svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10;
			CDEBUG(D_RPCTRACE, "Posted buffers: %d\n",
			       svcpt->scp_nrqbds_posted);
		}
	}

	/*
	lc_watchdog_delete(thread->t_watchdog);
	thread->t_watchdog = NULL;
	*/

out_srv_fini:
	/*
	 * deconstruct service specific state created by ptlrpc_start_thread()
	 */
	if (svc->srv_ops.so_thr_done)
		svc->srv_ops.so_thr_done(thread);

	/* NOTE(review): if lu_context_init() failed we still reach
	 * lu_context_fini() here — presumably fini is safe on a
	 * failed-init context; verify against lu_object.c.
	 */
	if (env) {
		lu_context_fini(&env->le_ctx);
		kfree(env);
	}
out:
	CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n",
	       thread, thread->t_pid, thread->t_id, rc);

	spin_lock(&svcpt->scp_lock);
	if (thread_test_and_clear_flags(thread, SVC_STARTING))
		svcpt->scp_nthrs_starting--;

	if (thread_test_and_clear_flags(thread, SVC_RUNNING)) {
		/* must know immediately */
		svcpt->scp_nthrs_running--;
	}

	/* report the exit code back through t_id for the creator */
	thread->t_id = rc;
	thread_add_flags(thread, SVC_STOPPED);

	wake_up(&thread->t_ctl_waitq);
	spin_unlock(&svcpt->scp_lock);

	return rc;
}
2195 | ||
2196 | static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, | |
2197 | struct list_head *replies) | |
2198 | { | |
2199 | int result; | |
2200 | ||
2201 | spin_lock(&hrt->hrt_lock); | |
2202 | ||
2203 | list_splice_init(&hrt->hrt_queue, replies); | |
2204 | result = ptlrpc_hr.hr_stopping || !list_empty(replies); | |
2205 | ||
2206 | spin_unlock(&hrt->hrt_lock); | |
2207 | return result; | |
2208 | } | |
2209 | ||
/**
 * Main body of "handle reply" function.
 * It processes acked reply states
 *
 * Loops until ptlrpc_hr.hr_stopping is set: drains its per-thread queue
 * (via hrt_dont_sleep) and passes each reply state to ptlrpc_handle_rs.
 * Start/stop are signalled through hrp_nstarted/hrp_nstopped counters
 * and ptlrpc_hr.hr_waitq.
 */
static int ptlrpc_hr_main(void *arg)
{
	struct ptlrpc_hr_thread *hrt = arg;
	struct ptlrpc_hr_partition *hrp = hrt->hrt_partition;
	LIST_HEAD(replies);
	char threadname[20];
	int rc;

	snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d",
		 hrp->hrp_cpt, hrt->hrt_id);
	unshare_fs_struct();

	/* CPT binding failure is non-fatal; just warn and run unbound */
	rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt);
	if (rc != 0) {
		CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n",
		      threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc);
	}

	/* announce ourselves to ptlrpc_start_hr_threads() */
	atomic_inc(&hrp->hrp_nstarted);
	wake_up(&ptlrpc_hr.hr_waitq);

	while (!ptlrpc_hr.hr_stopping) {
		l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies));

		/* drain from the tail of the spliced list */
		while (!list_empty(&replies)) {
			struct ptlrpc_reply_state *rs;

			rs = list_entry(replies.prev, struct ptlrpc_reply_state,
					rs_list);
			list_del_init(&rs->rs_list);
			ptlrpc_handle_rs(rs);
		}
	}

	/* announce our exit to ptlrpc_stop_hr_threads() */
	atomic_inc(&hrp->hrp_nstopped);
	wake_up(&ptlrpc_hr.hr_waitq);

	return 0;
}
2253 | ||
/*
 * Stop all reply-handler threads: set the global stop flag, wake every
 * thread so it notices, then wait per partition until the number of
 * stopped threads matches the number that ever started.
 */
static void ptlrpc_stop_hr_threads(void)
{
	struct ptlrpc_hr_partition *hrp;
	int i;
	int j;

	ptlrpc_hr.hr_stopping = 1;

	/* first pass: wake everyone so the stop flag is observed */
	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
		if (!hrp->hrp_thrs)
			continue; /* uninitialized */
		for (j = 0; j < hrp->hrp_nthrs; j++)
			wake_up_all(&hrp->hrp_thrs[j].hrt_waitq);
	}

	/* second pass: wait for every started thread to report stopped */
	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
		if (!hrp->hrp_thrs)
			continue; /* uninitialized */
		wait_event(ptlrpc_hr.hr_waitq,
			   atomic_read(&hrp->hrp_nstopped) ==
			   atomic_read(&hrp->hrp_nstarted));
	}
}
2277 | ||
/*
 * Start every reply-handler thread on every CPT partition.
 * On any kthread_run failure, wait for the threads already spawned on
 * that partition to check in, tear everything down with
 * ptlrpc_stop_hr_threads(), and return the error.
 */
static int ptlrpc_start_hr_threads(void)
{
	struct ptlrpc_hr_partition *hrp;
	int i;
	int j;

	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
		int rc = 0;

		for (j = 0; j < hrp->hrp_nthrs; j++) {
			struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j];
			struct task_struct *task;

			task = kthread_run(ptlrpc_hr_main,
					   &hrp->hrp_thrs[j],
					   "ptlrpc_hr%02d_%03d",
					   hrp->hrp_cpt, hrt->hrt_id);
			if (IS_ERR(task)) {
				rc = PTR_ERR(task);
				break;
			}
		}
		/* j threads were successfully launched (j == hrp_nthrs on
		 * full success); wait for all of them to register
		 */
		wait_event(ptlrpc_hr.hr_waitq,
			   atomic_read(&hrp->hrp_nstarted) == j);

		if (rc < 0) {
			CERROR("cannot start reply handler thread %d:%d: rc = %d\n",
			       i, j, rc);
			ptlrpc_stop_hr_threads();
			return rc;
		}
	}
	return 0;
}
2312 | ||
/*
 * Stop all service threads of one partition: flag each thread
 * SVC_STOPPING, wake them, then wait for each to reach SVC_STOPPED.
 * scp_lock is dropped while waiting and retaken to rescan the list;
 * stopped threads are collected on a private zombie list and freed
 * after the lock is released.
 */
static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt)
{
	struct l_wait_info lwi = { 0 };
	struct ptlrpc_thread *thread;
	LIST_HEAD(zombie);

	CDEBUG(D_INFO, "Stopping threads for service %s\n",
	       svcpt->scp_service->srv_name);

	spin_lock(&svcpt->scp_lock);
	/* let the thread know that we would like it to stop asap */
	list_for_each_entry(thread, &svcpt->scp_threads, t_link) {
		CDEBUG(D_INFO, "Stopping thread %s #%u\n",
		       svcpt->scp_service->srv_thread_name, thread->t_id);
		thread_add_flags(thread, SVC_STOPPING);
	}

	wake_up_all(&svcpt->scp_waitq);

	while (!list_empty(&svcpt->scp_threads)) {
		thread = list_entry(svcpt->scp_threads.next,
				    struct ptlrpc_thread, t_link);
		if (thread_is_stopped(thread)) {
			/* move to the zombie list; freed below without
			 * holding scp_lock
			 */
			list_del(&thread->t_link);
			list_add(&thread->t_link, &zombie);
			continue;
		}
		/* drop the lock while sleeping on this thread's exit */
		spin_unlock(&svcpt->scp_lock);

		CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n",
		       svcpt->scp_service->srv_thread_name, thread->t_id);
		l_wait_event(thread->t_ctl_waitq,
			     thread_is_stopped(thread), &lwi);

		spin_lock(&svcpt->scp_lock);
	}

	spin_unlock(&svcpt->scp_lock);

	while (!list_empty(&zombie)) {
		thread = list_entry(zombie.next,
				    struct ptlrpc_thread, t_link);
		list_del(&thread->t_link);
		kfree(thread);
	}
}
2359 | ||
2360 | /** | |
2361 | * Stops all threads of a particular service \a svc | |
2362 | */ | |
230a8da1 | 2363 | static void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) |
d7e09d03 PT |
2364 | { |
2365 | struct ptlrpc_service_part *svcpt; | |
d0bfef31 | 2366 | int i; |
d7e09d03 PT |
2367 | |
2368 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
8b382089 | 2369 | if (svcpt->scp_service) |
d7e09d03 PT |
2370 | ptlrpc_svcpt_stop_threads(svcpt); |
2371 | } | |
d7e09d03 | 2372 | } |
d7e09d03 PT |
2373 | |
2374 | int ptlrpc_start_threads(struct ptlrpc_service *svc) | |
2375 | { | |
d0bfef31 CH |
2376 | int rc = 0; |
2377 | int i; | |
2378 | int j; | |
d7e09d03 PT |
2379 | |
2380 | /* We require 2 threads min, see note in ptlrpc_server_handle_request */ | |
2381 | LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); | |
2382 | ||
2383 | for (i = 0; i < svc->srv_ncpts; i++) { | |
2384 | for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { | |
2385 | rc = ptlrpc_start_thread(svc->srv_parts[i], 1); | |
2386 | if (rc == 0) | |
2387 | continue; | |
2388 | ||
2389 | if (rc != -EMFILE) | |
2390 | goto failed; | |
2391 | /* We have enough threads, don't start more. b=15759 */ | |
2392 | break; | |
2393 | } | |
2394 | } | |
2395 | ||
0a3bdb00 | 2396 | return 0; |
d7e09d03 PT |
2397 | failed: |
2398 | CERROR("cannot start %s thread #%d_%d: rc %d\n", | |
2399 | svc->srv_thread_name, i, j, rc); | |
2400 | ptlrpc_stop_all_threads(svc); | |
0a3bdb00 | 2401 | return rc; |
d7e09d03 | 2402 | } |
d7e09d03 PT |
2403 | |
/*
 * Start one new service thread on partition \a svcpt.
 * If \a wait is non-zero, block until the thread is running (or has
 * stopped) and return 0 on success / the thread's exit code on failure;
 * if \a wait is zero, return immediately after kthread_run.
 * Returns -ESRCH while stopping, -EMFILE at the thread limit, -EAGAIN
 * when a non-waiting caller loses the start-serialization race.
 */
int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait)
{
	struct l_wait_info lwi = { 0 };
	struct ptlrpc_thread *thread;
	struct ptlrpc_service *svc;
	struct task_struct *task;
	int rc;

	svc = svcpt->scp_service;

	CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n",
	       svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running,
	       svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit);

again:
	if (unlikely(svc->srv_is_stopping))
		return -ESRCH;

	/* unlocked pre-check; re-checked under scp_lock below */
	if (!ptlrpc_threads_increasable(svcpt) ||
	    (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) &&
	     svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1))
		return -EMFILE;

	/* allocate on a node near this partition's CPT */
	thread = kzalloc_node(sizeof(*thread), GFP_NOFS,
			      cfs_cpt_spread_node(svc->srv_cptable,
						  svcpt->scp_cpt));
	if (!thread)
		return -ENOMEM;
	init_waitqueue_head(&thread->t_ctl_waitq);

	spin_lock(&svcpt->scp_lock);
	if (!ptlrpc_threads_increasable(svcpt)) {
		spin_unlock(&svcpt->scp_lock);
		kfree(thread);
		return -EMFILE;
	}

	if (svcpt->scp_nthrs_starting != 0) {
		/* serialize starting because some modules (obdfilter)
		 * might require unique and contiguous t_id
		 */
		LASSERT(svcpt->scp_nthrs_starting == 1);
		spin_unlock(&svcpt->scp_lock);
		kfree(thread);
		if (wait) {
			CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n",
			       svc->srv_thread_name, svcpt->scp_thr_nextid);
			schedule();
			goto again;
		}

		CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n",
		       svc->srv_thread_name, svcpt->scp_thr_nextid);
		return -EAGAIN;
	}

	svcpt->scp_nthrs_starting++;
	thread->t_id = svcpt->scp_thr_nextid++;
	thread_add_flags(thread, SVC_STARTING);
	thread->t_svcpt = svcpt;

	list_add(&thread->t_link, &svcpt->scp_threads);
	spin_unlock(&svcpt->scp_lock);

	if (svcpt->scp_cpt >= 0) {
		snprintf(thread->t_name, sizeof(thread->t_name), "%s%02d_%03d",
			 svc->srv_thread_name, svcpt->scp_cpt, thread->t_id);
	} else {
		snprintf(thread->t_name, sizeof(thread->t_name), "%s_%04d",
			 svc->srv_thread_name, thread->t_id);
	}

	CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name);
	task = kthread_run(ptlrpc_main, thread, "%s", thread->t_name);
	if (IS_ERR(task)) {
		rc = PTR_ERR(task);
		CERROR("cannot start thread '%s': rc = %d\n",
		       thread->t_name, rc);
		spin_lock(&svcpt->scp_lock);
		--svcpt->scp_nthrs_starting;
		if (thread_is_stopping(thread)) {
			/* this ptlrpc_thread is being handled
			 * by ptlrpc_svcpt_stop_threads now
			 */
			thread_add_flags(thread, SVC_STOPPED);
			wake_up(&thread->t_ctl_waitq);
			spin_unlock(&svcpt->scp_lock);
		} else {
			list_del(&thread->t_link);
			spin_unlock(&svcpt->scp_lock);
			kfree(thread);
		}
		return rc;
	}

	if (!wait)
		return 0;

	/* wait for ptlrpc_main to set SVC_RUNNING (or SVC_STOPPED on an
	 * early failure; t_id then carries its exit code)
	 */
	l_wait_event(thread->t_ctl_waitq,
		     thread_is_running(thread) || thread_is_stopped(thread),
		     &lwi);

	rc = thread_is_stopped(thread) ? thread->t_id : 0;
	return rc;
}
2509 | ||
2510 | int ptlrpc_hr_init(void) | |
2511 | { | |
d0bfef31 CH |
2512 | struct ptlrpc_hr_partition *hrp; |
2513 | struct ptlrpc_hr_thread *hrt; | |
2514 | int rc; | |
2515 | int i; | |
2516 | int j; | |
2517 | int weight; | |
d7e09d03 PT |
2518 | |
2519 | memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); | |
2520 | ptlrpc_hr.hr_cpt_table = cfs_cpt_table; | |
2521 | ||
2522 | ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, | |
2523 | sizeof(*hrp)); | |
8b382089 | 2524 | if (!ptlrpc_hr.hr_partitions) |
0a3bdb00 | 2525 | return -ENOMEM; |
d7e09d03 PT |
2526 | |
2527 | init_waitqueue_head(&ptlrpc_hr.hr_waitq); | |
2528 | ||
06931e62 | 2529 | weight = cpumask_weight(topology_sibling_cpumask(0)); |
3867ea5a | 2530 | |
d7e09d03 PT |
2531 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { |
2532 | hrp->hrp_cpt = i; | |
2533 | ||
2534 | atomic_set(&hrp->hrp_nstarted, 0); | |
2535 | atomic_set(&hrp->hrp_nstopped, 0); | |
2536 | ||
2537 | hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i); | |
3867ea5a | 2538 | hrp->hrp_nthrs /= weight; |
9cc58f1e AS |
2539 | if (hrp->hrp_nthrs == 0) |
2540 | hrp->hrp_nthrs = 1; | |
d7e09d03 | 2541 | |
bae97e81 JL |
2542 | hrp->hrp_thrs = |
2543 | kzalloc_node(hrp->hrp_nthrs * sizeof(*hrt), GFP_NOFS, | |
24c198e9 OD |
2544 | cfs_cpt_spread_node(ptlrpc_hr.hr_cpt_table, |
2545 | i)); | |
8b382089 | 2546 | if (!hrp->hrp_thrs) { |
a9b3e8f3 JL |
2547 | rc = -ENOMEM; |
2548 | goto out; | |
2549 | } | |
d7e09d03 PT |
2550 | |
2551 | for (j = 0; j < hrp->hrp_nthrs; j++) { | |
2552 | hrt = &hrp->hrp_thrs[j]; | |
2553 | ||
2554 | hrt->hrt_id = j; | |
2555 | hrt->hrt_partition = hrp; | |
2556 | init_waitqueue_head(&hrt->hrt_waitq); | |
2557 | spin_lock_init(&hrt->hrt_lock); | |
2558 | INIT_LIST_HEAD(&hrt->hrt_queue); | |
2559 | } | |
2560 | } | |
2561 | ||
2562 | rc = ptlrpc_start_hr_threads(); | |
2563 | out: | |
2564 | if (rc != 0) | |
2565 | ptlrpc_hr_fini(); | |
0a3bdb00 | 2566 | return rc; |
d7e09d03 PT |
2567 | } |
2568 | ||
2569 | void ptlrpc_hr_fini(void) | |
2570 | { | |
d0bfef31 CH |
2571 | struct ptlrpc_hr_partition *hrp; |
2572 | int i; | |
d7e09d03 | 2573 | |
8b382089 | 2574 | if (!ptlrpc_hr.hr_partitions) |
d7e09d03 PT |
2575 | return; |
2576 | ||
2577 | ptlrpc_stop_hr_threads(); | |
2578 | ||
2579 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
a5cb8880 | 2580 | kfree(hrp->hrp_thrs); |
d7e09d03 PT |
2581 | } |
2582 | ||
2583 | cfs_percpt_free(ptlrpc_hr.hr_partitions); | |
2584 | ptlrpc_hr.hr_partitions = NULL; | |
2585 | } | |
2586 | ||
d7e09d03 PT |
2587 | /** |
2588 | * Wait until all already scheduled replies are processed. | |
2589 | */ | |
2590 | static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) | |
2591 | { | |
2592 | while (1) { | |
2593 | int rc; | |
2594 | struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10), | |
2595 | NULL, NULL); | |
2596 | ||
2597 | rc = l_wait_event(svcpt->scp_waitq, | |
24c198e9 OD |
2598 | atomic_read(&svcpt->scp_nreps_difficult) == 0, |
2599 | &lwi); | |
d7e09d03 PT |
2600 | if (rc == 0) |
2601 | break; | |
2602 | CWARN("Unexpectedly long timeout %s %p\n", | |
2603 | svcpt->scp_service->srv_name, svcpt->scp_service); | |
2604 | } | |
2605 | } | |
2606 | ||
2607 | static void | |
2608 | ptlrpc_service_del_atimer(struct ptlrpc_service *svc) | |
2609 | { | |
d0bfef31 CH |
2610 | struct ptlrpc_service_part *svcpt; |
2611 | int i; | |
d7e09d03 PT |
2612 | |
2613 | /* early disarm AT timer... */ | |
2614 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
8b382089 | 2615 | if (svcpt->scp_service) |
922da0c5 | 2616 | del_timer(&svcpt->scp_at_timer); |
d7e09d03 PT |
2617 | } |
2618 | } | |
2619 | ||
/*
 * Unlink every posted request buffer descriptor (rqbd) of \a svc from
 * LNet and wait until the network has released them all.  Called during
 * service shutdown, after the service threads have been stopped but
 * before the buffers themselves are freed in ptlrpc_service_purge_all().
 */
static void
ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc)
{
	struct ptlrpc_service_part *svcpt;
	struct ptlrpc_request_buffer_desc *rqbd;
	struct l_wait_info lwi;
	int rc;
	int i;

	/* All history will be culled when the next request buffer is
	 * freed in ptlrpc_service_purge_all()
	 */
	svc->srv_hist_nrqbds_cpt_max = 0;

	rc = LNetClearLazyPortal(svc->srv_req_portal);
	LASSERT(rc == 0);

	/* First pass: issue the unlinks on every partition... */
	ptlrpc_service_for_each_part(svcpt, i, svc) {
		/* partitions are initialized in order; the first
		 * uninitialized one means none follow
		 */
		if (!svcpt->scp_service)
			break;

		/* Unlink all the request buffers. This forces a 'final'
		 * event with its 'unlink' flag set for each posted rqbd
		 */
		list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted,
				    rqbd_list) {
			rc = LNetMDUnlink(rqbd->rqbd_md_h);
			/* -ENOENT: the MD was already unlinked */
			LASSERT(rc == 0 || rc == -ENOENT);
		}
	}

	/* ...second pass: wait for scp_nrqbds_posted to drain to zero */
	ptlrpc_service_for_each_part(svcpt, i, svc) {
		if (!svcpt->scp_service)
			break;

		/* Wait for the network to release any buffers
		 * it's currently filling
		 */
		spin_lock(&svcpt->scp_lock);
		while (svcpt->scp_nrqbds_posted != 0) {
			/* drop the lock while sleeping; re-check under
			 * the lock after each wakeup
			 */
			spin_unlock(&svcpt->scp_lock);
			/* Network access will complete in finite time but
			 * the HUGE timeout lets us CWARN for visibility
			 * of sluggish LNDs
			 */
			lwi = LWI_TIMEOUT_INTERVAL(
					cfs_time_seconds(LONG_UNLINK),
					cfs_time_seconds(1), NULL, NULL);
			rc = l_wait_event(svcpt->scp_waitq,
					  svcpt->scp_nrqbds_posted == 0, &lwi);
			if (rc == -ETIMEDOUT) {
				CWARN("Service %s waiting for request buffers\n",
				      svcpt->scp_service->srv_name);
			}
			spin_lock(&svcpt->scp_lock);
		}
		spin_unlock(&svcpt->scp_lock);
	}
}
2679 | ||
/*
 * Drain and free everything still queued on \a svc: active difficult
 * replies, incoming and pending requests, idle request buffers and idle
 * reply states.  Runs single-threaded during shutdown — all rqbds have
 * been unlinked (ptlrpc_service_unlink_rqbd) and all service threads
 * stopped, so no new work can arrive concurrently.
 */
static void
ptlrpc_service_purge_all(struct ptlrpc_service *svc)
{
	struct ptlrpc_service_part *svcpt;
	struct ptlrpc_request_buffer_desc *rqbd;
	struct ptlrpc_request *req;
	struct ptlrpc_reply_state *rs;
	int i;

	ptlrpc_service_for_each_part(svcpt, i, svc) {
		/* partitions are initialized in order; stop at the first
		 * uninitialized one
		 */
		if (!svcpt->scp_service)
			break;

		/* schedule every active difficult reply for handling;
		 * ptlrpc_schedule_difficult_reply() moves it off
		 * scp_rep_active, so this loop terminates
		 */
		spin_lock(&svcpt->scp_rep_lock);
		while (!list_empty(&svcpt->scp_rep_active)) {
			rs = list_entry(svcpt->scp_rep_active.next,
					struct ptlrpc_reply_state, rs_list);
			spin_lock(&rs->rs_lock);
			ptlrpc_schedule_difficult_reply(rs);
			spin_unlock(&rs->rs_lock);
		}
		spin_unlock(&svcpt->scp_rep_lock);

		/* purge the request queue. NB No new replies (rqbds
		 * all unlinked) and no service threads, so I'm the only
		 * thread noodling the request queue now
		 */
		while (!list_empty(&svcpt->scp_req_incoming)) {
			req = list_entry(svcpt->scp_req_incoming.next,
					 struct ptlrpc_request, rq_list);

			list_del(&req->rq_list);
			svcpt->scp_nreqs_incoming--;
			ptlrpc_server_finish_request(svcpt, req);
		}

		/* then finish every request already accepted for service */
		while (ptlrpc_server_request_pending(svcpt, true)) {
			req = ptlrpc_server_request_get(svcpt, true);
			ptlrpc_server_finish_active_request(svcpt, req);
		}

		LASSERT(list_empty(&svcpt->scp_rqbd_posted));
		LASSERT(svcpt->scp_nreqs_incoming == 0);
		LASSERT(svcpt->scp_nreqs_active == 0);
		/* history should have been culled by
		 * ptlrpc_server_finish_request
		 */
		LASSERT(svcpt->scp_hist_nrqbds == 0);

		/* Now free all the request buffers since nothing
		 * references them any more...
		 */

		while (!list_empty(&svcpt->scp_rqbd_idle)) {
			rqbd = list_entry(svcpt->scp_rqbd_idle.next,
					  struct ptlrpc_request_buffer_desc,
					  rqbd_list);
			ptlrpc_free_rqbd(rqbd);
		}
		/* wait for the difficult replies scheduled above to
		 * finish before freeing the idle reply states
		 */
		ptlrpc_wait_replies(svcpt);

		while (!list_empty(&svcpt->scp_rep_idle)) {
			rs = list_entry(svcpt->scp_rep_idle.next,
					struct ptlrpc_reply_state,
					rs_list);
			list_del(&rs->rs_list);
			/* rs buffers are vmalloc'd or kmalloc'd */
			kvfree(rs);
		}
	}
}
2750 | ||
2751 | static void | |
2752 | ptlrpc_service_free(struct ptlrpc_service *svc) | |
2753 | { | |
d0bfef31 CH |
2754 | struct ptlrpc_service_part *svcpt; |
2755 | struct ptlrpc_at_array *array; | |
2756 | int i; | |
d7e09d03 PT |
2757 | |
2758 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
8b382089 | 2759 | if (!svcpt->scp_service) |
d7e09d03 PT |
2760 | break; |
2761 | ||
2762 | /* In case somebody rearmed this in the meantime */ | |
922da0c5 | 2763 | del_timer(&svcpt->scp_at_timer); |
d7e09d03 PT |
2764 | array = &svcpt->scp_at_array; |
2765 | ||
207e99c2 JL |
2766 | kfree(array->paa_reqs_array); |
2767 | array->paa_reqs_array = NULL; | |
2768 | kfree(array->paa_reqs_count); | |
2769 | array->paa_reqs_count = NULL; | |
d7e09d03 PT |
2770 | } |
2771 | ||
2772 | ptlrpc_service_for_each_part(svcpt, i, svc) | |
9ae10597 | 2773 | kfree(svcpt); |
d7e09d03 | 2774 | |
8b382089 | 2775 | if (svc->srv_cpts) |
d7e09d03 PT |
2776 | cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); |
2777 | ||
9ae10597 | 2778 | kfree(svc); |
d7e09d03 PT |
2779 | } |
2780 | ||
/*
 * Completely tear down \a service: stop its threads, drain all queued
 * work, unregister it from procfs/sysfs and free all of its memory.
 * The steps below are order-dependent — threads must stop before the
 * queues are purged, and purge must precede freeing.
 *
 * \retval 0 always.
 */
int ptlrpc_unregister_service(struct ptlrpc_service *service)
{
	CDEBUG(D_NET, "%s: tearing down\n", service->srv_name);

	/* tell the threads and timers that shutdown has begun */
	service->srv_is_stopping = 1;

	/* remove from the global service list so nobody new finds us */
	mutex_lock(&ptlrpc_all_services_mutex);
	list_del_init(&service->srv_list);
	mutex_unlock(&ptlrpc_all_services_mutex);

	ptlrpc_service_del_atimer(service);
	ptlrpc_stop_all_threads(service);

	/* detach buffers from LNet, then drain every remaining queue */
	ptlrpc_service_unlink_rqbd(service);
	ptlrpc_service_purge_all(service);
	ptlrpc_service_nrs_cleanup(service);

	ptlrpc_lprocfs_unregister_service(service);
	ptlrpc_sysfs_unregister_service(service);

	/* frees the service struct itself — last use of 'service' */
	ptlrpc_service_free(service);

	return 0;
}
EXPORT_SYMBOL(ptlrpc_unregister_service);