]>
Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2012, Intel Corporation. | |
31 | * | |
32 | */ | |
33 | /* | |
34 | * This file is part of Lustre, http://www.lustre.org/ | |
35 | * Lustre is a trademark of Sun Microsystems, Inc. | |
36 | * | |
37 | * osc cache management. | |
38 | * | |
39 | * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com> | |
40 | */ | |
41 | ||
42 | #define DEBUG_SUBSYSTEM S_OSC | |
43 | ||
44 | #include "osc_cl_internal.h" | |
45 | #include "osc_internal.h" | |
46 | ||
/* Non-zero enables the more expensive extent consistency checks. */
static int extent_debug;

/* Helpers defined later in this file. */
static void osc_update_pending(struct osc_object *obj, int cmd, int delta);
static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
			   int state);
static void osc_ap_completion(const struct lu_env *env,
			      struct client_obd *cli,
			      struct osc_async_page *oap, int sent, int rc);
static int osc_make_ready(const struct lu_env *env,
			  struct osc_async_page *oap, int cmd);
static int osc_refresh_count(const struct lu_env *env,
			     struct osc_async_page *oap, int cmd);
static int osc_io_unplug_async(const struct lu_env *env,
			       struct client_obd *cli,
			       struct osc_object *osc);
static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
			   unsigned int lost_grant);

/* Dump the extent tree of @obj, tagging the output with the call site. */
static void osc_extent_tree_dump0(int level, struct osc_object *obj,
				  const char *func, int line);
#define osc_extent_tree_dump(lvl, obj)				\
	osc_extent_tree_dump0(lvl, obj, __func__, __LINE__)
67 | ||
68 | /** \addtogroup osc | |
69 | * @{ | |
70 | */ | |
71 | ||
72 | /* ------------------ osc extent ------------------ */ | |
73 | static inline char *ext_flags(struct osc_extent *ext, char *flags) | |
74 | { | |
75 | char *buf = flags; | |
76 | *buf++ = ext->oe_rw ? 'r' : 'w'; | |
77 | if (ext->oe_intree) | |
78 | *buf++ = 'i'; | |
79 | if (ext->oe_srvlock) | |
80 | *buf++ = 's'; | |
81 | if (ext->oe_hp) | |
82 | *buf++ = 'h'; | |
83 | if (ext->oe_urgent) | |
84 | *buf++ = 'u'; | |
85 | if (ext->oe_memalloc) | |
86 | *buf++ = 'm'; | |
87 | if (ext->oe_trunc_pending) | |
88 | *buf++ = 't'; | |
89 | if (ext->oe_fsync_wait) | |
90 | *buf++ = 'Y'; | |
91 | *buf = 0; | |
92 | return flags; | |
93 | } | |
94 | ||
/* One-character marker for debug output: '-' for an empty list, else '+'. */
static inline char list_empty_marker(struct list_head *list)
{
	if (list_empty(list))
		return '-';

	return '+';
}
99 | ||
/*
 * printf-style format for an extent range and the matching argument list:
 * "[start -> end/max_end]".
 */
#define EXTSTR		"[%lu -> %lu/%lu]"
#define EXTPARA(ext)	(ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end

/*
 * Printable names of the extent states, indexed by oe_state and terminated
 * by NULL.  Both the strings and the table slots are read-only.
 */
static const char * const oes_strings[] = {
	"inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL };
d7e09d03 PT |
104 | |
/*
 * Emit one debug line describing @extent at debug level @lvl, followed by
 * the caller supplied printf-style message (@fmt plus optional arguments).
 *
 * Layout of the line:
 *   extent ptr@{[start -> end/max_end],
 *               [refc|users|link|state|flags|obj],
 *               [grants|nr_pages|pages|waitq|osclock|mppr|owner]} message
 * where link/pages/waitq are shown as '+' (non-empty/active) or '-'.
 */
#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do {			      \
	struct osc_extent *__dump_ext = (extent);			      \
	char __dump_flags[16];						      \
									      \
	CDEBUG(lvl,							      \
		"extent %p@{" EXTSTR ", "				      \
		"[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt,	      \
		/* ----- part 0: identity and range ----- */		      \
		__dump_ext, EXTPARA(__dump_ext),			      \
		/* ----- part 1: references and state ----- */		      \
		atomic_read(&__dump_ext->oe_refc),			      \
		atomic_read(&__dump_ext->oe_users),			      \
		list_empty_marker(&__dump_ext->oe_link),		      \
		oes_strings[__dump_ext->oe_state],			      \
		ext_flags(__dump_ext, __dump_flags),			      \
		__dump_ext->oe_obj,					      \
		/* ----- part 2: grants, pages and ownership ----- */	      \
		__dump_ext->oe_grants, __dump_ext->oe_nr_pages,		      \
		list_empty_marker(&__dump_ext->oe_pages),		      \
		waitqueue_active(&__dump_ext->oe_waitq) ? '+' : '-',	      \
		__dump_ext->oe_osclock, __dump_ext->oe_mppr,		      \
		__dump_ext->oe_owner,					      \
		/* ----- part 3: caller's arguments ----- */		      \
		## __VA_ARGS__);					      \
} while (0)
128 | ||
/*
 * Extent-aware assertions: when @expr is false, dump @ext (with the caller's
 * message) and the whole extent tree of its object at D_ERROR level, then
 * trip LASSERT() on the same expression.
 */
#undef EASSERTF
#define EASSERTF(expr, ext, fmt, ...) do {				\
	if (!(expr)) {							\
		OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##__VA_ARGS__);	\
		osc_extent_tree_dump(D_ERROR, (ext)->oe_obj);		\
		LASSERT(expr);						\
	}								\
} while (0)

/* Same as EASSERTF() without an additional message. */
#undef EASSERT
#define EASSERT(expr, ext)	EASSERTF(expr, ext, "\n")
140 | ||
141 | static inline struct osc_extent *rb_extent(struct rb_node *n) | |
142 | { | |
143 | if (n == NULL) | |
144 | return NULL; | |
145 | ||
146 | return container_of(n, struct osc_extent, oe_node); | |
147 | } | |
148 | ||
149 | static inline struct osc_extent *next_extent(struct osc_extent *ext) | |
150 | { | |
151 | if (ext == NULL) | |
152 | return NULL; | |
153 | ||
154 | LASSERT(ext->oe_intree); | |
155 | return rb_extent(rb_next(&ext->oe_node)); | |
156 | } | |
157 | ||
158 | static inline struct osc_extent *prev_extent(struct osc_extent *ext) | |
159 | { | |
160 | if (ext == NULL) | |
161 | return NULL; | |
162 | ||
163 | LASSERT(ext->oe_intree); | |
164 | return rb_extent(rb_prev(&ext->oe_node)); | |
165 | } | |
166 | ||
167 | static inline struct osc_extent *first_extent(struct osc_object *obj) | |
168 | { | |
169 | return rb_extent(rb_first(&obj->oo_root)); | |
170 | } | |
171 | ||
172 | /* object must be locked by caller. */ | |
173 | static int osc_extent_sanity_check0(struct osc_extent *ext, | |
174 | const char *func, const int line) | |
175 | { | |
176 | struct osc_object *obj = ext->oe_obj; | |
177 | struct osc_async_page *oap; | |
178 | int page_count; | |
179 | int rc = 0; | |
180 | ||
181 | if (!osc_object_is_locked(obj)) | |
182 | GOTO(out, rc = 9); | |
183 | ||
184 | if (ext->oe_state >= OES_STATE_MAX) | |
185 | GOTO(out, rc = 10); | |
186 | ||
187 | if (atomic_read(&ext->oe_refc) <= 0) | |
188 | GOTO(out, rc = 20); | |
189 | ||
190 | if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) | |
191 | GOTO(out, rc = 30); | |
192 | ||
193 | switch (ext->oe_state) { | |
194 | case OES_INV: | |
195 | if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) | |
196 | GOTO(out, rc = 35); | |
197 | GOTO(out, rc = 0); | |
198 | break; | |
199 | case OES_ACTIVE: | |
200 | if (atomic_read(&ext->oe_users) == 0) | |
201 | GOTO(out, rc = 40); | |
202 | if (ext->oe_hp) | |
203 | GOTO(out, rc = 50); | |
204 | if (ext->oe_fsync_wait && !ext->oe_urgent) | |
205 | GOTO(out, rc = 55); | |
206 | break; | |
207 | case OES_CACHE: | |
208 | if (ext->oe_grants == 0) | |
209 | GOTO(out, rc = 60); | |
210 | if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) | |
211 | GOTO(out, rc = 65); | |
212 | default: | |
213 | if (atomic_read(&ext->oe_users) > 0) | |
214 | GOTO(out, rc = 70); | |
215 | } | |
216 | ||
217 | if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) | |
218 | GOTO(out, rc = 80); | |
219 | ||
220 | if (ext->oe_osclock == NULL && ext->oe_grants > 0) | |
221 | GOTO(out, rc = 90); | |
222 | ||
223 | if (ext->oe_osclock) { | |
224 | struct cl_lock_descr *descr; | |
225 | descr = &ext->oe_osclock->cll_descr; | |
226 | if (!(descr->cld_start <= ext->oe_start && | |
227 | descr->cld_end >= ext->oe_max_end)) | |
228 | GOTO(out, rc = 100); | |
229 | } | |
230 | ||
231 | if (ext->oe_nr_pages > ext->oe_mppr) | |
232 | GOTO(out, rc = 105); | |
233 | ||
234 | /* Do not verify page list if extent is in RPC. This is because an | |
235 | * in-RPC extent is supposed to be exclusively accessible w/o lock. */ | |
236 | if (ext->oe_state > OES_CACHE) | |
237 | GOTO(out, rc = 0); | |
238 | ||
239 | if (!extent_debug) | |
240 | GOTO(out, rc = 0); | |
241 | ||
242 | page_count = 0; | |
243 | list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { | |
244 | pgoff_t index = oap2cl_page(oap)->cp_index; | |
245 | ++page_count; | |
246 | if (index > ext->oe_end || index < ext->oe_start) | |
247 | GOTO(out, rc = 110); | |
248 | } | |
249 | if (page_count != ext->oe_nr_pages) | |
250 | GOTO(out, rc = 120); | |
251 | ||
252 | out: | |
253 | if (rc != 0) | |
254 | OSC_EXTENT_DUMP(D_ERROR, ext, | |
255 | "%s:%d sanity check %p failed with rc = %d\n", | |
256 | func, line, ext, rc); | |
257 | return rc; | |
258 | } | |
259 | ||
/* Sanity check @ext; the caller already holds the object lock. */
#define sanity_check_nolock(ext) \
	osc_extent_sanity_check0(ext, __func__, __LINE__)

/*
 * Sanity check @ext, taking and dropping the object lock around the check.
 * Evaluates to the result of the check.  @ext is evaluated exactly once so
 * that an argument with side effects is safe.
 */
#define sanity_check(ext) ({						\
	struct osc_extent *__sc_ext = (ext);				\
	int __sc_res;							\
									\
	osc_object_lock(__sc_ext->oe_obj);				\
	__sc_res = sanity_check_nolock(__sc_ext);			\
	osc_object_unlock(__sc_ext->oe_obj);				\
	__sc_res;							\
})
270 | ||
271 | ||
272 | /** | |
273 | * sanity check - to make sure there is no overlapped extent in the tree. | |
274 | */ | |
275 | static int osc_extent_is_overlapped(struct osc_object *obj, | |
276 | struct osc_extent *ext) | |
277 | { | |
278 | struct osc_extent *tmp; | |
279 | ||
280 | LASSERT(osc_object_is_locked(obj)); | |
281 | ||
282 | if (!extent_debug) | |
283 | return 0; | |
284 | ||
285 | for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { | |
286 | if (tmp == ext) | |
287 | continue; | |
288 | if (tmp->oe_end >= ext->oe_start && | |
289 | tmp->oe_start <= ext->oe_end) | |
290 | return 1; | |
291 | } | |
292 | return 0; | |
293 | } | |
294 | ||
295 | static void osc_extent_state_set(struct osc_extent *ext, int state) | |
296 | { | |
297 | LASSERT(osc_object_is_locked(ext->oe_obj)); | |
298 | LASSERT(state >= OES_INV && state < OES_STATE_MAX); | |
299 | ||
300 | /* Never try to sanity check a state changing extent :-) */ | |
301 | /* LASSERT(sanity_check_nolock(ext) == 0); */ | |
302 | ||
303 | /* TODO: validate the state machine */ | |
304 | ext->oe_state = state; | |
305 | wake_up_all(&ext->oe_waitq); | |
306 | } | |
307 | ||
308 | static struct osc_extent *osc_extent_alloc(struct osc_object *obj) | |
309 | { | |
310 | struct osc_extent *ext; | |
311 | ||
312 | OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS); | |
313 | if (ext == NULL) | |
314 | return NULL; | |
315 | ||
316 | RB_CLEAR_NODE(&ext->oe_node); | |
317 | ext->oe_obj = obj; | |
318 | atomic_set(&ext->oe_refc, 1); | |
319 | atomic_set(&ext->oe_users, 0); | |
320 | INIT_LIST_HEAD(&ext->oe_link); | |
321 | ext->oe_state = OES_INV; | |
322 | INIT_LIST_HEAD(&ext->oe_pages); | |
323 | init_waitqueue_head(&ext->oe_waitq); | |
324 | ext->oe_osclock = NULL; | |
325 | ||
326 | return ext; | |
327 | } | |
328 | ||
329 | static void osc_extent_free(struct osc_extent *ext) | |
330 | { | |
331 | OBD_SLAB_FREE_PTR(ext, osc_extent_kmem); | |
332 | } | |
333 | ||
334 | static struct osc_extent *osc_extent_get(struct osc_extent *ext) | |
335 | { | |
336 | LASSERT(atomic_read(&ext->oe_refc) >= 0); | |
337 | atomic_inc(&ext->oe_refc); | |
338 | return ext; | |
339 | } | |
340 | ||
341 | static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) | |
342 | { | |
343 | LASSERT(atomic_read(&ext->oe_refc) > 0); | |
344 | if (atomic_dec_and_test(&ext->oe_refc)) { | |
345 | LASSERT(list_empty(&ext->oe_link)); | |
346 | LASSERT(atomic_read(&ext->oe_users) == 0); | |
347 | LASSERT(ext->oe_state == OES_INV); | |
348 | LASSERT(!ext->oe_intree); | |
349 | ||
350 | if (ext->oe_osclock) { | |
351 | cl_lock_put(env, ext->oe_osclock); | |
352 | ext->oe_osclock = NULL; | |
353 | } | |
354 | osc_extent_free(ext); | |
355 | } | |
356 | } | |
357 | ||
358 | /** | |
359 | * osc_extent_put_trust() is a special version of osc_extent_put() when | |
360 | * it's known that the caller is not the last user. This is to address the | |
361 | * problem of lacking of lu_env ;-). | |
362 | */ | |
363 | static void osc_extent_put_trust(struct osc_extent *ext) | |
364 | { | |
365 | LASSERT(atomic_read(&ext->oe_refc) > 1); | |
366 | LASSERT(osc_object_is_locked(ext->oe_obj)); | |
367 | atomic_dec(&ext->oe_refc); | |
368 | } | |
369 | ||
370 | /** | |
371 | * Return the extent which includes pgoff @index, or return the greatest | |
372 | * previous extent in the tree. | |
373 | */ | |
374 | static struct osc_extent *osc_extent_search(struct osc_object *obj, | |
375 | pgoff_t index) | |
376 | { | |
377 | struct rb_node *n = obj->oo_root.rb_node; | |
378 | struct osc_extent *tmp, *p = NULL; | |
379 | ||
380 | LASSERT(osc_object_is_locked(obj)); | |
381 | while (n != NULL) { | |
382 | tmp = rb_extent(n); | |
383 | if (index < tmp->oe_start) { | |
384 | n = n->rb_left; | |
385 | } else if (index > tmp->oe_end) { | |
386 | p = rb_extent(n); | |
387 | n = n->rb_right; | |
388 | } else { | |
389 | return tmp; | |
390 | } | |
391 | } | |
392 | return p; | |
393 | } | |
394 | ||
395 | /* | |
396 | * Return the extent covering @index, otherwise return NULL. | |
397 | * caller must have held object lock. | |
398 | */ | |
399 | static struct osc_extent *osc_extent_lookup(struct osc_object *obj, | |
400 | pgoff_t index) | |
401 | { | |
402 | struct osc_extent *ext; | |
403 | ||
404 | ext = osc_extent_search(obj, index); | |
405 | if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end) | |
406 | return osc_extent_get(ext); | |
407 | return NULL; | |
408 | } | |
409 | ||
410 | /* caller must have held object lock. */ | |
411 | static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) | |
412 | { | |
413 | struct rb_node **n = &obj->oo_root.rb_node; | |
414 | struct rb_node *parent = NULL; | |
415 | struct osc_extent *tmp; | |
416 | ||
417 | LASSERT(ext->oe_intree == 0); | |
418 | LASSERT(ext->oe_obj == obj); | |
419 | LASSERT(osc_object_is_locked(obj)); | |
420 | while (*n != NULL) { | |
421 | tmp = rb_extent(*n); | |
422 | parent = *n; | |
423 | ||
424 | if (ext->oe_end < tmp->oe_start) | |
425 | n = &(*n)->rb_left; | |
426 | else if (ext->oe_start > tmp->oe_end) | |
427 | n = &(*n)->rb_right; | |
428 | else | |
429 | EASSERTF(0, tmp, EXTSTR, EXTPARA(ext)); | |
430 | } | |
431 | rb_link_node(&ext->oe_node, parent, n); | |
432 | rb_insert_color(&ext->oe_node, &obj->oo_root); | |
433 | osc_extent_get(ext); | |
434 | ext->oe_intree = 1; | |
435 | } | |
436 | ||
437 | /* caller must have held object lock. */ | |
438 | static void osc_extent_erase(struct osc_extent *ext) | |
439 | { | |
440 | struct osc_object *obj = ext->oe_obj; | |
441 | LASSERT(osc_object_is_locked(obj)); | |
442 | if (ext->oe_intree) { | |
443 | rb_erase(&ext->oe_node, &obj->oo_root); | |
444 | ext->oe_intree = 0; | |
445 | /* rbtree held a refcount */ | |
446 | osc_extent_put_trust(ext); | |
447 | } | |
448 | } | |
449 | ||
450 | static struct osc_extent *osc_extent_hold(struct osc_extent *ext) | |
451 | { | |
452 | struct osc_object *obj = ext->oe_obj; | |
453 | ||
454 | LASSERT(osc_object_is_locked(obj)); | |
455 | LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); | |
456 | if (ext->oe_state == OES_CACHE) { | |
457 | osc_extent_state_set(ext, OES_ACTIVE); | |
458 | osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); | |
459 | } | |
460 | atomic_inc(&ext->oe_users); | |
461 | list_del_init(&ext->oe_link); | |
462 | return osc_extent_get(ext); | |
463 | } | |
464 | ||
465 | static void __osc_extent_remove(struct osc_extent *ext) | |
466 | { | |
467 | LASSERT(osc_object_is_locked(ext->oe_obj)); | |
468 | LASSERT(list_empty(&ext->oe_pages)); | |
469 | osc_extent_erase(ext); | |
470 | list_del_init(&ext->oe_link); | |
471 | osc_extent_state_set(ext, OES_INV); | |
472 | OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); | |
473 | } | |
474 | ||
475 | static void osc_extent_remove(struct osc_extent *ext) | |
476 | { | |
477 | struct osc_object *obj = ext->oe_obj; | |
478 | ||
479 | osc_object_lock(obj); | |
480 | __osc_extent_remove(ext); | |
481 | osc_object_unlock(obj); | |
482 | } | |
483 | ||
484 | /** | |
485 | * This function is used to merge extents to get better performance. It checks | |
486 | * if @cur and @victim are contiguous at chunk level. | |
487 | */ | |
488 | static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, | |
489 | struct osc_extent *victim) | |
490 | { | |
491 | struct osc_object *obj = cur->oe_obj; | |
492 | pgoff_t chunk_start; | |
493 | pgoff_t chunk_end; | |
494 | int ppc_bits; | |
495 | ||
496 | LASSERT(cur->oe_state == OES_CACHE); | |
497 | LASSERT(osc_object_is_locked(obj)); | |
498 | if (victim == NULL) | |
499 | return -EINVAL; | |
500 | ||
501 | if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait) | |
502 | return -EBUSY; | |
503 | ||
504 | if (cur->oe_max_end != victim->oe_max_end) | |
505 | return -ERANGE; | |
506 | ||
507 | LASSERT(cur->oe_osclock == victim->oe_osclock); | |
508 | ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT; | |
509 | chunk_start = cur->oe_start >> ppc_bits; | |
510 | chunk_end = cur->oe_end >> ppc_bits; | |
511 | if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && | |
512 | chunk_end + 1 != victim->oe_start >> ppc_bits) | |
513 | return -ERANGE; | |
514 | ||
515 | OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); | |
516 | ||
517 | cur->oe_start = min(cur->oe_start, victim->oe_start); | |
518 | cur->oe_end = max(cur->oe_end, victim->oe_end); | |
519 | cur->oe_grants += victim->oe_grants; | |
520 | cur->oe_nr_pages += victim->oe_nr_pages; | |
521 | /* only the following bits are needed to merge */ | |
522 | cur->oe_urgent |= victim->oe_urgent; | |
523 | cur->oe_memalloc |= victim->oe_memalloc; | |
524 | list_splice_init(&victim->oe_pages, &cur->oe_pages); | |
525 | list_del_init(&victim->oe_link); | |
526 | victim->oe_nr_pages = 0; | |
527 | ||
528 | osc_extent_get(victim); | |
529 | __osc_extent_remove(victim); | |
530 | osc_extent_put(env, victim); | |
531 | ||
532 | OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); | |
533 | return 0; | |
534 | } | |
535 | ||
536 | /** | |
537 | * Drop user count of osc_extent, and unplug IO asynchronously. | |
538 | */ | |
539 | int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) | |
540 | { | |
541 | struct osc_object *obj = ext->oe_obj; | |
542 | int rc = 0; | |
d7e09d03 PT |
543 | |
544 | LASSERT(atomic_read(&ext->oe_users) > 0); | |
545 | LASSERT(sanity_check(ext) == 0); | |
546 | LASSERT(ext->oe_grants > 0); | |
547 | ||
548 | if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { | |
549 | LASSERT(ext->oe_state == OES_ACTIVE); | |
550 | if (ext->oe_trunc_pending) { | |
551 | /* a truncate process is waiting for this extent. | |
552 | * This may happen due to a race, check | |
553 | * osc_cache_truncate_start(). */ | |
554 | osc_extent_state_set(ext, OES_TRUNC); | |
555 | ext->oe_trunc_pending = 0; | |
556 | } else { | |
557 | osc_extent_state_set(ext, OES_CACHE); | |
558 | osc_update_pending(obj, OBD_BRW_WRITE, | |
559 | ext->oe_nr_pages); | |
560 | ||
561 | /* try to merge the previous and next extent. */ | |
562 | osc_extent_merge(env, ext, prev_extent(ext)); | |
563 | osc_extent_merge(env, ext, next_extent(ext)); | |
564 | ||
565 | if (ext->oe_urgent) | |
566 | list_move_tail(&ext->oe_link, | |
567 | &obj->oo_urgent_exts); | |
568 | } | |
569 | osc_object_unlock(obj); | |
570 | ||
571 | osc_io_unplug_async(env, osc_cli(obj), obj); | |
572 | } | |
573 | osc_extent_put(env, ext); | |
0a3bdb00 | 574 | return rc; |
d7e09d03 PT |
575 | } |
576 | ||
577 | static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) | |
578 | { | |
579 | return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); | |
580 | } | |
581 | ||
582 | /** | |
583 | * Find or create an extent which includes @index, core function to manage | |
584 | * extent tree. | |
 | * | |
 | * A candidate extent covering the chunk that contains @index is allocated | |
 | * first; it is clipped to the start of the covering lock and to max_end, | |
 | * the RPC-aligned end that is itself limited by the end of the lock. | |
 | * | |
 | * The tree is then scanned under the object lock: | |
 | * - an extent under the same lock that overlaps the candidate is held and | |
 | *   returned, unless its state is beyond OES_CACHE or somebody waits for | |
 | *   its fsync; in that case the function waits for the extent to become | |
 | *   OES_INV and restarts the scan; | |
 | * - an OES_CACHE extent under the same lock with the same oe_max_end that | |
 | *   is adjacent at chunk level is grown by one chunk (front or rear | |
 | *   merge), held and returned; | |
 | * - otherwise the candidate itself is inserted into the tree. | |
 | * | |
 | * @grants is in/out: on entry it must be at least chunksize plus | |
 | * cl_extent_tax (asserted). It is reduced by chunksize when an existing | |
 | * extent is grown, and by chunksize plus cl_extent_tax when a new extent | |
 | * is created; cl_extent_tax is added back when a rear merge also absorbs | |
 | * the following extent. | |
 | * | |
 | * Returns the extent held via osc_extent_hold(), ERR_PTR(-ENOMEM) if the | |
 | * candidate cannot be allocated, or ERR_PTR(rc) if waiting for a | |
 | * conflicting extent fails with rc < 0. | |
585 | */ | |
586 | struct osc_extent *osc_extent_find(const struct lu_env *env, | |
587 | struct osc_object *obj, pgoff_t index, | |
588 | int *grants) | |
589 | ||
590 | { | |
591 | struct client_obd *cli = osc_cli(obj); | |
592 | struct cl_lock *lock; | |
593 | struct osc_extent *cur; | |
594 | struct osc_extent *ext; | |
595 | struct osc_extent *conflict = NULL; | |
596 | struct osc_extent *found = NULL; | |
597 | pgoff_t chunk; | |
598 | pgoff_t max_end; | |
599 | int max_pages; /* max_pages_per_rpc */ | |
600 | int chunksize; | |
601 | int ppc_bits; /* pages per chunk bits */ | |
602 | int chunk_mask; | |
603 | int rc; | |
d7e09d03 PT |
604 | |
605 | cur = osc_extent_alloc(obj); | |
606 | if (cur == NULL) | |
0a3bdb00 | 607 | return ERR_PTR(-ENOMEM); |
d7e09d03 PT |
608 | |
609 | lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0); | |
610 | LASSERT(lock != NULL); | |
611 | LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); | |
612 | ||
613 | LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT); | |
614 | ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; | |
615 | chunk_mask = ~((1 << ppc_bits) - 1); | |
616 | chunksize = 1 << cli->cl_chunkbits; | |
617 | chunk = index >> ppc_bits; | |
618 | ||
619 | /* align end to rpc edge, rpc size may not be a power of 2 integer. */ | |
620 | max_pages = cli->cl_max_pages_per_rpc; | |
621 | LASSERT((max_pages & ~chunk_mask) == 0); | |
622 | max_end = index - (index % max_pages) + max_pages - 1; | |
623 | max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end); | |
624 | ||
625 | /* initialize new extent by parameters so far */ | |
626 | cur->oe_max_end = max_end; | |
627 | cur->oe_start = index & chunk_mask; | |
628 | cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; | |
629 | if (cur->oe_start < lock->cll_descr.cld_start) | |
630 | cur->oe_start = lock->cll_descr.cld_start; | |
631 | if (cur->oe_end > max_end) | |
632 | cur->oe_end = max_end; | |
633 | cur->oe_osclock = lock; | |
634 | cur->oe_grants = 0; | |
635 | cur->oe_mppr = max_pages; | |
636 | ||
637 | /* grants have been allocated by the caller */ | |
638 | LASSERTF(*grants >= chunksize + cli->cl_extent_tax, | |
639 | "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax); | |
640 | LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur)); | |
641 | ||
642 | restart: | |
643 | osc_object_lock(obj); | |
644 | ext = osc_extent_search(obj, cur->oe_start); | |
645 | if (ext == NULL) | |
646 | ext = first_extent(obj); | |
647 | while (ext != NULL) { | |
648 | loff_t ext_chk_start = ext->oe_start >> ppc_bits; | |
649 | loff_t ext_chk_end = ext->oe_end >> ppc_bits; | |
650 | ||
651 | LASSERT(sanity_check_nolock(ext) == 0); | |
652 | if (chunk > ext_chk_end + 1) | |
653 | break; | |
654 | ||
655 | /* if covered by different locks, no chance to match */ | |
656 | if (lock != ext->oe_osclock) { | |
657 | EASSERTF(!overlapped(ext, cur), ext, | |
658 | EXTSTR, EXTPARA(cur)); | |
659 | ||
660 | ext = next_extent(ext); | |
661 | continue; | |
662 | } | |
663 | ||
664 | /* discontiguous chunks? */ | |
665 | if (chunk + 1 < ext_chk_start) { | |
666 | ext = next_extent(ext); | |
667 | continue; | |
668 | } | |
669 | ||
670 | /* ok, from now on, ext and cur have these attrs: | |
671 | * 1. covered by the same lock | |
672 | * 2. contiguous at chunk level or overlapping. */ | |
673 | ||
674 | if (overlapped(ext, cur)) { | |
675 | /* cur is the minimum unit, so overlapping means | |
676 | * full contain. */ | |
677 | EASSERTF((ext->oe_start <= cur->oe_start && | |
678 | ext->oe_end >= cur->oe_end), | |
679 | ext, EXTSTR, EXTPARA(cur)); | |
680 | ||
681 | if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { | |
682 | /* for simplicity, we wait for this extent to | |
683 | * finish before going forward. */ | |
684 | conflict = osc_extent_get(ext); | |
685 | break; | |
686 | } | |
687 | ||
688 | found = osc_extent_hold(ext); | |
689 | break; | |
690 | } | |
691 | ||
692 | /* non-overlapped extent */ | |
693 | if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) { | |
694 | /* we can't do anything for a non OES_CACHE extent, or | |
695 | * if there is someone waiting for this extent to be | |
696 | * flushed, try next one. */ | |
697 | ext = next_extent(ext); | |
698 | continue; | |
699 | } | |
700 | ||
701 | /* check if they belong to the same rpc slot before trying to | |
702 | * merge. the extents are not overlapped and contiguous at | |
703 | * chunk level to get here. */ | |
704 | if (ext->oe_max_end != max_end) { | |
705 | /* if they don't belong to the same RPC slot or | |
706 | * max_pages_per_rpc has ever changed, do not merge. */ | |
707 | ext = next_extent(ext); | |
708 | continue; | |
709 | } | |
710 | ||
711 | /* it's required that an extent must be contiguous at chunk | |
712 | * level so that we know the whole extent is covered by grant | |
713 | * (the pages in the extent are NOT required to be contiguous). | |
714 | * Otherwise, it would be too difficult to know which | |
715 | * chunks have grants allocated. */ | |
716 | ||
717 | /* try to do front merge - extend ext's start */ | |
718 | if (chunk + 1 == ext_chk_start) { | |
719 | /* ext must be chunk size aligned */ | |
720 | EASSERT((ext->oe_start & ~chunk_mask) == 0, ext); | |
721 | ||
722 | /* pull ext's start back to cover cur */ | |
723 | ext->oe_start = cur->oe_start; | |
724 | ext->oe_grants += chunksize; | |
725 | *grants -= chunksize; | |
726 | ||
727 | found = osc_extent_hold(ext); | |
728 | } else if (chunk == ext_chk_end + 1) { | |
729 | /* rear merge */ | |
730 | ext->oe_end = cur->oe_end; | |
731 | ext->oe_grants += chunksize; | |
732 | *grants -= chunksize; | |
733 | ||
734 | /* try to merge with the next one because we just filled | |
735 | * in a gap */ | |
736 | if (osc_extent_merge(env, ext, next_extent(ext)) == 0) | |
737 | /* we can save extent tax from next extent */ | |
738 | *grants += cli->cl_extent_tax; | |
739 | ||
740 | found = osc_extent_hold(ext); | |
741 | } | |
742 | if (found != NULL) | |
743 | break; | |
744 | ||
745 | ext = next_extent(ext); | |
746 | } | |
747 | ||
748 | osc_extent_tree_dump(D_CACHE, obj); | |
749 | if (found != NULL) { | |
750 | LASSERT(conflict == NULL); | |
751 | if (!IS_ERR(found)) { | |
752 | LASSERT(found->oe_osclock == cur->oe_osclock); | |
753 | OSC_EXTENT_DUMP(D_CACHE, found, | |
754 | "found caching ext for %lu.\n", index); | |
755 | } | |
756 | } else if (conflict == NULL) { | |
757 | /* create a new extent */ | |
758 | EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur); | |
759 | cur->oe_grants = chunksize + cli->cl_extent_tax; | |
760 | *grants -= cur->oe_grants; | |
761 | LASSERT(*grants >= 0); | |
762 | ||
763 | cur->oe_state = OES_CACHE; | |
764 | found = osc_extent_hold(cur); | |
765 | osc_extent_insert(obj, cur); | |
766 | OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n", | |
767 | index, lock->cll_descr.cld_end); | |
768 | } | |
769 | osc_object_unlock(obj); | |
770 | ||
771 | if (conflict != NULL) { | |
772 | LASSERT(found == NULL); | |
773 | ||
774 | /* waiting for IO to finish. Please notice that it's impossible | |
775 | * to be an OES_TRUNC extent. */ | |
776 | rc = osc_extent_wait(env, conflict, OES_INV); | |
777 | osc_extent_put(env, conflict); | |
778 | conflict = NULL; | |
779 | if (rc < 0) | |
780 | GOTO(out, found = ERR_PTR(rc)); | |
781 | ||
782 | goto restart; | |
783 | } | |
d7e09d03 PT |
784 | |
785 | out: | |
786 | osc_extent_put(env, cur); | |
787 | LASSERT(*grants >= 0); | |
788 | return found; | |
789 | } | |
790 | ||
791 | /** | |
792 | * Called when IO is finished to an extent. | |
793 | */ | |
794 | int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, | |
795 | int sent, int rc) | |
796 | { | |
797 | struct client_obd *cli = osc_cli(ext->oe_obj); | |
798 | struct osc_async_page *oap; | |
799 | struct osc_async_page *tmp; | |
800 | int nr_pages = ext->oe_nr_pages; | |
801 | int lost_grant = 0; | |
802 | int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096; | |
803 | __u64 last_off = 0; | |
804 | int last_count = -1; | |
d7e09d03 PT |
805 | |
806 | OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); | |
807 | ||
808 | ext->oe_rc = rc ?: ext->oe_nr_pages; | |
809 | EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); | |
810 | list_for_each_entry_safe(oap, tmp, &ext->oe_pages, | |
811 | oap_pending_item) { | |
812 | list_del_init(&oap->oap_rpc_item); | |
813 | list_del_init(&oap->oap_pending_item); | |
814 | if (last_off <= oap->oap_obj_off) { | |
815 | last_off = oap->oap_obj_off; | |
816 | last_count = oap->oap_count; | |
817 | } | |
818 | ||
819 | --ext->oe_nr_pages; | |
820 | osc_ap_completion(env, cli, oap, sent, rc); | |
821 | } | |
822 | EASSERT(ext->oe_nr_pages == 0, ext); | |
823 | ||
824 | if (!sent) { | |
825 | lost_grant = ext->oe_grants; | |
826 | } else if (blocksize < PAGE_CACHE_SIZE && | |
827 | last_count != PAGE_CACHE_SIZE) { | |
828 | /* For short writes we shouldn't count parts of pages that | |
829 | * span a whole chunk on the OST side, or our accounting goes | |
830 | * wrong. Should match the code in filter_grant_check. */ | |
831 | int offset = oap->oap_page_off & ~CFS_PAGE_MASK; | |
832 | int count = oap->oap_count + (offset & (blocksize - 1)); | |
833 | int end = (offset + oap->oap_count) & (blocksize - 1); | |
834 | if (end) | |
835 | count += blocksize - end; | |
836 | ||
837 | lost_grant = PAGE_CACHE_SIZE - count; | |
838 | } | |
839 | if (ext->oe_grants > 0) | |
840 | osc_free_grant(cli, nr_pages, lost_grant); | |
841 | ||
842 | osc_extent_remove(ext); | |
843 | /* put the refcount for RPC */ | |
844 | osc_extent_put(env, ext); | |
0a3bdb00 | 845 | return 0; |
d7e09d03 PT |
846 | } |
847 | ||
848 | static int extent_wait_cb(struct osc_extent *ext, int state) | |
849 | { | |
850 | int ret; | |
851 | ||
852 | osc_object_lock(ext->oe_obj); | |
853 | ret = ext->oe_state == state; | |
854 | osc_object_unlock(ext->oe_obj); | |
855 | ||
856 | return ret; | |
857 | } | |
858 | ||
859 | /** | |
860 | * Wait for the extent's state to become @state. | |
861 | */ | |
862 | static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, | |
863 | int state) | |
864 | { | |
865 | struct osc_object *obj = ext->oe_obj; | |
866 | struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL, | |
867 | LWI_ON_SIGNAL_NOOP, NULL); | |
868 | int rc = 0; | |
d7e09d03 PT |
869 | |
870 | osc_object_lock(obj); | |
871 | LASSERT(sanity_check_nolock(ext) == 0); | |
872 | /* `Kick' this extent only if the caller is waiting for it to be | |
873 | * written out. */ | |
ce248d59 AS |
874 | if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp && |
875 | !ext->oe_trunc_pending) { | |
d7e09d03 PT |
876 | if (ext->oe_state == OES_ACTIVE) { |
877 | ext->oe_urgent = 1; | |
878 | } else if (ext->oe_state == OES_CACHE) { | |
879 | ext->oe_urgent = 1; | |
880 | osc_extent_hold(ext); | |
881 | rc = 1; | |
882 | } | |
883 | } | |
884 | osc_object_unlock(obj); | |
885 | if (rc == 1) | |
886 | osc_extent_release(env, ext); | |
887 | ||
888 | /* wait for the extent until its state becomes @state */ | |
889 | rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi); | |
890 | if (rc == -ETIMEDOUT) { | |
891 | OSC_EXTENT_DUMP(D_ERROR, ext, | |
892 | "%s: wait ext to %d timedout, recovery in progress?\n", | |
893 | osc_export(obj)->exp_obd->obd_name, state); | |
894 | ||
895 | lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); | |
896 | rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), | |
897 | &lwi); | |
898 | } | |
899 | if (rc == 0 && ext->oe_rc < 0) | |
900 | rc = ext->oe_rc; | |
0a3bdb00 | 901 | return rc; |
d7e09d03 PT |
902 | } |
903 | ||
904 | /** | |
905 | * Discard pages with index greater than @size. If @ext is overlapped with | |
906 | * @size, then partial truncate happens. | |
907 | */ | |
908 | static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, | |
909 | bool partial) | |
910 | { | |
911 | struct cl_env_nest nest; | |
912 | struct lu_env *env; | |
913 | struct cl_io *io; | |
914 | struct osc_object *obj = ext->oe_obj; | |
915 | struct client_obd *cli = osc_cli(obj); | |
916 | struct osc_async_page *oap; | |
917 | struct osc_async_page *tmp; | |
918 | int pages_in_chunk = 0; | |
919 | int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; | |
920 | __u64 trunc_chunk = trunc_index >> ppc_bits; | |
921 | int grants = 0; | |
922 | int nr_pages = 0; | |
923 | int rc = 0; | |
d7e09d03 PT |
924 | |
925 | LASSERT(sanity_check(ext) == 0); | |
ce248d59 AS |
926 | EASSERT(ext->oe_state == OES_TRUNC, ext); |
927 | EASSERT(!ext->oe_urgent, ext); | |
d7e09d03 PT |
928 | |
929 | /* Request new lu_env. | |
930 | * We can't use that env from osc_cache_truncate_start() because | |
931 | * it's from lov_io_sub and not fully initialized. */ | |
932 | env = cl_env_nested_get(&nest); | |
933 | io = &osc_env_info(env)->oti_io; | |
934 | io->ci_obj = cl_object_top(osc2cl(obj)); | |
935 | rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); | |
936 | if (rc < 0) | |
937 | GOTO(out, rc); | |
938 | ||
939 | /* discard all pages with index greater then trunc_index */ | |
940 | list_for_each_entry_safe(oap, tmp, &ext->oe_pages, | |
941 | oap_pending_item) { | |
942 | struct cl_page *sub = oap2cl_page(oap); | |
943 | struct cl_page *page = cl_page_top(sub); | |
944 | ||
945 | LASSERT(list_empty(&oap->oap_rpc_item)); | |
946 | ||
947 | /* only discard the pages with their index greater than | |
948 | * trunc_index, and ... */ | |
949 | if (sub->cp_index < trunc_index || | |
950 | (sub->cp_index == trunc_index && partial)) { | |
951 | /* accounting how many pages remaining in the chunk | |
952 | * so that we can calculate grants correctly. */ | |
953 | if (sub->cp_index >> ppc_bits == trunc_chunk) | |
954 | ++pages_in_chunk; | |
955 | continue; | |
956 | } | |
957 | ||
958 | list_del_init(&oap->oap_pending_item); | |
959 | ||
960 | cl_page_get(page); | |
961 | lu_ref_add(&page->cp_reference, "truncate", current); | |
962 | ||
963 | if (cl_page_own(env, io, page) == 0) { | |
964 | cl_page_unmap(env, io, page); | |
965 | cl_page_discard(env, io, page); | |
966 | cl_page_disown(env, io, page); | |
967 | } else { | |
968 | LASSERT(page->cp_state == CPS_FREEING); | |
969 | LASSERT(0); | |
970 | } | |
971 | ||
972 | lu_ref_del(&page->cp_reference, "truncate", current); | |
973 | cl_page_put(env, page); | |
974 | ||
975 | --ext->oe_nr_pages; | |
976 | ++nr_pages; | |
977 | } | |
978 | EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial, | |
979 | ext->oe_nr_pages == 0), | |
980 | ext, "trunc_index %lu, partial %d\n", trunc_index, partial); | |
981 | ||
982 | osc_object_lock(obj); | |
983 | if (ext->oe_nr_pages == 0) { | |
984 | LASSERT(pages_in_chunk == 0); | |
985 | grants = ext->oe_grants; | |
986 | ext->oe_grants = 0; | |
987 | } else { /* calculate how many grants we can free */ | |
988 | int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk; | |
989 | pgoff_t last_index; | |
990 | ||
991 | ||
992 | /* if there is no pages in this chunk, we can also free grants | |
993 | * for the last chunk */ | |
994 | if (pages_in_chunk == 0) { | |
995 | /* if this is the 1st chunk and no pages in this chunk, | |
996 | * ext->oe_nr_pages must be zero, so we should be in | |
997 | * the other if-clause. */ | |
998 | LASSERT(trunc_chunk > 0); | |
999 | --trunc_chunk; | |
1000 | ++chunks; | |
1001 | } | |
1002 | ||
1003 | /* this is what we can free from this extent */ | |
1004 | grants = chunks << cli->cl_chunkbits; | |
1005 | ext->oe_grants -= grants; | |
1006 | last_index = ((trunc_chunk + 1) << ppc_bits) - 1; | |
1007 | ext->oe_end = min(last_index, ext->oe_max_end); | |
1008 | LASSERT(ext->oe_end >= ext->oe_start); | |
1009 | LASSERT(ext->oe_grants > 0); | |
1010 | } | |
1011 | osc_object_unlock(obj); | |
1012 | ||
1013 | if (grants > 0 || nr_pages > 0) | |
1014 | osc_free_grant(cli, nr_pages, grants); | |
1015 | ||
1016 | out: | |
1017 | cl_io_fini(env, io); | |
1018 | cl_env_nested_put(&nest, env); | |
0a3bdb00 | 1019 | return rc; |
d7e09d03 PT |
1020 | } |
1021 | ||
1022 | /** | |
1023 | * This function is used to make the extent prepared for transfer. | |
34ca8748 | 1024 | * A race with flushing page - ll_writepage() has to be handled cautiously. |
d7e09d03 PT |
1025 | */ |
1026 | static int osc_extent_make_ready(const struct lu_env *env, | |
1027 | struct osc_extent *ext) | |
1028 | { | |
1029 | struct osc_async_page *oap; | |
1030 | struct osc_async_page *last = NULL; | |
1031 | struct osc_object *obj = ext->oe_obj; | |
1032 | int page_count = 0; | |
1033 | int rc; | |
d7e09d03 PT |
1034 | |
1035 | /* we're going to grab page lock, so object lock must not be taken. */ | |
1036 | LASSERT(sanity_check(ext) == 0); | |
1037 | /* in locking state, any process should not touch this extent. */ | |
1038 | EASSERT(ext->oe_state == OES_LOCKING, ext); | |
1039 | EASSERT(ext->oe_owner != NULL, ext); | |
1040 | ||
1041 | OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n"); | |
1042 | ||
1043 | list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { | |
1044 | ++page_count; | |
1045 | if (last == NULL || last->oap_obj_off < oap->oap_obj_off) | |
1046 | last = oap; | |
1047 | ||
1048 | /* checking ASYNC_READY is race safe */ | |
1049 | if ((oap->oap_async_flags & ASYNC_READY) != 0) | |
1050 | continue; | |
1051 | ||
1052 | rc = osc_make_ready(env, oap, OBD_BRW_WRITE); | |
1053 | switch (rc) { | |
1054 | case 0: | |
1055 | spin_lock(&oap->oap_lock); | |
1056 | oap->oap_async_flags |= ASYNC_READY; | |
1057 | spin_unlock(&oap->oap_lock); | |
1058 | break; | |
1059 | case -EALREADY: | |
1060 | LASSERT((oap->oap_async_flags & ASYNC_READY) != 0); | |
1061 | break; | |
1062 | default: | |
1063 | LASSERTF(0, "unknown return code: %d\n", rc); | |
1064 | } | |
1065 | } | |
1066 | ||
1067 | LASSERT(page_count == ext->oe_nr_pages); | |
1068 | LASSERT(last != NULL); | |
1069 | /* the last page is the only one we need to refresh its count by | |
1070 | * the size of file. */ | |
1071 | if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { | |
1072 | last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); | |
1073 | LASSERT(last->oap_count > 0); | |
1074 | LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE); | |
1075 | last->oap_async_flags |= ASYNC_COUNT_STABLE; | |
1076 | } | |
1077 | ||
1078 | /* for the rest of pages, we don't need to call osf_refresh_count() | |
1079 | * because it's known they are not the last page */ | |
1080 | list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { | |
1081 | if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { | |
1082 | oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off; | |
1083 | oap->oap_async_flags |= ASYNC_COUNT_STABLE; | |
1084 | } | |
1085 | } | |
1086 | ||
1087 | osc_object_lock(obj); | |
1088 | osc_extent_state_set(ext, OES_RPC); | |
1089 | osc_object_unlock(obj); | |
1090 | /* get a refcount for RPC. */ | |
1091 | osc_extent_get(ext); | |
1092 | ||
0a3bdb00 | 1093 | return 0; |
d7e09d03 PT |
1094 | } |
1095 | ||
1096 | /** | |
1097 | * Quick and simple version of osc_extent_find(). This function is frequently | |
1098 | * called to expand the extent for the same IO. To expand the extent, the | |
1099 | * page index must be in the same or next chunk of ext->oe_end. | |
1100 | */ | |
1101 | static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants) | |
1102 | { | |
1103 | struct osc_object *obj = ext->oe_obj; | |
1104 | struct client_obd *cli = osc_cli(obj); | |
1105 | struct osc_extent *next; | |
1106 | int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; | |
1107 | pgoff_t chunk = index >> ppc_bits; | |
1108 | pgoff_t end_chunk; | |
1109 | pgoff_t end_index; | |
1110 | int chunksize = 1 << cli->cl_chunkbits; | |
1111 | int rc = 0; | |
d7e09d03 PT |
1112 | |
1113 | LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); | |
1114 | osc_object_lock(obj); | |
1115 | LASSERT(sanity_check_nolock(ext) == 0); | |
1116 | end_chunk = ext->oe_end >> ppc_bits; | |
1117 | if (chunk > end_chunk + 1) | |
1118 | GOTO(out, rc = -ERANGE); | |
1119 | ||
1120 | if (end_chunk >= chunk) | |
1121 | GOTO(out, rc = 0); | |
1122 | ||
1123 | LASSERT(end_chunk + 1 == chunk); | |
1124 | /* try to expand this extent to cover @index */ | |
1125 | end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); | |
1126 | ||
1127 | next = next_extent(ext); | |
1128 | if (next != NULL && next->oe_start <= end_index) | |
1129 | /* complex mode - overlapped with the next extent, | |
1130 | * this case will be handled by osc_extent_find() */ | |
1131 | GOTO(out, rc = -EAGAIN); | |
1132 | ||
1133 | ext->oe_end = end_index; | |
1134 | ext->oe_grants += chunksize; | |
1135 | *grants -= chunksize; | |
1136 | LASSERT(*grants >= 0); | |
1137 | EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, | |
1138 | "overlapped after expanding for %lu.\n", index); | |
d7e09d03 PT |
1139 | |
1140 | out: | |
1141 | osc_object_unlock(obj); | |
0a3bdb00 | 1142 | return rc; |
d7e09d03 PT |
1143 | } |
1144 | ||
1145 | static void osc_extent_tree_dump0(int level, struct osc_object *obj, | |
1146 | const char *func, int line) | |
1147 | { | |
1148 | struct osc_extent *ext; | |
1149 | int cnt; | |
1150 | ||
1151 | CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", | |
1152 | obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); | |
1153 | ||
1154 | /* osc_object_lock(obj); */ | |
1155 | cnt = 1; | |
1156 | for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) | |
1157 | OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); | |
1158 | ||
1159 | cnt = 1; | |
1160 | list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) | |
1161 | OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); | |
1162 | ||
1163 | cnt = 1; | |
1164 | list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) | |
1165 | OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); | |
1166 | ||
1167 | cnt = 1; | |
1168 | list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) | |
1169 | OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); | |
1170 | /* osc_object_unlock(obj); */ | |
1171 | } | |
1172 | ||
1173 | /* ------------------ osc extent end ------------------ */ | |
1174 | ||
1175 | static inline int osc_is_ready(struct osc_object *osc) | |
1176 | { | |
1177 | return !list_empty(&osc->oo_ready_item) || | |
1178 | !list_empty(&osc->oo_hp_ready_item); | |
1179 | } | |
1180 | ||
/*
 * Emit a D_CACHE message describing the queueing state of object OSC: the
 * ready flag and ready-list markers, the pending write count with the hp and
 * urgent list markers, and the pending read count with the reading list
 * marker. STR and the trailing arguments are appended printf-style.
 */
#define OSC_IO_DEBUG(OSC, STR, ...)					       \
	CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR,    \
	       (OSC), osc_is_ready(OSC),				       \
	       list_empty_marker(&(OSC)->oo_hp_ready_item),		       \
	       list_empty_marker(&(OSC)->oo_ready_item),		       \
	       atomic_read(&(OSC)->oo_nr_writes),			       \
	       list_empty_marker(&(OSC)->oo_hp_exts),			       \
	       list_empty_marker(&(OSC)->oo_urgent_exts),		       \
	       atomic_read(&(OSC)->oo_nr_reads),			       \
	       list_empty_marker(&(OSC)->oo_reading_exts),		       \
	       ##__VA_ARGS__)
1192 | ||
1193 | static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, | |
1194 | int cmd) | |
1195 | { | |
1196 | struct osc_page *opg = oap2osc_page(oap); | |
1197 | struct cl_page *page = cl_page_top(oap2cl_page(oap)); | |
1198 | int result; | |
1199 | ||
1200 | LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ | |
1201 | ||
d7e09d03 PT |
1202 | result = cl_page_make_ready(env, page, CRT_WRITE); |
1203 | if (result == 0) | |
1204 | opg->ops_submit_time = cfs_time_current(); | |
0a3bdb00 | 1205 | return result; |
d7e09d03 PT |
1206 | } |
1207 | ||
1208 | static int osc_refresh_count(const struct lu_env *env, | |
1209 | struct osc_async_page *oap, int cmd) | |
1210 | { | |
1211 | struct osc_page *opg = oap2osc_page(oap); | |
1212 | struct cl_page *page = oap2cl_page(oap); | |
1213 | struct cl_object *obj; | |
1214 | struct cl_attr *attr = &osc_env_info(env)->oti_attr; | |
1215 | ||
1216 | int result; | |
1217 | loff_t kms; | |
1218 | ||
1219 | /* readpage queues with _COUNT_STABLE, shouldn't get here. */ | |
1220 | LASSERT(!(cmd & OBD_BRW_READ)); | |
1221 | LASSERT(opg != NULL); | |
1222 | obj = opg->ops_cl.cpl_obj; | |
1223 | ||
1224 | cl_object_attr_lock(obj); | |
1225 | result = cl_object_attr_get(env, obj, attr); | |
1226 | cl_object_attr_unlock(obj); | |
1227 | if (result < 0) | |
1228 | return result; | |
1229 | kms = attr->cat_kms; | |
1230 | if (cl_offset(obj, page->cp_index) >= kms) | |
1231 | /* catch race with truncate */ | |
1232 | return 0; | |
1233 | else if (cl_offset(obj, page->cp_index + 1) > kms) | |
1234 | /* catch sub-page write at end of file */ | |
1235 | return kms % PAGE_CACHE_SIZE; | |
1236 | else | |
1237 | return PAGE_CACHE_SIZE; | |
1238 | } | |
1239 | ||
1240 | static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, | |
1241 | int cmd, int rc) | |
1242 | { | |
1243 | struct osc_page *opg = oap2osc_page(oap); | |
1244 | struct cl_page *page = cl_page_top(oap2cl_page(oap)); | |
1245 | struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); | |
1246 | enum cl_req_type crt; | |
1247 | int srvlock; | |
1248 | ||
d7e09d03 PT |
1249 | cmd &= ~OBD_BRW_NOQUOTA; |
1250 | LASSERT(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ)); | |
1251 | LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE)); | |
1252 | LASSERT(opg->ops_transfer_pinned); | |
1253 | ||
1254 | /* | |
1255 | * page->cp_req can be NULL if io submission failed before | |
1256 | * cl_req was allocated. | |
1257 | */ | |
1258 | if (page->cp_req != NULL) | |
1259 | cl_req_page_done(env, page); | |
1260 | LASSERT(page->cp_req == NULL); | |
1261 | ||
1262 | crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; | |
1263 | /* Clear opg->ops_transfer_pinned before VM lock is released. */ | |
1264 | opg->ops_transfer_pinned = 0; | |
1265 | ||
1266 | spin_lock(&obj->oo_seatbelt); | |
1267 | LASSERT(opg->ops_submitter != NULL); | |
1268 | LASSERT(!list_empty(&opg->ops_inflight)); | |
1269 | list_del_init(&opg->ops_inflight); | |
1270 | opg->ops_submitter = NULL; | |
1271 | spin_unlock(&obj->oo_seatbelt); | |
1272 | ||
1273 | opg->ops_submit_time = 0; | |
1274 | srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; | |
1275 | ||
1276 | /* statistic */ | |
1277 | if (rc == 0 && srvlock) { | |
1278 | struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; | |
1279 | struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; | |
1280 | int bytes = oap->oap_count; | |
1281 | ||
1282 | if (crt == CRT_READ) | |
1283 | stats->os_lockless_reads += bytes; | |
1284 | else | |
1285 | stats->os_lockless_writes += bytes; | |
1286 | } | |
1287 | ||
1288 | /* | |
1289 | * This has to be the last operation with the page, as locks are | |
1290 | * released in cl_page_completion() and nothing except for the | |
1291 | * reference counter protects page from concurrent reclaim. | |
1292 | */ | |
1293 | lu_ref_del(&page->cp_reference, "transfer", page); | |
1294 | ||
1295 | cl_page_completion(env, page, crt, rc); | |
1296 | ||
0a3bdb00 | 1297 | return 0; |
d7e09d03 PT |
1298 | } |
1299 | ||
/*
 * Emit a D_CACHE message with the grant and dirty accounting of @cli: dirty
 * bytes against the per-client limit, global dirty pages against the global
 * limit, lost, available and reserved grant, and writes in flight. @fmt and
 * the trailing arguments are appended printf-style. @cli is evaluated once.
 */
#define OSC_DUMP_GRANT(cli, fmt, ...) do {				      \
	struct client_obd *_cli = (cli);				      \
									      \
	CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d "	      \
	       "dropped: %ld avail: %ld, reserved: %ld, flight: %d } " fmt,  \
	       _cli->cl_import->imp_obd->obd_name,			      \
	       _cli->cl_dirty, _cli->cl_dirty_max,			      \
	       atomic_read(&obd_dirty_pages), obd_max_dirty_pages,	      \
	       _cli->cl_lost_grant, _cli->cl_avail_grant,		      \
	       _cli->cl_reserved_grant, _cli->cl_w_in_flight,		      \
	       ##__VA_ARGS__);						      \
} while (0)
1310 | ||
1311 | /* caller must hold loi_list_lock */ | |
1312 | static void osc_consume_write_grant(struct client_obd *cli, | |
1313 | struct brw_page *pga) | |
1314 | { | |
5e42bc9d | 1315 | assert_spin_locked(&cli->cl_loi_list_lock.lock); |
d7e09d03 PT |
1316 | LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); |
1317 | atomic_inc(&obd_dirty_pages); | |
1318 | cli->cl_dirty += PAGE_CACHE_SIZE; | |
1319 | pga->flag |= OBD_BRW_FROM_GRANT; | |
1320 | CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", | |
1321 | PAGE_CACHE_SIZE, pga, pga->pg); | |
1322 | osc_update_next_shrink(cli); | |
1323 | } | |
1324 | ||
1325 | /* the companion to osc_consume_write_grant, called when a brw has completed. | |
1326 | * must be called with the loi lock held. */ | |
1327 | static void osc_release_write_grant(struct client_obd *cli, | |
1328 | struct brw_page *pga) | |
1329 | { | |
5e42bc9d | 1330 | assert_spin_locked(&cli->cl_loi_list_lock.lock); |
d7e09d03 | 1331 | if (!(pga->flag & OBD_BRW_FROM_GRANT)) { |
d7e09d03 PT |
1332 | return; |
1333 | } | |
1334 | ||
1335 | pga->flag &= ~OBD_BRW_FROM_GRANT; | |
1336 | atomic_dec(&obd_dirty_pages); | |
1337 | cli->cl_dirty -= PAGE_CACHE_SIZE; | |
1338 | if (pga->flag & OBD_BRW_NOCACHE) { | |
1339 | pga->flag &= ~OBD_BRW_NOCACHE; | |
1340 | atomic_dec(&obd_dirty_transit_pages); | |
1341 | cli->cl_dirty_transit -= PAGE_CACHE_SIZE; | |
1342 | } | |
d7e09d03 PT |
1343 | } |
1344 | ||
1345 | /** | |
1346 | * To avoid sleeping with object lock held, it's good for us allocate enough | |
1347 | * grants before entering into critical section. | |
1348 | * | |
1349 | * client_obd_list_lock held by caller | |
1350 | */ | |
1351 | static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) | |
1352 | { | |
1353 | int rc = -EDQUOT; | |
1354 | ||
1355 | if (cli->cl_avail_grant >= bytes) { | |
1356 | cli->cl_avail_grant -= bytes; | |
1357 | cli->cl_reserved_grant += bytes; | |
1358 | rc = 0; | |
1359 | } | |
1360 | return rc; | |
1361 | } | |
1362 | ||
1363 | static void __osc_unreserve_grant(struct client_obd *cli, | |
1364 | unsigned int reserved, unsigned int unused) | |
1365 | { | |
1366 | /* it's quite normal for us to get more grant than reserved. | |
1367 | * Thinking about a case that two extents merged by adding a new | |
1368 | * chunk, we can save one extent tax. If extent tax is greater than | |
1369 | * one chunk, we can save more grant by adding a new chunk */ | |
1370 | cli->cl_reserved_grant -= reserved; | |
1371 | if (unused > reserved) { | |
1372 | cli->cl_avail_grant += reserved; | |
1373 | cli->cl_lost_grant += unused - reserved; | |
1374 | } else { | |
1375 | cli->cl_avail_grant += unused; | |
1376 | } | |
1377 | } | |
1378 | ||
1379 | void osc_unreserve_grant(struct client_obd *cli, | |
1380 | unsigned int reserved, unsigned int unused) | |
1381 | { | |
1382 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1383 | __osc_unreserve_grant(cli, reserved, unused); | |
1384 | if (unused > 0) | |
1385 | osc_wake_cache_waiters(cli); | |
1386 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1387 | } | |
1388 | ||
1389 | /** | |
1390 | * Free grant after IO is finished or canceled. | |
1391 | * | |
1392 | * @lost_grant is used to remember how many grants we have allocated but not | |
1393 | * used, we should return these grants to OST. There're two cases where grants | |
1394 | * can be lost: | |
1395 | * 1. truncate; | |
1396 | * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was | |
1397 | * written. In this case OST may use less chunks to serve this partial | |
1398 | * write. OSTs don't actually know the page size on the client side. so | |
1399 | * clients have to calculate lost grant by the blocksize on the OST. | |
1400 | * See filter_grant_check() for details. | |
1401 | */ | |
1402 | static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, | |
1403 | unsigned int lost_grant) | |
1404 | { | |
1405 | int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; | |
1406 | ||
1407 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1408 | atomic_sub(nr_pages, &obd_dirty_pages); | |
1409 | cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT; | |
1410 | cli->cl_lost_grant += lost_grant; | |
1411 | if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { | |
1412 | /* borrow some grant from truncate to avoid the case that | |
1413 | * truncate uses up all avail grant */ | |
1414 | cli->cl_lost_grant -= grant; | |
1415 | cli->cl_avail_grant += grant; | |
1416 | } | |
1417 | osc_wake_cache_waiters(cli); | |
1418 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1419 | CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n", | |
1420 | lost_grant, cli->cl_lost_grant, | |
1421 | cli->cl_avail_grant, cli->cl_dirty); | |
1422 | } | |
1423 | ||
1424 | /** | |
1425 | * The companion to osc_enter_cache(), called when @oap is no longer part of | |
1426 | * the dirty accounting due to error. | |
1427 | */ | |
1428 | static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) | |
1429 | { | |
1430 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1431 | osc_release_write_grant(cli, &oap->oap_brw_page); | |
1432 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1433 | } | |
1434 | ||
1435 | /** | |
1436 | * Non-blocking version of osc_enter_cache() that consumes grant only when it | |
1437 | * is available. | |
1438 | */ | |
1439 | static int osc_enter_cache_try(struct client_obd *cli, | |
1440 | struct osc_async_page *oap, | |
1441 | int bytes, int transient) | |
1442 | { | |
1443 | int rc; | |
1444 | ||
1445 | OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); | |
1446 | ||
1447 | rc = osc_reserve_grant(cli, bytes); | |
1448 | if (rc < 0) | |
1449 | return 0; | |
1450 | ||
1451 | if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max && | |
c52f69c5 | 1452 | atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) { |
d7e09d03 PT |
1453 | osc_consume_write_grant(cli, &oap->oap_brw_page); |
1454 | if (transient) { | |
1455 | cli->cl_dirty_transit += PAGE_CACHE_SIZE; | |
1456 | atomic_inc(&obd_dirty_transit_pages); | |
1457 | oap->oap_brw_flags |= OBD_BRW_NOCACHE; | |
1458 | } | |
1459 | rc = 1; | |
1460 | } else { | |
1461 | __osc_unreserve_grant(cli, bytes, bytes); | |
1462 | rc = 0; | |
1463 | } | |
1464 | return rc; | |
1465 | } | |
1466 | ||
1467 | static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) | |
1468 | { | |
1469 | int rc; | |
1470 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1471 | rc = list_empty(&ocw->ocw_entry); | |
1472 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1473 | return rc; | |
1474 | } | |
1475 | ||
1476 | /** | |
1477 | * The main entry to reserve dirty page accounting. Usually the grant reserved | |
1478 | * in this function will be freed in bulk in osc_free_grant() unless it fails | |
1479 | * to add osc cache, in that case, it will be freed in osc_exit_cache(). | |
1480 | * | |
1481 | * The process will be put into sleep if it's already run out of grant. | |
1482 | */ | |
1483 | static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli, | |
1484 | struct osc_async_page *oap, int bytes) | |
1485 | { | |
1486 | struct osc_object *osc = oap->oap_obj; | |
1487 | struct lov_oinfo *loi = osc->oo_oinfo; | |
1488 | struct osc_cache_waiter ocw; | |
1489 | struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); | |
1490 | int rc = -EDQUOT; | |
d7e09d03 PT |
1491 | |
1492 | OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); | |
1493 | ||
1494 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1495 | ||
1496 | /* force the caller to try sync io. this can jump the list | |
1497 | * of queued writes and create a discontiguous rpc stream */ | |
1498 | if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) || | |
1499 | cli->cl_dirty_max < PAGE_CACHE_SIZE || | |
1500 | cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) | |
1501 | GOTO(out, rc = -EDQUOT); | |
1502 | ||
1503 | /* Hopefully normal case - cache space and write credits available */ | |
1504 | if (osc_enter_cache_try(cli, oap, bytes, 0)) | |
1505 | GOTO(out, rc = 0); | |
1506 | ||
1507 | /* We can get here for two reasons: too many dirty pages in cache, or | |
1508 | * run out of grants. In both cases we should write dirty pages out. | |
1509 | * Adding a cache waiter will trigger urgent write-out no matter what | |
1510 | * RPC size will be. | |
1511 | * The exiting condition is no avail grants and no dirty pages caching, | |
1512 | * that really means there is no space on the OST. */ | |
1513 | init_waitqueue_head(&ocw.ocw_waitq); | |
1514 | ocw.ocw_oap = oap; | |
1515 | ocw.ocw_grant = bytes; | |
1516 | while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) { | |
1517 | list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters); | |
1518 | ocw.ocw_rc = 0; | |
1519 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1520 | ||
1521 | osc_io_unplug_async(env, cli, NULL); | |
1522 | ||
1523 | CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n", | |
1524 | cli->cl_import->imp_obd->obd_name, &ocw, oap); | |
1525 | ||
1526 | rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi); | |
1527 | ||
1528 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1529 | ||
1530 | /* l_wait_event is interrupted by signal */ | |
1531 | if (rc < 0) { | |
1532 | list_del_init(&ocw.ocw_entry); | |
1533 | GOTO(out, rc); | |
1534 | } | |
1535 | ||
1536 | LASSERT(list_empty(&ocw.ocw_entry)); | |
1537 | rc = ocw.ocw_rc; | |
1538 | ||
1539 | if (rc != -EDQUOT) | |
1540 | GOTO(out, rc); | |
1541 | if (osc_enter_cache_try(cli, oap, bytes, 0)) | |
1542 | GOTO(out, rc = 0); | |
1543 | } | |
d7e09d03 PT |
1544 | out: |
1545 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1546 | OSC_DUMP_GRANT(cli, "returned %d.\n", rc); | |
0a3bdb00 | 1547 | return rc; |
d7e09d03 PT |
1548 | } |
1549 | ||
1550 | /* caller must hold loi_list_lock */ | |
1551 | void osc_wake_cache_waiters(struct client_obd *cli) | |
1552 | { | |
1553 | struct list_head *l, *tmp; | |
1554 | struct osc_cache_waiter *ocw; | |
1555 | ||
d7e09d03 PT |
1556 | list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { |
1557 | ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); | |
1558 | list_del_init(&ocw->ocw_entry); | |
1559 | ||
1560 | ocw->ocw_rc = -EDQUOT; | |
1561 | /* we can't dirty more */ | |
c52f69c5 OD |
1562 | if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) || |
1563 | (atomic_read(&obd_dirty_pages) + 1 > | |
1564 | obd_max_dirty_pages)) { | |
d7e09d03 PT |
1565 | CDEBUG(D_CACHE, "no dirty room: dirty: %ld " |
1566 | "osc max %ld, sys max %d\n", cli->cl_dirty, | |
1567 | cli->cl_dirty_max, obd_max_dirty_pages); | |
1568 | goto wakeup; | |
1569 | } | |
1570 | ||
1571 | ocw->ocw_rc = 0; | |
1572 | if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) | |
1573 | ocw->ocw_rc = -EDQUOT; | |
1574 | ||
1575 | wakeup: | |
1576 | CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", | |
1577 | ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); | |
1578 | ||
1579 | wake_up(&ocw->ocw_waitq); | |
1580 | } | |
d7e09d03 PT |
1581 | } |
1582 | ||
1583 | static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) | |
1584 | { | |
1585 | int hprpc = !!list_empty(&osc->oo_hp_exts); | |
1586 | return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; | |
1587 | } | |
1588 | ||
1589 | /* This maintains the lists of pending pages to read/write for a given object | |
1590 | * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() | |
1591 | * to quickly find objects that are ready to send an RPC. */ | |
1592 | static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, | |
1593 | int cmd) | |
1594 | { | |
1595 | int invalid_import = 0; | |
d7e09d03 PT |
1596 | |
1597 | /* if we have an invalid import we want to drain the queued pages | |
1598 | * by forcing them through rpcs that immediately fail and complete | |
1599 | * the pages. recovery relies on this to empty the queued pages | |
1600 | * before canceling the locks and evicting down the llite pages */ | |
1601 | if ((cli->cl_import == NULL || cli->cl_import->imp_invalid)) | |
1602 | invalid_import = 1; | |
1603 | ||
1604 | if (cmd & OBD_BRW_WRITE) { | |
1605 | if (atomic_read(&osc->oo_nr_writes) == 0) | |
0a3bdb00 | 1606 | return 0; |
d7e09d03 PT |
1607 | if (invalid_import) { |
1608 | CDEBUG(D_CACHE, "invalid import forcing RPC\n"); | |
0a3bdb00 | 1609 | return 1; |
d7e09d03 PT |
1610 | } |
1611 | if (!list_empty(&osc->oo_hp_exts)) { | |
1612 | CDEBUG(D_CACHE, "high prio request forcing RPC\n"); | |
0a3bdb00 | 1613 | return 1; |
d7e09d03 PT |
1614 | } |
1615 | if (!list_empty(&osc->oo_urgent_exts)) { | |
1616 | CDEBUG(D_CACHE, "urgent request forcing RPC\n"); | |
0a3bdb00 | 1617 | return 1; |
d7e09d03 PT |
1618 | } |
1619 | /* trigger a write rpc stream as long as there are dirtiers | |
1620 | * waiting for space. as they're waiting, they're not going to | |
1621 | * create more pages to coalesce with what's waiting.. */ | |
1622 | if (!list_empty(&cli->cl_cache_waiters)) { | |
1623 | CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); | |
0a3bdb00 | 1624 | return 1; |
d7e09d03 PT |
1625 | } |
1626 | if (atomic_read(&osc->oo_nr_writes) >= | |
1627 | cli->cl_max_pages_per_rpc) | |
0a3bdb00 | 1628 | return 1; |
d7e09d03 PT |
1629 | } else { |
1630 | if (atomic_read(&osc->oo_nr_reads) == 0) | |
0a3bdb00 | 1631 | return 0; |
d7e09d03 PT |
1632 | if (invalid_import) { |
1633 | CDEBUG(D_CACHE, "invalid import forcing RPC\n"); | |
0a3bdb00 | 1634 | return 1; |
d7e09d03 PT |
1635 | } |
1636 | /* all read are urgent. */ | |
1637 | if (!list_empty(&osc->oo_reading_exts)) | |
0a3bdb00 | 1638 | return 1; |
d7e09d03 PT |
1639 | } |
1640 | ||
0a3bdb00 | 1641 | return 0; |
d7e09d03 PT |
1642 | } |
1643 | ||
1644 | static void osc_update_pending(struct osc_object *obj, int cmd, int delta) | |
1645 | { | |
1646 | struct client_obd *cli = osc_cli(obj); | |
1647 | if (cmd & OBD_BRW_WRITE) { | |
1648 | atomic_add(delta, &obj->oo_nr_writes); | |
1649 | atomic_add(delta, &cli->cl_pending_w_pages); | |
1650 | LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); | |
1651 | } else { | |
1652 | atomic_add(delta, &obj->oo_nr_reads); | |
1653 | atomic_add(delta, &cli->cl_pending_r_pages); | |
1654 | LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); | |
1655 | } | |
1656 | OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); | |
1657 | } | |
1658 | ||
1659 | static int osc_makes_hprpc(struct osc_object *obj) | |
1660 | { | |
1661 | return !list_empty(&obj->oo_hp_exts); | |
1662 | } | |
1663 | ||
/**
 * Bring the membership of @item on @list in line with @should_be_on: an
 * unlinked item is appended when it should be on the list, a linked item is
 * removed when it should not. Nothing happens when the membership already
 * matches.
 */
static void on_list(struct list_head *item, struct list_head *list, int should_be_on)
{
	int linked = !list_empty(item);

	if (should_be_on && !linked)
		list_add_tail(item, list);
	else if (!should_be_on && linked)
		list_del_init(item);
}
1671 | ||
1672 | /* maintain the osc's cli list membership invariants so that osc_send_oap_rpc | |
1673 | * can find pages to build into rpcs quickly */ | |
1674 | static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) | |
1675 | { | |
1676 | if (osc_makes_hprpc(osc)) { | |
1677 | /* HP rpc */ | |
1678 | on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); | |
1679 | on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); | |
1680 | } else { | |
1681 | on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); | |
1682 | on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, | |
1683 | osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || | |
1684 | osc_makes_rpc(cli, osc, OBD_BRW_READ)); | |
1685 | } | |
1686 | ||
1687 | on_list(&osc->oo_write_item, &cli->cl_loi_write_list, | |
1688 | atomic_read(&osc->oo_nr_writes) > 0); | |
1689 | ||
1690 | on_list(&osc->oo_read_item, &cli->cl_loi_read_list, | |
1691 | atomic_read(&osc->oo_nr_reads) > 0); | |
1692 | ||
1693 | return osc_is_ready(osc); | |
1694 | } | |
1695 | ||
1696 | static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) | |
1697 | { | |
1698 | int is_ready; | |
1699 | ||
1700 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1701 | is_ready = __osc_list_maint(cli, osc); | |
1702 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1703 | ||
1704 | return is_ready; | |
1705 | } | |
1706 | ||
11d66e89 | 1707 | /* this is trying to propagate async writeback errors back up to the |
d7e09d03 PT |
1708 | * application. As an async write fails we record the error code for later if |
1709 | * the app does an fsync. As long as errors persist we force future rpcs to be | |
1710 | * sync so that the app can get a sync error and break the cycle of queueing | |
1711 | * pages for which writeback will fail. */ | |
1712 | static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, | |
1713 | int rc) | |
1714 | { | |
1715 | if (rc) { | |
1716 | if (!ar->ar_rc) | |
1717 | ar->ar_rc = rc; | |
1718 | ||
1719 | ar->ar_force_sync = 1; | |
1720 | ar->ar_min_xid = ptlrpc_sample_next_xid(); | |
1721 | return; | |
1722 | ||
1723 | } | |
1724 | ||
1725 | if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) | |
1726 | ar->ar_force_sync = 0; | |
1727 | } | |
1728 | ||
d7e09d03 PT |
1729 | |
1730 | /* this must be called holding the loi list lock to give coverage to exit_cache, | |
1731 | * async_flag maintenance, and oap_request */ | |
1732 | static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, | |
1733 | struct osc_async_page *oap, int sent, int rc) | |
1734 | { | |
1735 | struct osc_object *osc = oap->oap_obj; | |
1736 | struct lov_oinfo *loi = osc->oo_oinfo; | |
1737 | __u64 xid = 0; | |
1738 | ||
d7e09d03 | 1739 | if (oap->oap_request != NULL) { |
d7e09d03 PT |
1740 | xid = ptlrpc_req_xid(oap->oap_request); |
1741 | ptlrpc_req_finished(oap->oap_request); | |
1742 | oap->oap_request = NULL; | |
1743 | } | |
1744 | ||
1745 | /* As the transfer for this page is being done, clear the flags */ | |
1746 | spin_lock(&oap->oap_lock); | |
1747 | oap->oap_async_flags = 0; | |
1748 | spin_unlock(&oap->oap_lock); | |
1749 | oap->oap_interrupted = 0; | |
1750 | ||
1751 | if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { | |
1752 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1753 | osc_process_ar(&cli->cl_ar, xid, rc); | |
1754 | osc_process_ar(&loi->loi_ar, xid, rc); | |
1755 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1756 | } | |
1757 | ||
1758 | rc = osc_completion(env, oap, oap->oap_cmd, rc); | |
1759 | if (rc) | |
1760 | CERROR("completion on oap %p obj %p returns %d.\n", | |
1761 | oap, osc, rc); | |
d7e09d03 PT |
1762 | } |
1763 | ||
1764 | /** | |
1765 | * Try to add extent to one RPC. We need to think about the following things: | |
1766 | * - # of pages must not be over max_pages_per_rpc | |
1767 | * - extent must be compatible with previous ones | |
1768 | */ | |
1769 | static int try_to_add_extent_for_io(struct client_obd *cli, | |
1770 | struct osc_extent *ext, struct list_head *rpclist, | |
1771 | int *pc, unsigned int *max_pages) | |
1772 | { | |
1773 | struct osc_extent *tmp; | |
d7e09d03 PT |
1774 | |
1775 | EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), | |
1776 | ext); | |
1777 | ||
1778 | *max_pages = max(ext->oe_mppr, *max_pages); | |
1779 | if (*pc + ext->oe_nr_pages > *max_pages) | |
0a3bdb00 | 1780 | return 0; |
d7e09d03 PT |
1781 | |
1782 | list_for_each_entry(tmp, rpclist, oe_link) { | |
1783 | EASSERT(tmp->oe_owner == current, tmp); | |
1784 | #if 0 | |
1785 | if (overlapped(tmp, ext)) { | |
1786 | OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext); | |
1787 | EASSERT(0, ext); | |
1788 | } | |
1789 | #endif | |
1790 | ||
1791 | if (tmp->oe_srvlock != ext->oe_srvlock || | |
1792 | !tmp->oe_grants != !ext->oe_grants) | |
0a3bdb00 | 1793 | return 0; |
d7e09d03 PT |
1794 | |
1795 | /* remove break for strict check */ | |
1796 | break; | |
1797 | } | |
1798 | ||
1799 | *pc += ext->oe_nr_pages; | |
1800 | list_move_tail(&ext->oe_link, rpclist); | |
1801 | ext->oe_owner = current; | |
0a3bdb00 | 1802 | return 1; |
d7e09d03 PT |
1803 | } |
1804 | ||
1805 | /** | |
1806 | * In order to prevent multiple ptlrpcd from breaking contiguous extents, | |
1807 | * get_write_extent() takes all appropriate extents in atomic. | |
1808 | * | |
1809 | * The following policy is used to collect extents for IO: | |
1810 | * 1. Add as many HP extents as possible; | |
1811 | * 2. Add the first urgent extent in urgent extent list and take it out of | |
1812 | * urgent list; | |
1813 | * 3. Add subsequent extents of this urgent extent; | |
1814 | * 4. If urgent list is not empty, goto 2; | |
1815 | * 5. Traverse the extent tree from the 1st extent; | |
1816 | * 6. Above steps exit if there is no space in this RPC. | |
1817 | */ | |
1818 | static int get_write_extents(struct osc_object *obj, struct list_head *rpclist) | |
1819 | { | |
1820 | struct client_obd *cli = osc_cli(obj); | |
1821 | struct osc_extent *ext; | |
1822 | int page_count = 0; | |
1823 | unsigned int max_pages = cli->cl_max_pages_per_rpc; | |
1824 | ||
1825 | LASSERT(osc_object_is_locked(obj)); | |
1826 | while (!list_empty(&obj->oo_hp_exts)) { | |
1827 | ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, | |
1828 | oe_link); | |
1829 | LASSERT(ext->oe_state == OES_CACHE); | |
1830 | if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, | |
1831 | &max_pages)) | |
1832 | return page_count; | |
1833 | EASSERT(ext->oe_nr_pages <= max_pages, ext); | |
1834 | } | |
1835 | if (page_count == max_pages) | |
1836 | return page_count; | |
1837 | ||
1838 | while (!list_empty(&obj->oo_urgent_exts)) { | |
1839 | ext = list_entry(obj->oo_urgent_exts.next, | |
1840 | struct osc_extent, oe_link); | |
1841 | if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, | |
1842 | &max_pages)) | |
1843 | return page_count; | |
1844 | ||
1845 | if (!ext->oe_intree) | |
1846 | continue; | |
1847 | ||
1848 | while ((ext = next_extent(ext)) != NULL) { | |
1849 | if ((ext->oe_state != OES_CACHE) || | |
1850 | (!list_empty(&ext->oe_link) && | |
1851 | ext->oe_owner != NULL)) | |
1852 | continue; | |
1853 | ||
1854 | if (!try_to_add_extent_for_io(cli, ext, rpclist, | |
1855 | &page_count, &max_pages)) | |
1856 | return page_count; | |
1857 | } | |
1858 | } | |
1859 | if (page_count == max_pages) | |
1860 | return page_count; | |
1861 | ||
1862 | ext = first_extent(obj); | |
1863 | while (ext != NULL) { | |
1864 | if ((ext->oe_state != OES_CACHE) || | |
1865 | /* this extent may be already in current rpclist */ | |
1866 | (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) { | |
1867 | ext = next_extent(ext); | |
1868 | continue; | |
1869 | } | |
1870 | ||
1871 | if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, | |
1872 | &max_pages)) | |
1873 | return page_count; | |
1874 | ||
1875 | ext = next_extent(ext); | |
1876 | } | |
1877 | return page_count; | |
1878 | } | |
1879 | ||
1880 | static int | |
1881 | osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, | |
1882 | struct osc_object *osc, pdl_policy_t pol) | |
1883 | { | |
1884 | LIST_HEAD(rpclist); | |
1885 | struct osc_extent *ext; | |
1886 | struct osc_extent *tmp; | |
1887 | struct osc_extent *first = NULL; | |
21aef7d9 | 1888 | u32 page_count = 0; |
d7e09d03 PT |
1889 | int srvlock = 0; |
1890 | int rc = 0; | |
d7e09d03 PT |
1891 | |
1892 | LASSERT(osc_object_is_locked(osc)); | |
1893 | ||
1894 | page_count = get_write_extents(osc, &rpclist); | |
1895 | LASSERT(equi(page_count == 0, list_empty(&rpclist))); | |
1896 | ||
1897 | if (list_empty(&rpclist)) | |
0a3bdb00 | 1898 | return 0; |
d7e09d03 PT |
1899 | |
1900 | osc_update_pending(osc, OBD_BRW_WRITE, -page_count); | |
1901 | ||
1902 | list_for_each_entry(ext, &rpclist, oe_link) { | |
1903 | LASSERT(ext->oe_state == OES_CACHE || | |
1904 | ext->oe_state == OES_LOCK_DONE); | |
1905 | if (ext->oe_state == OES_CACHE) | |
1906 | osc_extent_state_set(ext, OES_LOCKING); | |
1907 | else | |
1908 | osc_extent_state_set(ext, OES_RPC); | |
1909 | } | |
1910 | ||
1911 | /* we're going to grab page lock, so release object lock because | |
1912 | * lock order is page lock -> object lock. */ | |
1913 | osc_object_unlock(osc); | |
1914 | ||
1915 | list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { | |
1916 | if (ext->oe_state == OES_LOCKING) { | |
1917 | rc = osc_extent_make_ready(env, ext); | |
1918 | if (unlikely(rc < 0)) { | |
1919 | list_del_init(&ext->oe_link); | |
1920 | osc_extent_finish(env, ext, 0, rc); | |
1921 | continue; | |
1922 | } | |
1923 | } | |
1924 | if (first == NULL) { | |
1925 | first = ext; | |
1926 | srvlock = ext->oe_srvlock; | |
1927 | } else { | |
1928 | LASSERT(srvlock == ext->oe_srvlock); | |
1929 | } | |
1930 | } | |
1931 | ||
1932 | if (!list_empty(&rpclist)) { | |
1933 | LASSERT(page_count > 0); | |
1934 | rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol); | |
1935 | LASSERT(list_empty(&rpclist)); | |
1936 | } | |
1937 | ||
1938 | osc_object_lock(osc); | |
0a3bdb00 | 1939 | return rc; |
d7e09d03 PT |
1940 | } |
1941 | ||
1942 | /** | |
1943 | * prepare pages for ASYNC io and put pages in send queue. | |
1944 | * | |
1945 | * \param cmd OBD_BRW_* macroses | |
1946 | * \param lop pending pages | |
1947 | * | |
1948 | * \return zero if no page added to send queue. | |
1949 | * \return 1 if pages successfully added to send queue. | |
1950 | * \return negative on errors. | |
1951 | */ | |
1952 | static int | |
1953 | osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, | |
1954 | struct osc_object *osc, pdl_policy_t pol) | |
1955 | { | |
1956 | struct osc_extent *ext; | |
1957 | struct osc_extent *next; | |
1958 | LIST_HEAD(rpclist); | |
1959 | int page_count = 0; | |
1960 | unsigned int max_pages = cli->cl_max_pages_per_rpc; | |
1961 | int rc = 0; | |
d7e09d03 PT |
1962 | |
1963 | LASSERT(osc_object_is_locked(osc)); | |
1964 | list_for_each_entry_safe(ext, next, | |
1965 | &osc->oo_reading_exts, oe_link) { | |
1966 | EASSERT(ext->oe_state == OES_LOCK_DONE, ext); | |
1967 | if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count, | |
1968 | &max_pages)) | |
1969 | break; | |
1970 | osc_extent_state_set(ext, OES_RPC); | |
1971 | EASSERT(ext->oe_nr_pages <= max_pages, ext); | |
1972 | } | |
1973 | LASSERT(page_count <= max_pages); | |
1974 | ||
1975 | osc_update_pending(osc, OBD_BRW_READ, -page_count); | |
1976 | ||
1977 | if (!list_empty(&rpclist)) { | |
1978 | osc_object_unlock(osc); | |
1979 | ||
1980 | LASSERT(page_count > 0); | |
1981 | rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol); | |
1982 | LASSERT(list_empty(&rpclist)); | |
1983 | ||
1984 | osc_object_lock(osc); | |
1985 | } | |
0a3bdb00 | 1986 | return rc; |
d7e09d03 PT |
1987 | } |
1988 | ||
/*
 * Unlink the first entry of \a list and evaluate to the struct osc_object
 * that embeds it through its oo_<item> member.  The callers in this file
 * check list_empty() before using it.
 */
#define list_to_obj(list, item) ({					\
	struct list_head *lto_first_ = (list)->next;			\
	list_del_init(lto_first_);					\
	list_entry(lto_first_, struct osc_object, oo_##item);		\
})
1994 | ||
1995 | /* This is called by osc_check_rpcs() to find which objects have pages that | |
1996 | * we could be sending. These lists are maintained by osc_makes_rpc(). */ | |
1997 | static struct osc_object *osc_next_obj(struct client_obd *cli) | |
1998 | { | |
d7e09d03 PT |
1999 | /* First return objects that have blocked locks so that they |
2000 | * will be flushed quickly and other clients can get the lock, | |
2001 | * then objects which have pages ready to be stuffed into RPCs */ | |
2002 | if (!list_empty(&cli->cl_loi_hp_ready_list)) | |
0a3bdb00 | 2003 | return list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item); |
d7e09d03 | 2004 | if (!list_empty(&cli->cl_loi_ready_list)) |
0a3bdb00 | 2005 | return list_to_obj(&cli->cl_loi_ready_list, ready_item); |
d7e09d03 PT |
2006 | |
2007 | /* then if we have cache waiters, return all objects with queued | |
2008 | * writes. This is especially important when many small files | |
2009 | * have filled up the cache and not been fired into rpcs because | |
11d66e89 | 2010 | * they don't pass the nr_pending/object threshold */ |
d7e09d03 PT |
2011 | if (!list_empty(&cli->cl_cache_waiters) && |
2012 | !list_empty(&cli->cl_loi_write_list)) | |
0a3bdb00 | 2013 | return list_to_obj(&cli->cl_loi_write_list, write_item); |
d7e09d03 PT |
2014 | |
2015 | /* then return all queued objects when we have an invalid import | |
2016 | * so that they get flushed */ | |
2017 | if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { | |
2018 | if (!list_empty(&cli->cl_loi_write_list)) | |
0a3bdb00 | 2019 | return list_to_obj(&cli->cl_loi_write_list, write_item); |
d7e09d03 | 2020 | if (!list_empty(&cli->cl_loi_read_list)) |
0a3bdb00 | 2021 | return list_to_obj(&cli->cl_loi_read_list, read_item); |
d7e09d03 | 2022 | } |
0a3bdb00 | 2023 | return NULL; |
d7e09d03 PT |
2024 | } |
2025 | ||
2026 | /* called with the loi list lock held */ | |
2027 | static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli, | |
2028 | pdl_policy_t pol) | |
2029 | { | |
2030 | struct osc_object *osc; | |
2031 | int rc = 0; | |
d7e09d03 PT |
2032 | |
2033 | while ((osc = osc_next_obj(cli)) != NULL) { | |
2034 | struct cl_object *obj = osc2cl(osc); | |
631abc6e | 2035 | struct lu_ref_link link; |
d7e09d03 PT |
2036 | |
2037 | OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); | |
2038 | ||
2039 | if (osc_max_rpc_in_flight(cli, osc)) { | |
2040 | __osc_list_maint(cli, osc); | |
2041 | break; | |
2042 | } | |
2043 | ||
2044 | cl_object_get(obj); | |
2045 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
631abc6e JH |
2046 | lu_object_ref_add_at(&obj->co_lu, &link, "check", |
2047 | current); | |
d7e09d03 PT |
2048 | |
2049 | /* attempt some read/write balancing by alternating between | |
2050 | * reads and writes in an object. The makes_rpc checks here | |
2051 | * would be redundant if we were getting read/write work items | |
2052 | * instead of objects. we don't want send_oap_rpc to drain a | |
2053 | * partial read pending queue when we're given this object to | |
2054 | * do io on writes while there are cache waiters */ | |
2055 | osc_object_lock(osc); | |
2056 | if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { | |
2057 | rc = osc_send_write_rpc(env, cli, osc, pol); | |
2058 | if (rc < 0) { | |
2059 | CERROR("Write request failed with %d\n", rc); | |
2060 | ||
2061 | /* osc_send_write_rpc failed, mostly because of | |
2062 | * memory pressure. | |
2063 | * | |
2064 | * It can't break here, because if: | |
2065 | * - a page was submitted by osc_io_submit, so | |
2066 | * page locked; | |
2067 | * - no request in flight | |
2068 | * - no subsequent request | |
2069 | * The system will be in live-lock state, | |
2070 | * because there is no chance to call | |
2071 | * osc_io_unplug() and osc_check_rpcs() any | |
2072 | * more. pdflush can't help in this case, | |
2073 | * because it might be blocked at grabbing | |
2074 | * the page lock as we mentioned. | |
2075 | * | |
2076 | * Anyway, continue to drain pages. */ | |
2077 | /* break; */ | |
2078 | } | |
2079 | } | |
2080 | if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { | |
2081 | rc = osc_send_read_rpc(env, cli, osc, pol); | |
2082 | if (rc < 0) | |
2083 | CERROR("Read request failed with %d\n", rc); | |
2084 | } | |
2085 | osc_object_unlock(osc); | |
2086 | ||
2087 | osc_list_maint(cli, osc); | |
631abc6e JH |
2088 | lu_object_ref_del_at(&obj->co_lu, &link, "check", |
2089 | current); | |
d7e09d03 PT |
2090 | cl_object_put(env, obj); |
2091 | ||
2092 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
2093 | } | |
2094 | } | |
2095 | ||
2096 | static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, | |
2097 | struct osc_object *osc, pdl_policy_t pol, int async) | |
2098 | { | |
d7e09d03 PT |
2099 | int rc = 0; |
2100 | ||
cad6fafa BJ |
2101 | if (osc != NULL && osc_list_maint(cli, osc) == 0) |
2102 | return 0; | |
2103 | ||
2104 | if (!async) { | |
2105 | /* disable osc_lru_shrink() temporarily to avoid | |
2106 | * potential stack overrun problem. LU-2859 */ | |
2107 | atomic_inc(&cli->cl_lru_shrinkers); | |
2108 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
2109 | osc_check_rpcs(env, cli, pol); | |
2110 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
2111 | atomic_dec(&cli->cl_lru_shrinkers); | |
2112 | } else { | |
2113 | CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); | |
2114 | LASSERT(cli->cl_writeback_work != NULL); | |
2115 | rc = ptlrpcd_queue_work(cli->cl_writeback_work); | |
d7e09d03 | 2116 | } |
d7e09d03 PT |
2117 | return rc; |
2118 | } | |
2119 | ||
2120 | static int osc_io_unplug_async(const struct lu_env *env, | |
2121 | struct client_obd *cli, struct osc_object *osc) | |
2122 | { | |
2123 | /* XXX: policy is no use actually. */ | |
2124 | return osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, 1); | |
2125 | } | |
2126 | ||
2127 | void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, | |
2128 | struct osc_object *osc, pdl_policy_t pol) | |
2129 | { | |
2130 | (void)osc_io_unplug0(env, cli, osc, pol, 0); | |
2131 | } | |
2132 | ||
2133 | int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, | |
2134 | struct page *page, loff_t offset) | |
2135 | { | |
2136 | struct obd_export *exp = osc_export(osc); | |
2137 | struct osc_async_page *oap = &ops->ops_oap; | |
d7e09d03 PT |
2138 | |
2139 | if (!page) | |
2140 | return cfs_size_round(sizeof(*oap)); | |
2141 | ||
2142 | oap->oap_magic = OAP_MAGIC; | |
2143 | oap->oap_cli = &exp->exp_obd->u.cli; | |
2144 | oap->oap_obj = osc; | |
2145 | ||
2146 | oap->oap_page = page; | |
2147 | oap->oap_obj_off = offset; | |
2148 | LASSERT(!(offset & ~CFS_PAGE_MASK)); | |
2149 | ||
2eb90a75 | 2150 | if (!client_is_remote(exp) && capable(CFS_CAP_SYS_RESOURCE)) |
d7e09d03 PT |
2151 | oap->oap_brw_flags = OBD_BRW_NOQUOTA; |
2152 | ||
2153 | INIT_LIST_HEAD(&oap->oap_pending_item); | |
2154 | INIT_LIST_HEAD(&oap->oap_rpc_item); | |
2155 | ||
2156 | spin_lock_init(&oap->oap_lock); | |
b0f5aad5 | 2157 | CDEBUG(D_INFO, "oap %p page %p obj off %llu\n", |
d7e09d03 | 2158 | oap, page, oap->oap_obj_off); |
0a3bdb00 | 2159 | return 0; |
d7e09d03 PT |
2160 | } |
2161 | ||
2162 | int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, | |
2163 | struct osc_page *ops) | |
2164 | { | |
2165 | struct osc_io *oio = osc_env_io(env); | |
2166 | struct osc_extent *ext = NULL; | |
2167 | struct osc_async_page *oap = &ops->ops_oap; | |
2168 | struct client_obd *cli = oap->oap_cli; | |
2169 | struct osc_object *osc = oap->oap_obj; | |
2170 | pgoff_t index; | |
2171 | int grants = 0; | |
2172 | int brw_flags = OBD_BRW_ASYNC; | |
2173 | int cmd = OBD_BRW_WRITE; | |
2174 | int need_release = 0; | |
2175 | int rc = 0; | |
d7e09d03 PT |
2176 | |
2177 | if (oap->oap_magic != OAP_MAGIC) | |
0a3bdb00 | 2178 | return -EINVAL; |
d7e09d03 PT |
2179 | |
2180 | if (cli->cl_import == NULL || cli->cl_import->imp_invalid) | |
0a3bdb00 | 2181 | return -EIO; |
d7e09d03 PT |
2182 | |
2183 | if (!list_empty(&oap->oap_pending_item) || | |
2184 | !list_empty(&oap->oap_rpc_item)) | |
0a3bdb00 | 2185 | return -EBUSY; |
d7e09d03 PT |
2186 | |
2187 | /* Set the OBD_BRW_SRVLOCK before the page is queued. */ | |
2188 | brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; | |
2189 | if (!client_is_remote(osc_export(osc)) && | |
2eb90a75 | 2190 | capable(CFS_CAP_SYS_RESOURCE)) { |
d7e09d03 PT |
2191 | brw_flags |= OBD_BRW_NOQUOTA; |
2192 | cmd |= OBD_BRW_NOQUOTA; | |
2193 | } | |
2194 | ||
2195 | /* check if the file's owner/group is over quota */ | |
2196 | if (!(cmd & OBD_BRW_NOQUOTA)) { | |
2197 | struct cl_object *obj; | |
2198 | struct cl_attr *attr; | |
2199 | unsigned int qid[MAXQUOTAS]; | |
2200 | ||
2201 | obj = cl_object_top(&osc->oo_cl); | |
2202 | attr = &osc_env_info(env)->oti_attr; | |
2203 | ||
2204 | cl_object_attr_lock(obj); | |
2205 | rc = cl_object_attr_get(env, obj, attr); | |
2206 | cl_object_attr_unlock(obj); | |
2207 | ||
2208 | qid[USRQUOTA] = attr->cat_uid; | |
2209 | qid[GRPQUOTA] = attr->cat_gid; | |
2210 | if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) | |
2211 | rc = -EDQUOT; | |
2212 | if (rc) | |
0a3bdb00 | 2213 | return rc; |
d7e09d03 PT |
2214 | } |
2215 | ||
2216 | oap->oap_cmd = cmd; | |
2217 | oap->oap_page_off = ops->ops_from; | |
2218 | oap->oap_count = ops->ops_to - ops->ops_from; | |
2219 | oap->oap_async_flags = 0; | |
2220 | oap->oap_brw_flags = brw_flags; | |
2221 | ||
2222 | OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", | |
2223 | oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); | |
2224 | ||
2225 | index = oap2cl_page(oap)->cp_index; | |
2226 | ||
2227 | /* Add this page into extent by the following steps: | |
2228 | * 1. if there exists an active extent for this IO, mostly this page | |
2229 | * can be added to the active extent and sometimes we need to | |
11d66e89 | 2230 | * expand extent to accommodate this page; |
d7e09d03 PT |
2231 | * 2. otherwise, a new extent will be allocated. */ |
2232 | ||
2233 | ext = oio->oi_active; | |
2234 | if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) { | |
2235 | /* one chunk plus extent overhead must be enough to write this | |
2236 | * page */ | |
2237 | grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; | |
2238 | if (ext->oe_end >= index) | |
2239 | grants = 0; | |
2240 | ||
2241 | /* it doesn't need any grant to dirty this page */ | |
2242 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
2243 | rc = osc_enter_cache_try(cli, oap, grants, 0); | |
2244 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
2245 | if (rc == 0) { /* try failed */ | |
2246 | grants = 0; | |
2247 | need_release = 1; | |
2248 | } else if (ext->oe_end < index) { | |
2249 | int tmp = grants; | |
2250 | /* try to expand this extent */ | |
2251 | rc = osc_extent_expand(ext, index, &tmp); | |
2252 | if (rc < 0) { | |
2253 | need_release = 1; | |
2254 | /* don't free reserved grant */ | |
2255 | } else { | |
2256 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2257 | "expanded for %lu.\n", index); | |
2258 | osc_unreserve_grant(cli, grants, tmp); | |
2259 | grants = 0; | |
2260 | } | |
2261 | } | |
2262 | rc = 0; | |
2263 | } else if (ext != NULL) { | |
2264 | /* index is located outside of active extent */ | |
2265 | need_release = 1; | |
2266 | } | |
2267 | if (need_release) { | |
2268 | osc_extent_release(env, ext); | |
2269 | oio->oi_active = NULL; | |
2270 | ext = NULL; | |
2271 | } | |
2272 | ||
2273 | if (ext == NULL) { | |
2274 | int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; | |
2275 | ||
2276 | /* try to find new extent to cover this page */ | |
2277 | LASSERT(oio->oi_active == NULL); | |
2278 | /* we may have allocated grant for this page if we failed | |
2279 | * to expand the previous active extent. */ | |
2280 | LASSERT(ergo(grants > 0, grants >= tmp)); | |
2281 | ||
2282 | rc = 0; | |
2283 | if (grants == 0) { | |
2284 | /* we haven't allocated grant for this page. */ | |
2285 | rc = osc_enter_cache(env, cli, oap, tmp); | |
2286 | if (rc == 0) | |
2287 | grants = tmp; | |
2288 | } | |
2289 | ||
2290 | tmp = grants; | |
2291 | if (rc == 0) { | |
2292 | ext = osc_extent_find(env, osc, index, &tmp); | |
2293 | if (IS_ERR(ext)) { | |
2294 | LASSERT(tmp == grants); | |
2295 | osc_exit_cache(cli, oap); | |
2296 | rc = PTR_ERR(ext); | |
2297 | ext = NULL; | |
2298 | } else { | |
2299 | oio->oi_active = ext; | |
2300 | } | |
2301 | } | |
2302 | if (grants > 0) | |
2303 | osc_unreserve_grant(cli, grants, tmp); | |
2304 | } | |
2305 | ||
2306 | LASSERT(ergo(rc == 0, ext != NULL)); | |
2307 | if (ext != NULL) { | |
2308 | EASSERTF(ext->oe_end >= index && ext->oe_start <= index, | |
2309 | ext, "index = %lu.\n", index); | |
2310 | LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); | |
2311 | ||
2312 | osc_object_lock(osc); | |
2313 | if (ext->oe_nr_pages == 0) | |
2314 | ext->oe_srvlock = ops->ops_srvlock; | |
2315 | else | |
2316 | LASSERT(ext->oe_srvlock == ops->ops_srvlock); | |
2317 | ++ext->oe_nr_pages; | |
2318 | list_add_tail(&oap->oap_pending_item, &ext->oe_pages); | |
2319 | osc_object_unlock(osc); | |
2320 | } | |
0a3bdb00 | 2321 | return rc; |
d7e09d03 PT |
2322 | } |
2323 | ||
2324 | int osc_teardown_async_page(const struct lu_env *env, | |
2325 | struct osc_object *obj, struct osc_page *ops) | |
2326 | { | |
2327 | struct osc_async_page *oap = &ops->ops_oap; | |
2328 | struct osc_extent *ext = NULL; | |
2329 | int rc = 0; | |
d7e09d03 PT |
2330 | |
2331 | LASSERT(oap->oap_magic == OAP_MAGIC); | |
2332 | ||
2333 | CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", | |
2334 | oap, ops, oap2cl_page(oap)->cp_index); | |
2335 | ||
2336 | osc_object_lock(obj); | |
2337 | if (!list_empty(&oap->oap_rpc_item)) { | |
2338 | CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); | |
2339 | rc = -EBUSY; | |
2340 | } else if (!list_empty(&oap->oap_pending_item)) { | |
2341 | ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index); | |
2342 | /* only truncated pages are allowed to be taken out. | |
2343 | * See osc_extent_truncate() and osc_cache_truncate_start() | |
2344 | * for details. */ | |
2345 | if (ext != NULL && ext->oe_state != OES_TRUNC) { | |
2346 | OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", | |
2347 | oap2cl_page(oap)->cp_index); | |
2348 | rc = -EBUSY; | |
2349 | } | |
2350 | } | |
2351 | osc_object_unlock(obj); | |
2352 | if (ext != NULL) | |
2353 | osc_extent_put(env, ext); | |
0a3bdb00 | 2354 | return rc; |
d7e09d03 PT |
2355 | } |
2356 | ||
2357 | /** | |
2358 | * This is called when a page is picked up by kernel to write out. | |
2359 | * | |
2360 | * We should find out the corresponding extent and add the whole extent | |
2361 | * into urgent list. The extent may be being truncated or used, handle it | |
2362 | * carefully. | |
2363 | */ | |
2364 | int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, | |
2365 | struct osc_page *ops) | |
2366 | { | |
2367 | struct osc_extent *ext = NULL; | |
2368 | struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); | |
2369 | struct cl_page *cp = ops->ops_cl.cpl_page; | |
2370 | pgoff_t index = cp->cp_index; | |
2371 | struct osc_async_page *oap = &ops->ops_oap; | |
2372 | bool unplug = false; | |
2373 | int rc = 0; | |
d7e09d03 PT |
2374 | |
2375 | osc_object_lock(obj); | |
2376 | ext = osc_extent_lookup(obj, index); | |
2377 | if (ext == NULL) { | |
2378 | osc_extent_tree_dump(D_ERROR, obj); | |
2379 | LASSERTF(0, "page index %lu is NOT covered.\n", index); | |
2380 | } | |
2381 | ||
2382 | switch (ext->oe_state) { | |
2383 | case OES_RPC: | |
2384 | case OES_LOCK_DONE: | |
2385 | CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp), | |
2386 | "flush an in-rpc page?\n"); | |
2387 | LASSERT(0); | |
2388 | break; | |
2389 | case OES_LOCKING: | |
2390 | /* If we know this extent is being written out, we should abort | |
2391 | * so that the writer can make this page ready. Otherwise, there | |
2392 | * exists a deadlock problem because other process can wait for | |
2393 | * page writeback bit holding page lock; and meanwhile in | |
2394 | * vvp_page_make_ready(), we need to grab page lock before | |
2395 | * really sending the RPC. */ | |
2396 | case OES_TRUNC: | |
2397 | /* race with truncate, page will be redirtied */ | |
15f13cde AK |
2398 | case OES_ACTIVE: |
2399 | /* The extent is active so we need to abort and let the caller | |
2400 | * re-dirty the page. If we continued on here, and we were the | |
2401 | * one making the extent active, we could deadlock waiting for | |
2402 | * the page writeback to clear but it won't because the extent | |
2403 | * is active and won't be written out. */ | |
d7e09d03 PT |
2404 | GOTO(out, rc = -EAGAIN); |
2405 | default: | |
2406 | break; | |
2407 | } | |
2408 | ||
2409 | rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE); | |
2410 | if (rc) | |
2411 | GOTO(out, rc); | |
2412 | ||
2413 | spin_lock(&oap->oap_lock); | |
2414 | oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; | |
2415 | spin_unlock(&oap->oap_lock); | |
2416 | ||
2417 | if (memory_pressure_get()) | |
2418 | ext->oe_memalloc = 1; | |
2419 | ||
2420 | ext->oe_urgent = 1; | |
2421 | if (ext->oe_state == OES_CACHE) { | |
2422 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2423 | "flush page %p make it urgent.\n", oap); | |
2424 | if (list_empty(&ext->oe_link)) | |
2425 | list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); | |
2426 | unplug = true; | |
2427 | } | |
2428 | rc = 0; | |
d7e09d03 PT |
2429 | |
2430 | out: | |
2431 | osc_object_unlock(obj); | |
2432 | osc_extent_put(env, ext); | |
2433 | if (unplug) | |
2434 | osc_io_unplug_async(env, osc_cli(obj), obj); | |
2435 | return rc; | |
2436 | } | |
2437 | ||
2438 | /** | |
2439 | * this is called when a sync waiter receives an interruption. Its job is to | |
2440 | * get the caller woken as soon as possible. If its page hasn't been put in an | |
2441 | * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as | |
2442 | * desiring interruption which will forcefully complete the rpc once the rpc | |
2443 | * has timed out. | |
2444 | */ | |
2445 | int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) | |
2446 | { | |
2447 | struct osc_async_page *oap = &ops->ops_oap; | |
2448 | struct osc_object *obj = oap->oap_obj; | |
2449 | struct client_obd *cli = osc_cli(obj); | |
2450 | struct osc_extent *ext; | |
2451 | struct osc_extent *found = NULL; | |
2452 | struct list_head *plist; | |
2453 | pgoff_t index = oap2cl_page(oap)->cp_index; | |
2454 | int rc = -EBUSY; | |
2455 | int cmd; | |
d7e09d03 PT |
2456 | |
2457 | LASSERT(!oap->oap_interrupted); | |
2458 | oap->oap_interrupted = 1; | |
2459 | ||
2460 | /* Find out the caching extent */ | |
2461 | osc_object_lock(obj); | |
2462 | if (oap->oap_cmd & OBD_BRW_WRITE) { | |
2463 | plist = &obj->oo_urgent_exts; | |
2464 | cmd = OBD_BRW_WRITE; | |
2465 | } else { | |
2466 | plist = &obj->oo_reading_exts; | |
2467 | cmd = OBD_BRW_READ; | |
2468 | } | |
2469 | list_for_each_entry(ext, plist, oe_link) { | |
2470 | if (ext->oe_start <= index && ext->oe_end >= index) { | |
2471 | LASSERT(ext->oe_state == OES_LOCK_DONE); | |
2472 | /* For OES_LOCK_DONE state extent, it has already held | |
2473 | * a refcount for RPC. */ | |
2474 | found = osc_extent_get(ext); | |
2475 | break; | |
2476 | } | |
2477 | } | |
2478 | if (found != NULL) { | |
2479 | list_del_init(&found->oe_link); | |
2480 | osc_update_pending(obj, cmd, -found->oe_nr_pages); | |
2481 | osc_object_unlock(obj); | |
2482 | ||
2483 | osc_extent_finish(env, found, 0, -EINTR); | |
2484 | osc_extent_put(env, found); | |
2485 | rc = 0; | |
2486 | } else { | |
2487 | osc_object_unlock(obj); | |
2488 | /* ok, it's been put in an rpc. only one oap gets a request | |
2489 | * reference */ | |
2490 | if (oap->oap_request != NULL) { | |
2491 | ptlrpc_mark_interrupted(oap->oap_request); | |
2492 | ptlrpcd_wake(oap->oap_request); | |
2493 | ptlrpc_req_finished(oap->oap_request); | |
2494 | oap->oap_request = NULL; | |
2495 | } | |
2496 | } | |
2497 | ||
2498 | osc_list_maint(cli, obj); | |
0a3bdb00 | 2499 | return rc; |
d7e09d03 PT |
2500 | } |
2501 | ||
/**
 * Wrap every page on @list in one new urgent extent and queue that extent
 * on the object for I/O.
 *
 * The extent spans the lowest to the highest cp_index found on @list. It is
 * put directly into OES_LOCK_DONE state and linked on oo_urgent_exts when
 * @cmd has OBD_BRW_WRITE set, on oo_reading_exts otherwise. The page count
 * is handed to osc_update_pending() for the same direction and
 * osc_io_unplug() is called before returning.
 *
 * @env       : environment handed through to the helpers called here;
 * @obj       : object that will own the extent;
 * @list      : pages linked through oap_pending_item, empty on return;
 * @cmd       : tested for OBD_BRW_READ and OBD_BRW_WRITE;
 * @brw_flags : only OBD_BRW_SRVLOCK is looked at.
 *
 * Return 0 once the extent is queued, or -ENOMEM if no extent could be
 * allocated, in which case every page has been completed with -ENOMEM.
 *
 * NOTE(review): with an empty @list the extent would be created with
 * oe_start = CL_PAGE_EOF and oe_end = 0; callers are presumably expected to
 * pass at least one page - confirm.
 */
2502 | int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, | |
2503 | struct list_head *list, int cmd, int brw_flags) | |
2504 | { | |
2505 | struct client_obd *cli = osc_cli(obj); | |
2506 | struct osc_extent *ext; | |
f13ab92e | 2507 | struct osc_async_page *oap, *tmp; |
d7e09d03 PT |
2508 | int page_count = 0; |
2509 | int mppr = cli->cl_max_pages_per_rpc; | |
2510 | pgoff_t start = CL_PAGE_EOF; | |
2511 | pgoff_t end = 0; | |
d7e09d03 PT |
2512 | |
/* Single pass over the list: find the lowest and highest page index and
 * count the pages. */
2513 | list_for_each_entry(oap, list, oap_pending_item) { | |
2514 | struct cl_page *cp = oap2cl_page(oap); | |
2515 | if (cp->cp_index > end) | |
2516 | end = cp->cp_index; | |
2517 | if (cp->cp_index < start) | |
2518 | start = cp->cp_index; | |
2519 | ++page_count; | |
/* The comparison yields 0 or 1, so mppr is doubled each time the running
 * page count exceeds it and left alone otherwise. */
2520 | mppr <<= (page_count > mppr); | |
2521 | } | |
2522 | |
2523 | ext = osc_extent_alloc(obj); | |
2524 | if (ext == NULL) { | |
/* No extent: unlink each page and complete it with -ENOMEM. The _safe
 * iterator is needed because entries are removed while walking. */
f13ab92e | 2525 | list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { |
d7e09d03 PT |
2526 | list_del_init(&oap->oap_pending_item); |
2527 | osc_ap_completion(env, cli, oap, 0, -ENOMEM); | |
2528 | } | |
0a3bdb00 | 2529 | return -ENOMEM; |
d7e09d03 PT |
2530 | } |
2531 | |
/* Describe the extent; oe_max_end is pinned to oe_end. */
2532 | ext->oe_rw = !!(cmd & OBD_BRW_READ); | |
2533 | ext->oe_urgent = 1; | |
2534 | ext->oe_start = start; | |
2535 | ext->oe_end = ext->oe_max_end = end; | |
2536 | ext->oe_obj = obj; | |
2537 | ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); | |
2538 | ext->oe_nr_pages = page_count; | |
2539 | ext->oe_mppr = mppr; | |
/* Move all pages onto the extent; @list is left empty. */
2540 | list_splice_init(list, &ext->oe_pages); | |
2541 | |
2542 | osc_object_lock(obj); | |
2543 | /* Reuse the initial refcount for RPC, don't drop it */ | |
2544 | osc_extent_state_set(ext, OES_LOCK_DONE); | |
2545 | if (cmd & OBD_BRW_WRITE) { | |
2546 | list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); | |
2547 | osc_update_pending(obj, OBD_BRW_WRITE, page_count); | |
2548 | } else { | |
2549 | list_add_tail(&ext->oe_link, &obj->oo_reading_exts); | |
2550 | osc_update_pending(obj, OBD_BRW_READ, page_count); | |
2551 | } | |
2552 | osc_object_unlock(obj); | |
2553 | |
2554 | osc_io_unplug(env, cli, obj, PDL_POLICY_ROUND); | |
0a3bdb00 | 2555 | return 0; |
d7e09d03 PT |
2556 | } |
2557 | ||
2558 | /** | |
2559 | * Called by osc_io_setattr_start() to freeze and destroy covering extents. | |
 *
 * Pages whose index is >= cl_index(@size) are truncated. 'partial' is set
 * when @size is greater than cl_offset() of that index (presumably: @size
 * is not aligned to a page boundary - confirm against cl_offset()).
 *
 * The work is done in three phases, repeated until no busy extent is left:
 *  1. Under the object lock, walk the extents from the first one that ends
 *     at or after the index. An extent whose state compares greater than
 *     OES_CACHE, or that is urgent, stops the walk and is remembered in
 *     'waiting'. Every other extent gets a reference, is linked on a
 *     private list and is either flagged oe_trunc_pending (OES_ACTIVE) or
 *     switched to OES_TRUNC with its pages taken out of the pending write
 *     count.
 *  2. Without the lock, every listed extent is passed to
 *     osc_extent_truncate(). An extent left with no pages is removed; one
 *     that still has pages is kept, with an extra reference, in
 *     @oio->oi_trunc.
 *  3. If the walk stopped at a busy extent, wait for it to reach OES_INV
 *     and start again.
 *
 * Return 0, or the first negative value returned by osc_extent_truncate().
2560 | */ | |
2561 | int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio, | |
2562 | struct osc_object *obj, __u64 size) | |
2563 | { | |
2564 | struct client_obd *cli = osc_cli(obj); | |
2565 | struct osc_extent *ext; | |
2566 | struct osc_extent *waiting = NULL; | |
2567 | pgoff_t index; | |
2568 | LIST_HEAD(list); | |
2569 | int result = 0; | |
2570 | bool partial; | |
d7e09d03 PT |
2571 | |
2572 | /* pages with index greater than or equal to index will be truncated. */ | |
2573 | index = cl_index(osc2cl(obj), size); | |
2574 | partial = size > cl_offset(osc2cl(obj), index); | |
2575 | |
/* Re-entered after waiting for a busy extent; 'index' and 'partial' may
 * have been advanced past a partially truncated extent by then. */
2576 | again: | |
2577 | osc_object_lock(obj); | |
2578 | ext = osc_extent_search(obj, index); | |
2579 | if (ext == NULL) | |
2580 | ext = first_extent(obj); | |
2581 | else if (ext->oe_end < index) | |
2582 | ext = next_extent(ext); | |
2583 | while (ext != NULL) { | |
2584 | EASSERT(ext->oe_state != OES_TRUNC, ext); | |
2585 | |
2586 | if (ext->oe_state > OES_CACHE || ext->oe_urgent) { | |
2587 | /* if ext is in urgent state, it means there must exist | |
2588 | * a page already having been flushed by write_page(). | |
2589 | * We have to wait for this extent because we can't | |
2590 | * truncate that page. */ | |
2591 | LASSERT(!ext->oe_hp); | |
2592 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2593 | "waiting for busy extent\n"); | |
2594 | waiting = osc_extent_get(ext); | |
2595 | break; | |
2596 | } | |
2597 | |
b0f5aad5 | 2598 | OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size); |
d7e09d03 PT |
2599 | |
/* This reference is dropped by the osc_extent_put() at the end of the
 * truncate loop below. */
2600 | osc_extent_get(ext); | |
2601 | if (ext->oe_state == OES_ACTIVE) { | |
2602 | /* although we grab the inode mutex for the write path, we | |
2603 | * release it before releasing the extent (in osc_io_end()), | |
2604 | * so there is a race window in which an extent is still | |
2605 | * in OES_ACTIVE when truncate starts. */ | |
2606 | LASSERT(!ext->oe_trunc_pending); | |
2607 | ext->oe_trunc_pending = 1; | |
2608 | } else { | |
2609 | EASSERT(ext->oe_state == OES_CACHE, ext); | |
2610 | osc_extent_state_set(ext, OES_TRUNC); | |
2611 | osc_update_pending(obj, OBD_BRW_WRITE, | |
2612 | -ext->oe_nr_pages); | |
2613 | } | |
2614 | EASSERT(list_empty(&ext->oe_link), ext); | |
2615 | list_add_tail(&ext->oe_link, &list); | |
2616 | |
2617 | ext = next_extent(ext); | |
2618 | } | |
2619 | osc_object_unlock(obj); | |
2620 | |
2621 | osc_list_maint(cli, obj); | |
2622 | |
/* Phase 2: truncate the collected extents without the object lock. */
2623 | while (!list_empty(&list)) { | |
2624 | int rc; | |
2625 | |
2626 | ext = list_entry(list.next, struct osc_extent, oe_link); | |
2627 | list_del_init(&ext->oe_link); | |
2628 | |
2629 | /* extent may be in OES_ACTIVE state because inode mutex | |
2630 | * is released before osc_io_end() in file write case */ | |
2631 | if (ext->oe_state != OES_TRUNC) | |
2632 | osc_extent_wait(env, ext, OES_TRUNC); | |
2633 | |
2634 | rc = osc_extent_truncate(ext, index, partial); | |
2635 | if (rc < 0) { | |
/* keep only the first error, but carry on with the remaining extents */
2636 | if (result == 0) | |
2637 | result = rc; | |
2638 | |
2639 | OSC_EXTENT_DUMP(D_ERROR, ext, | |
2640 | "truncate error %d\n", rc); | |
2641 | } else if (ext->oe_nr_pages == 0) { | |
2642 | osc_extent_remove(ext); | |
2643 | } else { | |
2644 | /* this must be an overlapped extent which means only | |
2645 | * part of pages in this extent have been truncated. | |
2646 | */ | |
2647 | EASSERTF(ext->oe_start <= index, ext, | |
2648 | "trunc index = %lu/%d.\n", index, partial); | |
2649 | /* fix index to skip this partially truncated extent */ | |
2650 | index = ext->oe_end + 1; | |
2651 | partial = false; | |
2652 | |
2653 | /* we need to hold this extent in OES_TRUNC state so | |
2654 | * that no writeback will happen. This is to avoid | |
2655 | * BUG 17397. */ | |
2656 | LASSERT(oio->oi_trunc == NULL); | |
2657 | oio->oi_trunc = osc_extent_get(ext); | |
2658 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
b0f5aad5 | 2659 | "trunc at %llu\n", size); |
d7e09d03 PT |
2660 | } |
2661 | osc_extent_put(env, ext); | |
2662 | } | |
/* Phase 3: the walk stopped at a busy extent; wait for it and retry. */
2663 | if (waiting != NULL) { | |
2664 | int rc; | |
2665 | |
2666 | /* ignore the result of osc_extent_wait; the write initiator | |
2667 | * should take care of it. */ | |
2668 | rc = osc_extent_wait(env, waiting, OES_INV); | |
2669 | if (rc < 0) | |
451721cc | 2670 | OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc); |
d7e09d03 PT |
2671 | |
2672 | osc_extent_put(env, waiting); | |
2673 | waiting = NULL; | |
2674 | goto again; | |
2675 | } | |
0a3bdb00 | 2676 | return result; |
d7e09d03 PT |
2677 | } |
2678 | ||
2679 | /** | |
2680 | * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. | |
 *
 * @oio->oi_trunc is always reset to NULL. If it held an extent, that extent
 * is switched from OES_TRUNC back to OES_CACHE under the object lock, its
 * pages are added back to the pending write count, and the reference that
 * was stored in oi_trunc is dropped. When the extent has oe_fsync_wait set
 * it is additionally marked urgent, moved to oo_urgent_exts, and
 * osc_io_unplug_async() is called once the lock has been released.
2681 | */ | |
2682 | void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, | |
2683 | struct osc_object *obj) | |
2684 | { | |
2685 | struct osc_extent *ext = oio->oi_trunc; | |
2686 | |
2687 | oio->oi_trunc = NULL; | |
2688 | if (ext != NULL) { | |
2689 | bool unplug = false; | |
2690 | |
2691 | EASSERT(ext->oe_nr_pages > 0, ext); | |
2692 | EASSERT(ext->oe_state == OES_TRUNC, ext); | |
2693 | EASSERT(!ext->oe_urgent, ext); | |
2694 | |
2695 | OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); | |
2696 | osc_object_lock(obj); | |
2697 | osc_extent_state_set(ext, OES_CACHE); | |
/* oe_urgent was asserted clear above, before the lock was taken; it is
 * tested again here under the object lock. */
2698 | if (ext->oe_fsync_wait && !ext->oe_urgent) { | |
2699 | ext->oe_urgent = 1; | |
2700 | list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); | |
2701 | unplug = true; | |
2702 | } | |
2703 | osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); | |
2704 | osc_object_unlock(obj); | |
2705 | osc_extent_put(env, ext); | |
2706 | |
2707 | if (unplug) | |
2708 | osc_io_unplug_async(env, osc_cli(obj), obj); | |
2709 | } | |
2710 | } | |
2711 | ||
2712 | /** | |
2713 | * Wait for extents in a specific range to be written out. | |
2714 | * The caller must have called osc_cache_writeback_range() to issue IO | |
2715 | * otherwise it will take a long time for this function to finish. | |
2716 | * | |
2717 | * Caller must hold inode_mutex, or cancel exclusive dlm lock so that | |
2718 | * nobody else can dirty this range of file while we're waiting for | |
2719 | * extents to be written. | |
 *
 * Only extents that have oe_fsync_wait set are waited for; each of them is
 * waited on, one at a time, until it reaches OES_INV. Extents starting
 * after @end terminate the scan.
 *
 * Return 0, or the first non-zero value returned by osc_extent_wait().
2720 | */ | |
2721 | int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, | |
2722 | pgoff_t start, pgoff_t end) | |
2723 | { | |
2724 | struct osc_extent *ext; | |
2725 | pgoff_t index = start; | |
2726 | int result = 0; | |
d7e09d03 PT |
2727 | |
/* The search is restarted from 'index' after every wait, because the
 * object lock is dropped while waiting. */
2728 | again: | |
2729 | osc_object_lock(obj); | |
2730 | ext = osc_extent_search(obj, index); | |
2731 | if (ext == NULL) | |
2732 | ext = first_extent(obj); | |
2733 | else if (ext->oe_end < index) | |
2734 | ext = next_extent(ext); | |
2735 | while (ext != NULL) { | |
2736 | int rc; | |
2737 | |
2738 | if (ext->oe_start > end) | |
2739 | break; | |
2740 | |
/* extents not flagged for an fsync-style wait are skipped */
2741 | if (!ext->oe_fsync_wait) { | |
2742 | ext = next_extent(ext); | |
2743 | continue; | |
2744 | } | |
2745 | |
2746 | EASSERT(ergo(ext->oe_state == OES_CACHE, | |
2747 | ext->oe_hp || ext->oe_urgent), ext); | |
2748 | EASSERT(ergo(ext->oe_state == OES_ACTIVE, | |
2749 | !ext->oe_hp && ext->oe_urgent), ext); | |
2750 | |
/* Resume after this extent next time; take a reference so the extent can
 * be used once the lock is released. */
2751 | index = ext->oe_end + 1; | |
2752 | osc_extent_get(ext); | |
2753 | osc_object_unlock(obj); | |
2754 | |
2755 | rc = osc_extent_wait(env, ext, OES_INV); | |
2756 | if (result == 0) | |
2757 | result = rc; | |
2758 | osc_extent_put(env, ext); | |
2759 | goto again; | |
2760 | } | |
2761 | osc_object_unlock(obj); | |
2762 | |
2763 | OSC_IO_DEBUG(obj, "sync file range.\n"); | |
0a3bdb00 | 2764 | return result; |
d7e09d03 PT |
2765 | } |
2766 | ||
2767 | /** | |
2768 | * Called to write out a range of osc object. | |
2769 | * | |
 * @start, @end: inclusive page index range; extents that start after @end
 * are not touched;
2770 | * @hp : should be set if this is caused by lock cancel; | |
2771 | * @discard: is set if dirty pages should be dropped - file will be deleted or | |
2772 | * truncated, this implies there are no partially discarded extents. | |
2773 | * | |
 * Every extent visited gets oe_fsync_wait set. OES_CACHE extents are either
 * promoted to the hp/urgent list (!@discard) or pulled onto a private list
 * and finished without being written (@discard). When @hp or @discard is
 * set, the function also waits for the range via osc_cache_wait_range().
 *
2774 | * Return how many pages will be issued, or error code if error occurred. | |
 * The page count is the sum of oe_nr_pages over the OES_CACHE extents that
 * were visited, including discarded ones. It is replaced by a negative
 * value if osc_extent_make_ready() or osc_cache_wait_range() fails.
2775 | */ | |
2776 | int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, | |
2777 | pgoff_t start, pgoff_t end, int hp, int discard) | |
2778 | { | |
2779 | struct osc_extent *ext; | |
2780 | LIST_HEAD(discard_list); | |
2781 | bool unplug = false; | |
2782 | int result = 0; | |
d7e09d03 PT |
2783 | |
2784 | osc_object_lock(obj); | |
2785 | ext = osc_extent_search(obj, start); | |
2786 | if (ext == NULL) | |
2787 | ext = first_extent(obj); | |
2788 | else if (ext->oe_end < start) | |
2789 | ext = next_extent(ext); | |
2790 | while (ext != NULL) { | |
2791 | if (ext->oe_start > end) | |
2792 | break; | |
2793 | |
/* flag the extent so that osc_cache_wait_range() will wait for it */
2794 | ext->oe_fsync_wait = 1; | |
2795 | switch (ext->oe_state) { | |
2796 | case OES_CACHE: | |
2797 | result += ext->oe_nr_pages; | |
2798 | if (!discard) { | |
2799 | struct list_head *list = NULL; | |
2800 | if (hp) { | |
2801 | EASSERT(!ext->oe_hp, ext); | |
2802 | ext->oe_hp = 1; | |
2803 | list = &obj->oo_hp_exts; | |
2804 | } else if (!ext->oe_urgent) { | |
2805 | ext->oe_urgent = 1; | |
2806 | list = &obj->oo_urgent_exts; | |
2807 | } | |
/* list stays NULL when the extent was already urgent and !hp; it is
 * then left on whatever list it is on */
2808 | if (list != NULL) | |
2809 | list_move_tail(&ext->oe_link, list); | |
2810 | unplug = true; | |
2811 | } else { | |
2812 | /* the only discarder is lock cancelling, so | |
2813 | * [start, end] must contain this extent */ | |
2814 | EASSERT(ext->oe_start >= start && | |
2815 | ext->oe_max_end <= end, ext); | |
2816 | osc_extent_state_set(ext, OES_LOCKING); | |
2817 | ext->oe_owner = current; | |
2818 | list_move_tail(&ext->oe_link, | |
2819 | &discard_list); | |
2820 | osc_update_pending(obj, OBD_BRW_WRITE, | |
2821 | -ext->oe_nr_pages); | |
2822 | } | |
2823 | break; | |
2824 | case OES_ACTIVE: | |
2825 | /* It's pretty bad to wait for ACTIVE extents, because | |
2826 | * we don't know how long we will wait for it to be | |
2827 | * flushed since it may be blocked at awaiting more | |
2828 | * grants. We do this for the correctness of fsync. */ | |
2829 | LASSERT(hp == 0 && discard == 0); | |
2830 | ext->oe_urgent = 1; | |
2831 | break; | |
2832 | case OES_TRUNC: | |
2833 | /* this extent is being truncated, can't do anything | |
2834 | * for it now. it will be set to urgent after truncate | |
2835 | * is finished in osc_cache_truncate_end(). */ | |
/* fall through */
2836 | default: | |
2837 | break; | |
2838 | } | |
2839 | ext = next_extent(ext); | |
2840 | } | |
2841 | osc_object_unlock(obj); | |
2842 | |
2843 | LASSERT(ergo(!discard, list_empty(&discard_list))); | |
2844 | if (!list_empty(&discard_list)) { | |
2845 | struct osc_extent *tmp; | |
2846 | int rc; | |
2847 | |
2848 | osc_list_maint(osc_cli(obj), obj); | |
2849 | list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { | |
2850 | list_del_init(&ext->oe_link); | |
2851 | EASSERT(ext->oe_state == OES_LOCKING, ext); | |
2852 | |
2853 | /* Discard caching pages. We don't actually write this | |
2854 | * extent out but we complete it as if we did. */ | |
2855 | rc = osc_extent_make_ready(env, ext); | |
2856 | if (unlikely(rc < 0)) { | |
2857 | OSC_EXTENT_DUMP(D_ERROR, ext, | |
2858 | "make_ready returned %d\n", rc); | |
/* the first error replaces the page count; later errors are only logged */
2859 | if (result >= 0) | |
2860 | result = rc; | |
2861 | } | |
2862 | |
2863 | /* finish the extent as if the pages were sent */ | |
2864 | osc_extent_finish(env, ext, 0, 0); | |
2865 | } | |
2866 | } | |
2867 | |
2868 | if (unplug) | |
2869 | osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND); | |
2870 | |
2871 | if (hp || discard) { | |
2872 | int rc; | |
2873 | rc = osc_cache_wait_range(env, obj, start, end); | |
/* a wait failure replaces the page count unless an error is already
 * recorded */
2874 | if (result >= 0 && rc < 0) | |
2875 | result = rc; | |
2876 | } | |
2877 | |
2878 | OSC_IO_DEBUG(obj, "cache page out.\n"); | |
0a3bdb00 | 2879 | return result; |
d7e09d03 PT |
2880 | } |
2881 | ||
2882 | /** @} osc */ |