]>
Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2012, Intel Corporation. | |
31 | * | |
32 | */ | |
33 | /* | |
34 | * This file is part of Lustre, http://www.lustre.org/ | |
35 | * Lustre is a trademark of Sun Microsystems, Inc. | |
36 | * | |
37 | * osc cache management. | |
38 | * | |
39 | * Author: Jinshan Xiong <jinshan.xiong@whamcloud.com> | |
40 | */ | |
41 | ||
42 | #define DEBUG_SUBSYSTEM S_OSC | |
43 | ||
44 | #include "osc_cl_internal.h" | |
45 | #include "osc_internal.h" | |
46 | ||
47 | static int extent_debug; /* set it to be true for more debug */ | |
48 | ||
49 | static void osc_update_pending(struct osc_object *obj, int cmd, int delta); | |
50 | static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, | |
51 | int state); | |
52 | static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, | |
53 | struct osc_async_page *oap, int sent, int rc); | |
54 | static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, | |
55 | int cmd); | |
56 | static int osc_refresh_count(const struct lu_env *env, | |
57 | struct osc_async_page *oap, int cmd); | |
58 | static int osc_io_unplug_async(const struct lu_env *env, | |
59 | struct client_obd *cli, struct osc_object *osc); | |
60 | static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, | |
61 | unsigned int lost_grant); | |
62 | ||
63 | static void osc_extent_tree_dump0(int level, struct osc_object *obj, | |
64 | const char *func, int line); | |
65 | #define osc_extent_tree_dump(lvl, obj) \ | |
66 | osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) | |
67 | ||
68 | /** \addtogroup osc | |
69 | * @{ | |
70 | */ | |
71 | ||
72 | /* ------------------ osc extent ------------------ */ | |
73 | static inline char *ext_flags(struct osc_extent *ext, char *flags) | |
74 | { | |
75 | char *buf = flags; | |
76 | *buf++ = ext->oe_rw ? 'r' : 'w'; | |
77 | if (ext->oe_intree) | |
78 | *buf++ = 'i'; | |
79 | if (ext->oe_srvlock) | |
80 | *buf++ = 's'; | |
81 | if (ext->oe_hp) | |
82 | *buf++ = 'h'; | |
83 | if (ext->oe_urgent) | |
84 | *buf++ = 'u'; | |
85 | if (ext->oe_memalloc) | |
86 | *buf++ = 'm'; | |
87 | if (ext->oe_trunc_pending) | |
88 | *buf++ = 't'; | |
89 | if (ext->oe_fsync_wait) | |
90 | *buf++ = 'Y'; | |
91 | *buf = 0; | |
92 | return flags; | |
93 | } | |
94 | ||
/* Debug helper: '-' when @list is empty, '+' when it has entries. */
static inline char list_empty_marker(struct list_head *list)
{
	if (list_empty(list))
		return '-';

	return '+';
}
99 | ||
/* printf-style format, and the matching argument list, for an extent's page
 * range: [oe_start -> oe_end/oe_max_end]. */
#define EXTSTR "[%lu -> %lu/%lu]"
#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end

/*
 * Emit one CDEBUG() message at level @lvl describing @extent, followed by the
 * caller's own @fmt and arguments.
 *
 * Layout of the dumped fields:
 *   part 0: extent pointer and its page range (EXTSTR);
 *   part 1: refcount, user count, oe_link emptiness marker, state name,
 *           flag string (see ext_flags()), owning object;
 *   part 2: grants, page count, oe_pages emptiness marker, whether the wait
 *           queue has waiters, osc lock, oe_mppr, owner.
 *
 * @extent is evaluated exactly once.
 */
#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do {			      \
	struct osc_extent *__ext = (extent);				      \
	const char *__str[] = OES_STRINGS;				      \
	char __buf[16];							      \
									      \
	CDEBUG(lvl,							      \
		"extent %p@{" EXTSTR ", "				      \
		"[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt,	      \
		/* ----- extent part 0 ----- */				      \
		__ext, EXTPARA(__ext),					      \
		/* ----- part 1 ----- */				      \
		atomic_read(&__ext->oe_refc),				      \
		atomic_read(&__ext->oe_users),				      \
		list_empty_marker(&__ext->oe_link),			      \
		__str[__ext->oe_state], ext_flags(__ext, __buf),	      \
		__ext->oe_obj,						      \
		/* ----- part 2 ----- */				      \
		__ext->oe_grants, __ext->oe_nr_pages,			      \
		list_empty_marker(&__ext->oe_pages),			      \
		waitqueue_active(&__ext->oe_waitq) ? '+' : '-',		      \
		__ext->oe_osclock, __ext->oe_mppr, __ext->oe_owner,	      \
		/* ----- caller-supplied arguments for fmt ----- */	      \
		## __VA_ARGS__);					      \
} while (0)
127 | ||
/*
 * Extent-aware assertion. When @expr is false, dump @ext (with the caller's
 * @fmt message) and the whole extent tree of its object at D_ERROR, and only
 * then trip LASSERT(), so the log holds the context of the failure.
 * Note that @expr is evaluated a second time by the LASSERT().
 */
#undef EASSERTF
#define EASSERTF(expr, ext, fmt, args...) do {				\
	if (!(expr)) {							\
		OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args);		\
		osc_extent_tree_dump(D_ERROR, (ext)->oe_obj);		\
		LASSERT(expr);						\
	}								\
} while (0)

/* Same as EASSERTF() but without an extra message. */
#undef EASSERT
#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n")
139 | ||
140 | static inline struct osc_extent *rb_extent(struct rb_node *n) | |
141 | { | |
142 | if (n == NULL) | |
143 | return NULL; | |
144 | ||
145 | return container_of(n, struct osc_extent, oe_node); | |
146 | } | |
147 | ||
148 | static inline struct osc_extent *next_extent(struct osc_extent *ext) | |
149 | { | |
150 | if (ext == NULL) | |
151 | return NULL; | |
152 | ||
153 | LASSERT(ext->oe_intree); | |
154 | return rb_extent(rb_next(&ext->oe_node)); | |
155 | } | |
156 | ||
157 | static inline struct osc_extent *prev_extent(struct osc_extent *ext) | |
158 | { | |
159 | if (ext == NULL) | |
160 | return NULL; | |
161 | ||
162 | LASSERT(ext->oe_intree); | |
163 | return rb_extent(rb_prev(&ext->oe_node)); | |
164 | } | |
165 | ||
166 | static inline struct osc_extent *first_extent(struct osc_object *obj) | |
167 | { | |
168 | return rb_extent(rb_first(&obj->oo_root)); | |
169 | } | |
170 | ||
/*
 * Verify the internal consistency of @ext. The object must be locked by the
 * caller (an unlocked object is itself reported as failure 9).
 *
 * @func and @line identify the call site and are only used in the error
 * message.
 *
 * \retval 0   the extent looks sane
 * \retval >0  a code identifying the first check that failed; the extent is
 *             also dumped at D_ERROR in that case
 */
static int osc_extent_sanity_check0(struct osc_extent *ext,
				    const char *func, const int line)
{
	struct osc_object *obj = ext->oe_obj;
	struct osc_async_page *oap;
	int page_count;
	int rc = 0;

	if (!osc_object_is_locked(obj))
		GOTO(out, rc = 9);

	if (ext->oe_state >= OES_STATE_MAX)
		GOTO(out, rc = 10);

	if (atomic_read(&ext->oe_refc) <= 0)
		GOTO(out, rc = 20);

	/* every user also holds a reference, see osc_extent_hold() */
	if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users))
		GOTO(out, rc = 30);

	switch (ext->oe_state) {
	case OES_INV:
		/* an invalid extent must not own pages; nothing else is
		 * checked for it */
		if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages))
			GOTO(out, rc = 35);
		GOTO(out, rc = 0);
		break;
	case OES_ACTIVE:
		if (atomic_read(&ext->oe_users) == 0)
			GOTO(out, rc = 40);
		if (ext->oe_hp)
			GOTO(out, rc = 50);
		if (ext->oe_fsync_wait && !ext->oe_urgent)
			GOTO(out, rc = 55);
		break;
	case OES_CACHE:
		if (ext->oe_grants == 0)
			GOTO(out, rc = 60);
		if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp)
			GOTO(out, rc = 65);
		/* fallthrough: a cached extent must have no users either */
	default:
		if (atomic_read(&ext->oe_users) > 0)
			GOTO(out, rc = 70);
	}

	if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start)
		GOTO(out, rc = 80);

	if (ext->oe_osclock == NULL && ext->oe_grants > 0)
		GOTO(out, rc = 90);

	if (ext->oe_osclock) {
		struct cl_lock_descr *descr;
		descr = &ext->oe_osclock->cll_descr;
		/* the lock must cover the extent up to oe_max_end */
		if (!(descr->cld_start <= ext->oe_start &&
		      descr->cld_end >= ext->oe_max_end))
			GOTO(out, rc = 100);
	}

	if (ext->oe_nr_pages > ext->oe_mppr)
		GOTO(out, rc = 105);

	/* Do not verify page list if extent is in RPC. This is because an
	 * in-RPC extent is supposed to be exclusively accessible w/o lock. */
	if (ext->oe_state > OES_CACHE)
		GOTO(out, rc = 0);

	/* walking the page list is only done when extra debugging is on */
	if (!extent_debug)
		GOTO(out, rc = 0);

	page_count = 0;
	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
		pgoff_t index = oap2cl_page(oap)->cp_index;
		++page_count;
		if (index > ext->oe_end || index < ext->oe_start)
			GOTO(out, rc = 110);
	}
	if (page_count != ext->oe_nr_pages)
		GOTO(out, rc = 120);

out:
	if (rc != 0)
		OSC_EXTENT_DUMP(D_ERROR, ext,
				"%s:%d sanity check %p failed with rc = %d\n",
				func, line, ext, rc);
	return rc;
}
258 | ||
/* Sanity check @ext; the caller already holds the object lock. */
#define sanity_check_nolock(ext) \
	osc_extent_sanity_check0(ext, __func__, __LINE__)

/* Sanity check @ext, taking and dropping the object lock around the check.
 * Evaluates to the result of osc_extent_sanity_check0(). */
#define sanity_check(ext) ({						\
	int __res;							\
	osc_object_lock((ext)->oe_obj);					\
	__res = sanity_check_nolock(ext);				\
	osc_object_unlock((ext)->oe_obj);				\
	__res;								\
})
269 | ||
270 | ||
271 | /** | |
272 | * sanity check - to make sure there is no overlapped extent in the tree. | |
273 | */ | |
274 | static int osc_extent_is_overlapped(struct osc_object *obj, | |
275 | struct osc_extent *ext) | |
276 | { | |
277 | struct osc_extent *tmp; | |
278 | ||
279 | LASSERT(osc_object_is_locked(obj)); | |
280 | ||
281 | if (!extent_debug) | |
282 | return 0; | |
283 | ||
284 | for (tmp = first_extent(obj); tmp != NULL; tmp = next_extent(tmp)) { | |
285 | if (tmp == ext) | |
286 | continue; | |
287 | if (tmp->oe_end >= ext->oe_start && | |
288 | tmp->oe_start <= ext->oe_end) | |
289 | return 1; | |
290 | } | |
291 | return 0; | |
292 | } | |
293 | ||
/*
 * Move @ext to @state and wake everybody sleeping on its wait queue (see
 * osc_extent_wait()). The object lock must be held. No check is made that the
 * transition itself is legal.
 */
static void osc_extent_state_set(struct osc_extent *ext, int state)
{
	LASSERT(osc_object_is_locked(ext->oe_obj));
	LASSERT(state >= OES_INV && state < OES_STATE_MAX);

	/* Never try to sanity check a state changing extent :-) */
	/* LASSERT(sanity_check_nolock(ext) == 0); */

	/* TODO: validate the state machine */
	ext->oe_state = state;
	wake_up_all(&ext->oe_waitq);
}
306 | ||
307 | static struct osc_extent *osc_extent_alloc(struct osc_object *obj) | |
308 | { | |
309 | struct osc_extent *ext; | |
310 | ||
311 | OBD_SLAB_ALLOC_PTR_GFP(ext, osc_extent_kmem, GFP_IOFS); | |
312 | if (ext == NULL) | |
313 | return NULL; | |
314 | ||
315 | RB_CLEAR_NODE(&ext->oe_node); | |
316 | ext->oe_obj = obj; | |
317 | atomic_set(&ext->oe_refc, 1); | |
318 | atomic_set(&ext->oe_users, 0); | |
319 | INIT_LIST_HEAD(&ext->oe_link); | |
320 | ext->oe_state = OES_INV; | |
321 | INIT_LIST_HEAD(&ext->oe_pages); | |
322 | init_waitqueue_head(&ext->oe_waitq); | |
323 | ext->oe_osclock = NULL; | |
324 | ||
325 | return ext; | |
326 | } | |
327 | ||
/* Return @ext to the osc_extent_kmem slab. Called from osc_extent_put() once
 * the last reference is gone. */
static void osc_extent_free(struct osc_extent *ext)
{
	OBD_SLAB_FREE_PTR(ext, osc_extent_kmem);
}
332 | ||
/* Take one more reference on @ext and return it, so the call can be used
 * directly in an assignment or return statement. */
static struct osc_extent *osc_extent_get(struct osc_extent *ext)
{
	LASSERT(atomic_read(&ext->oe_refc) >= 0);
	atomic_inc(&ext->oe_refc);
	return ext;
}
339 | ||
340 | static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) | |
341 | { | |
342 | LASSERT(atomic_read(&ext->oe_refc) > 0); | |
343 | if (atomic_dec_and_test(&ext->oe_refc)) { | |
344 | LASSERT(list_empty(&ext->oe_link)); | |
345 | LASSERT(atomic_read(&ext->oe_users) == 0); | |
346 | LASSERT(ext->oe_state == OES_INV); | |
347 | LASSERT(!ext->oe_intree); | |
348 | ||
349 | if (ext->oe_osclock) { | |
350 | cl_lock_put(env, ext->oe_osclock); | |
351 | ext->oe_osclock = NULL; | |
352 | } | |
353 | osc_extent_free(ext); | |
354 | } | |
355 | } | |
356 | ||
/**
 * osc_extent_put_trust() is a special version of osc_extent_put() for callers
 * that know they are not dropping the last reference, so nothing is ever
 * freed here and no lu_env is needed. This addresses call sites that have no
 * lu_env at hand ;-).
 *
 * The object lock must be held, and more than one reference must exist.
 */
static void osc_extent_put_trust(struct osc_extent *ext)
{
	LASSERT(atomic_read(&ext->oe_refc) > 1);
	LASSERT(osc_object_is_locked(ext->oe_obj));
	atomic_dec(&ext->oe_refc);
}
368 | ||
369 | /** | |
370 | * Return the extent which includes pgoff @index, or return the greatest | |
371 | * previous extent in the tree. | |
372 | */ | |
373 | static struct osc_extent *osc_extent_search(struct osc_object *obj, | |
374 | pgoff_t index) | |
375 | { | |
376 | struct rb_node *n = obj->oo_root.rb_node; | |
377 | struct osc_extent *tmp, *p = NULL; | |
378 | ||
379 | LASSERT(osc_object_is_locked(obj)); | |
380 | while (n != NULL) { | |
381 | tmp = rb_extent(n); | |
382 | if (index < tmp->oe_start) { | |
383 | n = n->rb_left; | |
384 | } else if (index > tmp->oe_end) { | |
385 | p = rb_extent(n); | |
386 | n = n->rb_right; | |
387 | } else { | |
388 | return tmp; | |
389 | } | |
390 | } | |
391 | return p; | |
392 | } | |
393 | ||
394 | /* | |
395 | * Return the extent covering @index, otherwise return NULL. | |
396 | * caller must have held object lock. | |
397 | */ | |
398 | static struct osc_extent *osc_extent_lookup(struct osc_object *obj, | |
399 | pgoff_t index) | |
400 | { | |
401 | struct osc_extent *ext; | |
402 | ||
403 | ext = osc_extent_search(obj, index); | |
404 | if (ext != NULL && ext->oe_start <= index && index <= ext->oe_end) | |
405 | return osc_extent_get(ext); | |
406 | return NULL; | |
407 | } | |
408 | ||
409 | /* caller must have held object lock. */ | |
410 | static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) | |
411 | { | |
412 | struct rb_node **n = &obj->oo_root.rb_node; | |
413 | struct rb_node *parent = NULL; | |
414 | struct osc_extent *tmp; | |
415 | ||
416 | LASSERT(ext->oe_intree == 0); | |
417 | LASSERT(ext->oe_obj == obj); | |
418 | LASSERT(osc_object_is_locked(obj)); | |
419 | while (*n != NULL) { | |
420 | tmp = rb_extent(*n); | |
421 | parent = *n; | |
422 | ||
423 | if (ext->oe_end < tmp->oe_start) | |
424 | n = &(*n)->rb_left; | |
425 | else if (ext->oe_start > tmp->oe_end) | |
426 | n = &(*n)->rb_right; | |
427 | else | |
428 | EASSERTF(0, tmp, EXTSTR, EXTPARA(ext)); | |
429 | } | |
430 | rb_link_node(&ext->oe_node, parent, n); | |
431 | rb_insert_color(&ext->oe_node, &obj->oo_root); | |
432 | osc_extent_get(ext); | |
433 | ext->oe_intree = 1; | |
434 | } | |
435 | ||
436 | /* caller must have held object lock. */ | |
437 | static void osc_extent_erase(struct osc_extent *ext) | |
438 | { | |
439 | struct osc_object *obj = ext->oe_obj; | |
440 | LASSERT(osc_object_is_locked(obj)); | |
441 | if (ext->oe_intree) { | |
442 | rb_erase(&ext->oe_node, &obj->oo_root); | |
443 | ext->oe_intree = 0; | |
444 | /* rbtree held a refcount */ | |
445 | osc_extent_put_trust(ext); | |
446 | } | |
447 | } | |
448 | ||
/*
 * Register one more user of @ext, which must be in state OES_ACTIVE or
 * OES_CACHE. A cached extent becomes active and its pages are subtracted from
 * the object's pending write count. The extent is taken off whatever list
 * oe_link is on, and a reference is taken for the new user.
 *
 * The object lock must be held. Undone by osc_extent_release().
 *
 * \retval @ext
 */
static struct osc_extent *osc_extent_hold(struct osc_extent *ext)
{
	struct osc_object *obj = ext->oe_obj;

	LASSERT(osc_object_is_locked(obj));
	LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE);
	if (ext->oe_state == OES_CACHE) {
		osc_extent_state_set(ext, OES_ACTIVE);
		osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages);
	}
	atomic_inc(&ext->oe_users);
	list_del_init(&ext->oe_link);
	return osc_extent_get(ext);
}
463 | ||
/*
 * Take an extent that no longer owns pages out of service: remove it from the
 * tree and from its oe_link list, then mark it OES_INV (which also wakes
 * waiters). The object lock must be held and the caller must own a reference
 * besides the tree's one.
 */
static void __osc_extent_remove(struct osc_extent *ext)
{
	LASSERT(osc_object_is_locked(ext->oe_obj));
	LASSERT(list_empty(&ext->oe_pages));
	osc_extent_erase(ext);
	list_del_init(&ext->oe_link);
	osc_extent_state_set(ext, OES_INV);
	OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n");
}
473 | ||
/* Locked wrapper around __osc_extent_remove(): takes and releases the object
 * lock of @ext's object around the removal. */
static void osc_extent_remove(struct osc_extent *ext)
{
	struct osc_object *obj = ext->oe_obj;

	osc_object_lock(obj);
	__osc_extent_remove(ext);
	osc_object_unlock(obj);
}
482 | ||
/**
 * This function is used to merge extents to get better performance. It checks
 * if @cur and @victim are contiguous at chunk level and, if so, folds
 * @victim's range, grants and pages into @cur and removes @victim.
 *
 * @cur must be in state OES_CACHE and the object lock must be held.
 *
 * \retval 0        @victim was merged into @cur
 * \retval -EINVAL  @victim is NULL
 * \retval -EBUSY   @victim is not in OES_CACHE or has an fsync waiter
 * \retval -ERANGE  the extents have different oe_max_end or are not adjacent
 *                  at chunk level
 */
static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
			    struct osc_extent *victim)
{
	struct osc_object *obj = cur->oe_obj;
	pgoff_t chunk_start;
	pgoff_t chunk_end;
	int ppc_bits;

	LASSERT(cur->oe_state == OES_CACHE);
	LASSERT(osc_object_is_locked(obj));
	if (victim == NULL)
		return -EINVAL;

	if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait)
		return -EBUSY;

	if (cur->oe_max_end != victim->oe_max_end)
		return -ERANGE;

	LASSERT(cur->oe_osclock == victim->oe_osclock);
	/* ppc_bits: log2 of the number of pages per chunk */
	ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
	chunk_start = cur->oe_start >> ppc_bits;
	chunk_end = cur->oe_end >> ppc_bits;
	/* give up unless victim ends in the chunk right before cur's first
	 * chunk, or starts in the chunk right after cur's last chunk */
	if (chunk_start != (victim->oe_end >> ppc_bits) + 1 &&
	    chunk_end + 1 != victim->oe_start >> ppc_bits)
		return -ERANGE;

	OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur);

	cur->oe_start = min(cur->oe_start, victim->oe_start);
	cur->oe_end = max(cur->oe_end, victim->oe_end);
	cur->oe_grants += victim->oe_grants;
	cur->oe_nr_pages += victim->oe_nr_pages;
	/* only the following bits are needed to merge */
	cur->oe_urgent |= victim->oe_urgent;
	cur->oe_memalloc |= victim->oe_memalloc;
	list_splice_init(&victim->oe_pages, &cur->oe_pages);
	list_del_init(&victim->oe_link);
	victim->oe_nr_pages = 0;

	/* hold a temporary reference: removing victim from the tree drops
	 * the tree's reference via osc_extent_put_trust(), which requires
	 * that it is not the last one; the final put below may free it */
	osc_extent_get(victim);
	__osc_extent_remove(victim);
	osc_extent_put(env, victim);

	OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim);
	return 0;
}
534 | ||
/**
 * Drop user count of osc_extent, and unplug IO asynchronously.
 *
 * When the last user goes away the extent leaves OES_ACTIVE: it becomes
 * OES_TRUNC if a truncate is pending, otherwise OES_CACHE, in which case its
 * pages are added back to the pending write count and a merge with both
 * neighbours is attempted. The reference taken by osc_extent_hold() is
 * dropped in every case.
 *
 * \retval 0 always
 */
int osc_extent_release(const struct lu_env *env, struct osc_extent *ext)
{
	struct osc_object *obj = ext->oe_obj;
	int rc = 0;
	ENTRY;

	LASSERT(atomic_read(&ext->oe_users) > 0);
	LASSERT(sanity_check(ext) == 0);
	LASSERT(ext->oe_grants > 0);

	/* the object lock is taken only if this was the last user */
	if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) {
		LASSERT(ext->oe_state == OES_ACTIVE);
		if (ext->oe_trunc_pending) {
			/* a truncate process is waiting for this extent.
			 * This may happen due to a race, check
			 * osc_cache_truncate_start(). */
			osc_extent_state_set(ext, OES_TRUNC);
			ext->oe_trunc_pending = 0;
		} else {
			osc_extent_state_set(ext, OES_CACHE);
			osc_update_pending(obj, OBD_BRW_WRITE,
					   ext->oe_nr_pages);

			/* try to merge the previous and next extent. */
			osc_extent_merge(env, ext, prev_extent(ext));
			osc_extent_merge(env, ext, next_extent(ext));

			if (ext->oe_urgent)
				list_move_tail(&ext->oe_link,
					       &obj->oo_urgent_exts);
		}
		osc_object_unlock(obj);

		osc_io_unplug_async(env, osc_cli(obj), obj);
	}
	/* drop the reference taken in osc_extent_hold() */
	osc_extent_put(env, ext);
	RETURN(rc);
}
576 | ||
577 | static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) | |
578 | { | |
579 | return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); | |
580 | } | |
581 | ||
/**
 * Find or create an extent which includes @index, core function to manage
 * extent tree.
 *
 * A candidate one-chunk extent @cur is built around @index first. The tree is
 * then scanned for an extent under the same lock that either already contains
 * the candidate or is adjacent to it at chunk level and can be grown to cover
 * it. If none is usable, the candidate itself is inserted. If a containing
 * extent is busy (beyond OES_CACHE, or has an fsync waiter), this function
 * waits for it to become OES_INV and restarts the scan.
 *
 * \param grants  [in, out] grant available to the caller, at least one chunk
 *                plus the extent tax on entry. It is reduced by one chunk
 *                when an extent is grown, by one chunk plus the extent tax
 *                when a new extent is created, and credited one extent tax
 *                when growing allowed a merge with the following extent.
 *
 * \retval the extent, held on behalf of the caller (see osc_extent_hold())
 * \retval ERR_PTR(-ENOMEM) the candidate extent could not be allocated
 * \retval ERR_PTR(rc)      waiting for a conflicting extent failed with rc
 */
struct osc_extent *osc_extent_find(const struct lu_env *env,
				   struct osc_object *obj, pgoff_t index,
				   int *grants)

{
	struct client_obd *cli = osc_cli(obj);
	struct cl_lock *lock;
	struct osc_extent *cur;
	struct osc_extent *ext;
	struct osc_extent *conflict = NULL;
	struct osc_extent *found = NULL;
	pgoff_t chunk;
	pgoff_t max_end;
	int max_pages; /* max_pages_per_rpc */
	int chunksize;
	int ppc_bits; /* pages per chunk bits */
	int chunk_mask;
	int rc;
	ENTRY;

	cur = osc_extent_alloc(obj);
	if (cur == NULL)
		RETURN(ERR_PTR(-ENOMEM));

	lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0);
	LASSERT(lock != NULL);
	LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);

	LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
	ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
	chunk_mask = ~((1 << ppc_bits) - 1);
	chunksize = 1 << cli->cl_chunkbits;
	chunk = index >> ppc_bits;

	/* align end to rpc edge, rpc size may not be a power 2 integer. */
	max_pages = cli->cl_max_pages_per_rpc;
	LASSERT((max_pages & ~chunk_mask) == 0);
	max_end = index - (index % max_pages) + max_pages - 1;
	max_end = min_t(pgoff_t, max_end, lock->cll_descr.cld_end);

	/* initialize new extent by parameters so far: the chunk containing
	 * @index, clipped to the lock's start and to max_end */
	cur->oe_max_end = max_end;
	cur->oe_start = index & chunk_mask;
	cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
	if (cur->oe_start < lock->cll_descr.cld_start)
		cur->oe_start = lock->cll_descr.cld_start;
	if (cur->oe_end > max_end)
		cur->oe_end = max_end;
	/* cur now owns the lock reference; it is dropped when cur is freed,
	 * see osc_extent_put() */
	cur->oe_osclock = lock;
	cur->oe_grants = 0;
	cur->oe_mppr = max_pages;

	/* grants has been allocated by caller */
	LASSERTF(*grants >= chunksize + cli->cl_extent_tax,
		 "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax);
	LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR, EXTPARA(cur));

restart:
	osc_object_lock(obj);
	/* start from the extent containing or preceding cur, or from the
	 * beginning of the tree if there is no such extent */
	ext = osc_extent_search(obj, cur->oe_start);
	if (ext == NULL)
		ext = first_extent(obj);
	while (ext != NULL) {
		loff_t ext_chk_start = ext->oe_start >> ppc_bits;
		loff_t ext_chk_end = ext->oe_end >> ppc_bits;

		LASSERT(sanity_check_nolock(ext) == 0);
		if (chunk > ext_chk_end + 1)
			break;

		/* if covering by different locks, no chance to match */
		if (lock != ext->oe_osclock) {
			EASSERTF(!overlapped(ext, cur), ext,
				 EXTSTR, EXTPARA(cur));

			ext = next_extent(ext);
			continue;
		}

		/* discontiguous chunks? */
		if (chunk + 1 < ext_chk_start) {
			ext = next_extent(ext);
			continue;
		}

		/* ok, from now on, ext and cur have these attrs:
		 * 1. covered by the same lock
		 * 2. contiguous at chunk level or overlapping. */

		if (overlapped(ext, cur)) {
			/* cur is the minimum unit, so overlapping means
			 * full contain. */
			EASSERTF((ext->oe_start <= cur->oe_start &&
				  ext->oe_end >= cur->oe_end),
				 ext, EXTSTR, EXTPARA(cur));

			if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) {
				/* for simplicity, we wait for this extent to
				 * finish before going forward. */
				conflict = osc_extent_get(ext);
				break;
			}

			found = osc_extent_hold(ext);
			break;
		}

		/* non-overlapped extent */
		if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
			/* we can't do anything for a non OES_CACHE extent, or
			 * if there is someone waiting for this extent to be
			 * flushed, try next one. */
			ext = next_extent(ext);
			continue;
		}

		/* check if they belong to the same rpc slot before trying to
		 * merge. the extents are not overlapped and contiguous at
		 * chunk level to get here. */
		if (ext->oe_max_end != max_end) {
			/* if they don't belong to the same RPC slot or
			 * max_pages_per_rpc has ever changed, do not merge. */
			ext = next_extent(ext);
			continue;
		}

		/* it's required that an extent must be contiguous at chunk
		 * level so that we know the whole extent is covered by grant
		 * (the pages in the extent are NOT required to be contiguous).
		 * Otherwise, it would be too difficult to know which
		 * chunks have grants allocated. */

		/* try to do front merge - extend ext's start */
		if (chunk + 1 == ext_chk_start) {
			/* ext must be chunk size aligned */
			EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);

			/* pull ext's start back to cover cur */
			ext->oe_start = cur->oe_start;
			ext->oe_grants += chunksize;
			*grants -= chunksize;

			found = osc_extent_hold(ext);
		} else if (chunk == ext_chk_end + 1) {
			/* rear merge */
			ext->oe_end = cur->oe_end;
			ext->oe_grants += chunksize;
			*grants -= chunksize;

			/* try to merge with the next one because we just fill
			 * in a gap */
			if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
				/* we can save extent tax from next extent */
				*grants += cli->cl_extent_tax;

			found = osc_extent_hold(ext);
		}
		if (found != NULL)
			break;

		ext = next_extent(ext);
	}

	osc_extent_tree_dump(D_CACHE, obj);
	if (found != NULL) {
		LASSERT(conflict == NULL);
		if (!IS_ERR(found)) {
			LASSERT(found->oe_osclock == cur->oe_osclock);
			OSC_EXTENT_DUMP(D_CACHE, found,
					"found caching ext for %lu.\n", index);
		}
	} else if (conflict == NULL) {
		/* create a new extent */
		EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
		cur->oe_grants = chunksize + cli->cl_extent_tax;
		*grants -= cur->oe_grants;
		LASSERT(*grants >= 0);

		cur->oe_state = OES_CACHE;
		found = osc_extent_hold(cur);
		osc_extent_insert(obj, cur);
		OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
				index, lock->cll_descr.cld_end);
	}
	osc_object_unlock(obj);

	if (conflict != NULL) {
		LASSERT(found == NULL);

		/* waiting for IO to finish. Please notice that it's impossible
		 * to be an OES_TRUNC extent. */
		rc = osc_extent_wait(env, conflict, OES_INV);
		osc_extent_put(env, conflict);
		conflict = NULL;
		if (rc < 0)
			GOTO(out, found = ERR_PTR(rc));

		goto restart;
	}
	EXIT;

out:
	/* drop the allocation reference on cur; if cur was inserted above it
	 * stays alive through the references taken by hold and insert */
	osc_extent_put(env, cur);
	LASSERT(*grants >= 0);
	return found;
}
792 | ||
793 | /** | |
794 | * Called when IO is finished to an extent. | |
795 | */ | |
796 | int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, | |
797 | int sent, int rc) | |
798 | { | |
799 | struct client_obd *cli = osc_cli(ext->oe_obj); | |
800 | struct osc_async_page *oap; | |
801 | struct osc_async_page *tmp; | |
802 | int nr_pages = ext->oe_nr_pages; | |
803 | int lost_grant = 0; | |
804 | int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ? : 4096; | |
805 | __u64 last_off = 0; | |
806 | int last_count = -1; | |
807 | ENTRY; | |
808 | ||
809 | OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n"); | |
810 | ||
811 | ext->oe_rc = rc ?: ext->oe_nr_pages; | |
812 | EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext); | |
813 | list_for_each_entry_safe(oap, tmp, &ext->oe_pages, | |
814 | oap_pending_item) { | |
815 | list_del_init(&oap->oap_rpc_item); | |
816 | list_del_init(&oap->oap_pending_item); | |
817 | if (last_off <= oap->oap_obj_off) { | |
818 | last_off = oap->oap_obj_off; | |
819 | last_count = oap->oap_count; | |
820 | } | |
821 | ||
822 | --ext->oe_nr_pages; | |
823 | osc_ap_completion(env, cli, oap, sent, rc); | |
824 | } | |
825 | EASSERT(ext->oe_nr_pages == 0, ext); | |
826 | ||
827 | if (!sent) { | |
828 | lost_grant = ext->oe_grants; | |
829 | } else if (blocksize < PAGE_CACHE_SIZE && | |
830 | last_count != PAGE_CACHE_SIZE) { | |
831 | /* For short writes we shouldn't count parts of pages that | |
832 | * span a whole chunk on the OST side, or our accounting goes | |
833 | * wrong. Should match the code in filter_grant_check. */ | |
834 | int offset = oap->oap_page_off & ~CFS_PAGE_MASK; | |
835 | int count = oap->oap_count + (offset & (blocksize - 1)); | |
836 | int end = (offset + oap->oap_count) & (blocksize - 1); | |
837 | if (end) | |
838 | count += blocksize - end; | |
839 | ||
840 | lost_grant = PAGE_CACHE_SIZE - count; | |
841 | } | |
842 | if (ext->oe_grants > 0) | |
843 | osc_free_grant(cli, nr_pages, lost_grant); | |
844 | ||
845 | osc_extent_remove(ext); | |
846 | /* put the refcount for RPC */ | |
847 | osc_extent_put(env, ext); | |
848 | RETURN(0); | |
849 | } | |
850 | ||
851 | static int extent_wait_cb(struct osc_extent *ext, int state) | |
852 | { | |
853 | int ret; | |
854 | ||
855 | osc_object_lock(ext->oe_obj); | |
856 | ret = ext->oe_state == state; | |
857 | osc_object_unlock(ext->oe_obj); | |
858 | ||
859 | return ret; | |
860 | } | |
861 | ||
/**
 * Wait for the extent's state to become @state.
 *
 * When waiting for OES_INV (i.e. for the extent to be written out and
 * destroyed) an extent that is neither urgent nor high priority is marked
 * urgent first; a cached one is additionally held and released once so that
 * the release path queues it on the urgent list and unplugs IO.
 *
 * The first wait times out after 600 seconds; an error is logged and the
 * wait is then repeated without a timeout.
 *
 * \retval 0   the extent reached @state and recorded no error
 * \retval <0  the wait failed, or the extent's own oe_rc if that is negative
 */
static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
			   int state)
{
	struct osc_object *obj = ext->oe_obj;
	struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(600), NULL,
						  LWI_ON_SIGNAL_NOOP, NULL);
	int rc = 0;
	ENTRY;

	osc_object_lock(obj);
	LASSERT(sanity_check_nolock(ext) == 0);
	/* `Kick' this extent only if the caller is waiting for it to be
	 * written out. */
	if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp) {
		if (ext->oe_state == OES_ACTIVE) {
			ext->oe_urgent = 1;
		} else if (ext->oe_state == OES_CACHE) {
			ext->oe_urgent = 1;
			osc_extent_hold(ext);
			/* rc is used as a flag here: release after unlock */
			rc = 1;
		}
	}
	osc_object_unlock(obj);
	if (rc == 1)
		osc_extent_release(env, ext);

	/* wait for the extent until its state becomes @state */
	rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state), &lwi);
	if (rc == -ETIMEDOUT) {
		OSC_EXTENT_DUMP(D_ERROR, ext,
			"%s: wait ext to %d timedout, recovery in progress?\n",
			osc_export(obj)->exp_obd->obd_name, state);

		/* keep waiting, this time without a timeout */
		lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
		rc = l_wait_event(ext->oe_waitq, extent_wait_cb(ext, state),
				  &lwi);
	}
	if (rc == 0 && ext->oe_rc < 0)
		rc = ext->oe_rc;
	RETURN(rc);
}
906 | ||
/**
 * Discard the pages of @ext that lie beyond @trunc_index and give back the
 * grants that are no longer needed.
 *
 * Pages with an index below @trunc_index are always kept. The page at
 * @trunc_index itself is kept only when @partial is true (partial truncate
 * of that page); otherwise it is discarded as well.
 *
 * The extent must be in OES_TRUNC state and must not be urgent (asserted).
 *
 * \retval 0         success
 * \retval negative  cl_io_init() failed; no page was touched
 */
static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
			       bool partial)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct cl_io *io;
	struct osc_object *obj = ext->oe_obj;
	struct client_obd *cli = osc_cli(obj);
	struct osc_async_page *oap;
	struct osc_async_page *tmp;
	/* pages kept in the chunk that contains trunc_index */
	int pages_in_chunk = 0;
	/* log2 of the number of pages per chunk */
	int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
	__u64 trunc_chunk = trunc_index >> ppc_bits;
	int grants = 0;
	/* number of pages discarded by this call */
	int nr_pages = 0;
	int rc = 0;
	ENTRY;

	LASSERT(sanity_check(ext) == 0);
	LASSERT(ext->oe_state == OES_TRUNC);
	LASSERT(!ext->oe_urgent);

	/* Request new lu_env.
	 * We can't use that env from osc_cache_truncate_start() because
	 * it's from lov_io_sub and not fully initialized. */
	/* NOTE(review): the result of cl_env_nested_get() is used without an
	 * error check - confirm it cannot fail in this context. */
	env = cl_env_nested_get(&nest);
	io = &osc_env_info(env)->oti_io;
	io->ci_obj = cl_object_top(osc2cl(obj));
	rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
	if (rc < 0)
		GOTO(out, rc);

	/* discard all pages with index greater than trunc_index */
	list_for_each_entry_safe(oap, tmp, &ext->oe_pages,
				 oap_pending_item) {
		struct cl_page *sub = oap2cl_page(oap);
		struct cl_page *page = cl_page_top(sub);

		LASSERT(list_empty(&oap->oap_rpc_item));

		/* only discard the pages with their index greater than
		 * trunc_index, and ... */
		if (sub->cp_index < trunc_index ||
		    (sub->cp_index == trunc_index && partial)) {
			/* accounting how many pages remaining in the chunk
			 * so that we can calculate grants correctly. */
			if (sub->cp_index >> ppc_bits == trunc_chunk)
				++pages_in_chunk;
			continue;
		}

		list_del_init(&oap->oap_pending_item);

		/* pin the page while it is unmapped and discarded */
		cl_page_get(page);
		lu_ref_add(&page->cp_reference, "truncate", current);

		if (cl_page_own(env, io, page) == 0) {
			cl_page_unmap(env, io, page);
			cl_page_discard(env, io, page);
			cl_page_disown(env, io, page);
		} else {
			/* failing to own the page is treated as fatal */
			LASSERT(page->cp_state == CPS_FREEING);
			LASSERT(0);
		}

		lu_ref_del(&page->cp_reference, "truncate", current);
		cl_page_put(env, page);

		--ext->oe_nr_pages;
		++nr_pages;
	}
	/* an extent that starts at or beyond the truncate point must have
	 * lost all of its pages */
	EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
		      ext->oe_nr_pages == 0),
		ext, "trunc_index %lu, partial %d\n", trunc_index, partial);

	osc_object_lock(obj);
	if (ext->oe_nr_pages == 0) {
		/* nothing left: every grant held by the extent is freed */
		LASSERT(pages_in_chunk == 0);
		grants = ext->oe_grants;
		ext->oe_grants = 0;
	} else { /* calculate how many grants we can free */
		int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
		pgoff_t last_index;


		/* if there is no pages in this chunk, we can also free grants
		 * for the last chunk */
		if (pages_in_chunk == 0) {
			/* if this is the 1st chunk and no pages in this chunk,
			 * ext->oe_nr_pages must be zero, so we should be in
			 * the other if-clause. */
			LASSERT(trunc_chunk > 0);
			--trunc_chunk;
			++chunks;
		}

		/* this is what we can free from this extent */
		grants = chunks << cli->cl_chunkbits;
		ext->oe_grants -= grants;
		/* shrink the extent to the end of the last chunk kept */
		last_index = ((trunc_chunk + 1) << ppc_bits) - 1;
		ext->oe_end = min(last_index, ext->oe_max_end);
		LASSERT(ext->oe_end >= ext->oe_start);
		LASSERT(ext->oe_grants > 0);
	}
	osc_object_unlock(obj);

	if (grants > 0 || nr_pages > 0)
		osc_free_grant(cli, nr_pages, grants);

out:
	/* also reached when cl_io_init() failed */
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);
	RETURN(rc);
}
1025 | ||
1026 | /** | |
1027 | * This function is used to make the extent prepared for transfer. | |
1028 | * A race with flusing page - ll_writepage() has to be handled cautiously. | |
1029 | */ | |
1030 | static int osc_extent_make_ready(const struct lu_env *env, | |
1031 | struct osc_extent *ext) | |
1032 | { | |
1033 | struct osc_async_page *oap; | |
1034 | struct osc_async_page *last = NULL; | |
1035 | struct osc_object *obj = ext->oe_obj; | |
1036 | int page_count = 0; | |
1037 | int rc; | |
1038 | ENTRY; | |
1039 | ||
1040 | /* we're going to grab page lock, so object lock must not be taken. */ | |
1041 | LASSERT(sanity_check(ext) == 0); | |
1042 | /* in locking state, any process should not touch this extent. */ | |
1043 | EASSERT(ext->oe_state == OES_LOCKING, ext); | |
1044 | EASSERT(ext->oe_owner != NULL, ext); | |
1045 | ||
1046 | OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n"); | |
1047 | ||
1048 | list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { | |
1049 | ++page_count; | |
1050 | if (last == NULL || last->oap_obj_off < oap->oap_obj_off) | |
1051 | last = oap; | |
1052 | ||
1053 | /* checking ASYNC_READY is race safe */ | |
1054 | if ((oap->oap_async_flags & ASYNC_READY) != 0) | |
1055 | continue; | |
1056 | ||
1057 | rc = osc_make_ready(env, oap, OBD_BRW_WRITE); | |
1058 | switch (rc) { | |
1059 | case 0: | |
1060 | spin_lock(&oap->oap_lock); | |
1061 | oap->oap_async_flags |= ASYNC_READY; | |
1062 | spin_unlock(&oap->oap_lock); | |
1063 | break; | |
1064 | case -EALREADY: | |
1065 | LASSERT((oap->oap_async_flags & ASYNC_READY) != 0); | |
1066 | break; | |
1067 | default: | |
1068 | LASSERTF(0, "unknown return code: %d\n", rc); | |
1069 | } | |
1070 | } | |
1071 | ||
1072 | LASSERT(page_count == ext->oe_nr_pages); | |
1073 | LASSERT(last != NULL); | |
1074 | /* the last page is the only one we need to refresh its count by | |
1075 | * the size of file. */ | |
1076 | if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) { | |
1077 | last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE); | |
1078 | LASSERT(last->oap_count > 0); | |
1079 | LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE); | |
1080 | last->oap_async_flags |= ASYNC_COUNT_STABLE; | |
1081 | } | |
1082 | ||
1083 | /* for the rest of pages, we don't need to call osf_refresh_count() | |
1084 | * because it's known they are not the last page */ | |
1085 | list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { | |
1086 | if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { | |
1087 | oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off; | |
1088 | oap->oap_async_flags |= ASYNC_COUNT_STABLE; | |
1089 | } | |
1090 | } | |
1091 | ||
1092 | osc_object_lock(obj); | |
1093 | osc_extent_state_set(ext, OES_RPC); | |
1094 | osc_object_unlock(obj); | |
1095 | /* get a refcount for RPC. */ | |
1096 | osc_extent_get(ext); | |
1097 | ||
1098 | RETURN(0); | |
1099 | } | |
1100 | ||
1101 | /** | |
1102 | * Quick and simple version of osc_extent_find(). This function is frequently | |
1103 | * called to expand the extent for the same IO. To expand the extent, the | |
1104 | * page index must be in the same or next chunk of ext->oe_end. | |
1105 | */ | |
1106 | static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants) | |
1107 | { | |
1108 | struct osc_object *obj = ext->oe_obj; | |
1109 | struct client_obd *cli = osc_cli(obj); | |
1110 | struct osc_extent *next; | |
1111 | int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT; | |
1112 | pgoff_t chunk = index >> ppc_bits; | |
1113 | pgoff_t end_chunk; | |
1114 | pgoff_t end_index; | |
1115 | int chunksize = 1 << cli->cl_chunkbits; | |
1116 | int rc = 0; | |
1117 | ENTRY; | |
1118 | ||
1119 | LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); | |
1120 | osc_object_lock(obj); | |
1121 | LASSERT(sanity_check_nolock(ext) == 0); | |
1122 | end_chunk = ext->oe_end >> ppc_bits; | |
1123 | if (chunk > end_chunk + 1) | |
1124 | GOTO(out, rc = -ERANGE); | |
1125 | ||
1126 | if (end_chunk >= chunk) | |
1127 | GOTO(out, rc = 0); | |
1128 | ||
1129 | LASSERT(end_chunk + 1 == chunk); | |
1130 | /* try to expand this extent to cover @index */ | |
1131 | end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); | |
1132 | ||
1133 | next = next_extent(ext); | |
1134 | if (next != NULL && next->oe_start <= end_index) | |
1135 | /* complex mode - overlapped with the next extent, | |
1136 | * this case will be handled by osc_extent_find() */ | |
1137 | GOTO(out, rc = -EAGAIN); | |
1138 | ||
1139 | ext->oe_end = end_index; | |
1140 | ext->oe_grants += chunksize; | |
1141 | *grants -= chunksize; | |
1142 | LASSERT(*grants >= 0); | |
1143 | EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, | |
1144 | "overlapped after expanding for %lu.\n", index); | |
1145 | EXIT; | |
1146 | ||
1147 | out: | |
1148 | osc_object_unlock(obj); | |
1149 | RETURN(rc); | |
1150 | } | |
1151 | ||
1152 | static void osc_extent_tree_dump0(int level, struct osc_object *obj, | |
1153 | const char *func, int line) | |
1154 | { | |
1155 | struct osc_extent *ext; | |
1156 | int cnt; | |
1157 | ||
1158 | CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", | |
1159 | obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); | |
1160 | ||
1161 | /* osc_object_lock(obj); */ | |
1162 | cnt = 1; | |
1163 | for (ext = first_extent(obj); ext != NULL; ext = next_extent(ext)) | |
1164 | OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); | |
1165 | ||
1166 | cnt = 1; | |
1167 | list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) | |
1168 | OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); | |
1169 | ||
1170 | cnt = 1; | |
1171 | list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) | |
1172 | OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); | |
1173 | ||
1174 | cnt = 1; | |
1175 | list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) | |
1176 | OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); | |
1177 | /* osc_object_unlock(obj); */ | |
1178 | } | |
1179 | ||
1180 | /* ------------------ osc extent end ------------------ */ | |
1181 | ||
1182 | static inline int osc_is_ready(struct osc_object *osc) | |
1183 | { | |
1184 | return !list_empty(&osc->oo_ready_item) || | |
1185 | !list_empty(&osc->oo_hp_ready_item); | |
1186 | } | |
1187 | ||
/*
 * Emit a D_CACHE debug line describing the IO state of object OSC, followed
 * by the caller's message STR (a printf-style format consuming 'args').
 *
 * The prefix prints, in order: the object pointer; whether it is ready plus
 * markers for its hp-ready and ready list items; the pending write count
 * plus markers for the hp and urgent extent lists; the pending read count
 * plus a marker for the reading extent list.
 *
 * OSC is evaluated several times, so it must not have side effects.
 */
#define OSC_IO_DEBUG(OSC, STR, args...)					       \
	CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR,    \
	       (OSC), osc_is_ready(OSC),				       \
	       list_empty_marker(&(OSC)->oo_hp_ready_item),		       \
	       list_empty_marker(&(OSC)->oo_ready_item),		       \
	       atomic_read(&(OSC)->oo_nr_writes),			       \
	       list_empty_marker(&(OSC)->oo_hp_exts),			       \
	       list_empty_marker(&(OSC)->oo_urgent_exts),		       \
	       atomic_read(&(OSC)->oo_nr_reads),			       \
	       list_empty_marker(&(OSC)->oo_reading_exts),		       \
	       ##args)
1199 | ||
1200 | static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, | |
1201 | int cmd) | |
1202 | { | |
1203 | struct osc_page *opg = oap2osc_page(oap); | |
1204 | struct cl_page *page = cl_page_top(oap2cl_page(oap)); | |
1205 | int result; | |
1206 | ||
1207 | LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ | |
1208 | ||
1209 | ENTRY; | |
1210 | result = cl_page_make_ready(env, page, CRT_WRITE); | |
1211 | if (result == 0) | |
1212 | opg->ops_submit_time = cfs_time_current(); | |
1213 | RETURN(result); | |
1214 | } | |
1215 | ||
1216 | static int osc_refresh_count(const struct lu_env *env, | |
1217 | struct osc_async_page *oap, int cmd) | |
1218 | { | |
1219 | struct osc_page *opg = oap2osc_page(oap); | |
1220 | struct cl_page *page = oap2cl_page(oap); | |
1221 | struct cl_object *obj; | |
1222 | struct cl_attr *attr = &osc_env_info(env)->oti_attr; | |
1223 | ||
1224 | int result; | |
1225 | loff_t kms; | |
1226 | ||
1227 | /* readpage queues with _COUNT_STABLE, shouldn't get here. */ | |
1228 | LASSERT(!(cmd & OBD_BRW_READ)); | |
1229 | LASSERT(opg != NULL); | |
1230 | obj = opg->ops_cl.cpl_obj; | |
1231 | ||
1232 | cl_object_attr_lock(obj); | |
1233 | result = cl_object_attr_get(env, obj, attr); | |
1234 | cl_object_attr_unlock(obj); | |
1235 | if (result < 0) | |
1236 | return result; | |
1237 | kms = attr->cat_kms; | |
1238 | if (cl_offset(obj, page->cp_index) >= kms) | |
1239 | /* catch race with truncate */ | |
1240 | return 0; | |
1241 | else if (cl_offset(obj, page->cp_index + 1) > kms) | |
1242 | /* catch sub-page write at end of file */ | |
1243 | return kms % PAGE_CACHE_SIZE; | |
1244 | else | |
1245 | return PAGE_CACHE_SIZE; | |
1246 | } | |
1247 | ||
/**
 * Finish the transfer of the page behind @oap.
 *
 * Detaches the page from its cl_req (if any), clears the transfer pin,
 * unlinks the page from the in-flight list under oo_seatbelt, updates the
 * lockless IO statistics and finally hands the page to cl_page_completion()
 * with result @rc.
 *
 * @cmd is OBD_BRW_READ or OBD_BRW_WRITE, possibly with OBD_BRW_NOQUOTA set,
 * which is ignored. The page state must match the direction (asserted).
 *
 * \retval 0 always
 */
static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
			  int cmd, int rc)
{
	struct osc_page *opg = oap2osc_page(oap);
	struct cl_page *page = cl_page_top(oap2cl_page(oap));
	struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
	enum cl_req_type crt;
	int srvlock;

	ENTRY;

	/* the quota flag plays no role in the checks below */
	cmd &= ~OBD_BRW_NOQUOTA;
	LASSERT(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ));
	LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
	LASSERT(opg->ops_transfer_pinned);

	/*
	 * page->cp_req can be NULL if io submission failed before
	 * cl_req was allocated.
	 */
	if (page->cp_req != NULL)
		cl_req_page_done(env, page);
	LASSERT(page->cp_req == NULL);

	crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
	/* Clear opg->ops_transfer_pinned before VM lock is released. */
	opg->ops_transfer_pinned = 0;

	/* the page is no longer in flight */
	spin_lock(&obj->oo_seatbelt);
	LASSERT(opg->ops_submitter != NULL);
	LASSERT(!list_empty(&opg->ops_inflight));
	list_del_init(&opg->ops_inflight);
	opg->ops_submitter = NULL;
	spin_unlock(&obj->oo_seatbelt);

	opg->ops_submit_time = 0;
	srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK;

	/* statistic */
	/* only successful transfers flagged OBD_BRW_SRVLOCK are counted */
	if (rc == 0 && srvlock) {
		struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev;
		struct osc_stats *stats = &lu2osc_dev(ld)->od_stats;
		int bytes = oap->oap_count;

		if (crt == CRT_READ)
			stats->os_lockless_reads += bytes;
		else
			stats->os_lockless_writes += bytes;
	}

	/*
	 * This has to be the last operation with the page, as locks are
	 * released in cl_page_completion() and nothing except for the
	 * reference counter protects page from concurrent reclaim.
	 */
	lu_ref_del(&page->cp_reference, "transfer", page);

	cl_page_completion(env, page, crt, rc);

	RETURN(0);
}
1309 | ||
/*
 * Emit a D_CACHE debug line with the grant and dirty-page accounting of
 * client 'cli', followed by the caller's message 'fmt' (a printf-style
 * format consuming 'args').
 *
 * Printed, in order: per-client dirty bytes and their limit, global dirty
 * pages and their limit, global unstable pages and the same global limit,
 * then the lost ("dropped"), available and reserved grant and the number of
 * writes in flight.
 *
 * 'cli' is evaluated exactly once. The fields are read without taking any
 * lock in this macro, so the values are only a snapshot.
 */
#define OSC_DUMP_GRANT(cli, fmt, args...) do {				      \
	struct client_obd *__tmp = (cli);				      \
	CDEBUG(D_CACHE, "%s: { dirty: %ld/%ld dirty_pages: %d/%d "	      \
	       "unstable_pages: %d/%d dropped: %ld avail: %ld, "	      \
	       "reserved: %ld, flight: %d } " fmt,			      \
	       __tmp->cl_import->imp_obd->obd_name,			      \
	       __tmp->cl_dirty, __tmp->cl_dirty_max,			      \
	       atomic_read(&obd_dirty_pages), obd_max_dirty_pages,	      \
	       atomic_read(&obd_unstable_pages), obd_max_dirty_pages,	      \
	       __tmp->cl_lost_grant, __tmp->cl_avail_grant,		      \
	       __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, ##args);     \
} while (0)
1322 | ||
1323 | /* caller must hold loi_list_lock */ | |
1324 | static void osc_consume_write_grant(struct client_obd *cli, | |
1325 | struct brw_page *pga) | |
1326 | { | |
1327 | LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock)); | |
1328 | LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); | |
1329 | atomic_inc(&obd_dirty_pages); | |
1330 | cli->cl_dirty += PAGE_CACHE_SIZE; | |
1331 | pga->flag |= OBD_BRW_FROM_GRANT; | |
1332 | CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n", | |
1333 | PAGE_CACHE_SIZE, pga, pga->pg); | |
1334 | osc_update_next_shrink(cli); | |
1335 | } | |
1336 | ||
1337 | /* the companion to osc_consume_write_grant, called when a brw has completed. | |
1338 | * must be called with the loi lock held. */ | |
1339 | static void osc_release_write_grant(struct client_obd *cli, | |
1340 | struct brw_page *pga) | |
1341 | { | |
1342 | ENTRY; | |
1343 | ||
1344 | LASSERT(spin_is_locked(&cli->cl_loi_list_lock.lock)); | |
1345 | if (!(pga->flag & OBD_BRW_FROM_GRANT)) { | |
1346 | EXIT; | |
1347 | return; | |
1348 | } | |
1349 | ||
1350 | pga->flag &= ~OBD_BRW_FROM_GRANT; | |
1351 | atomic_dec(&obd_dirty_pages); | |
1352 | cli->cl_dirty -= PAGE_CACHE_SIZE; | |
1353 | if (pga->flag & OBD_BRW_NOCACHE) { | |
1354 | pga->flag &= ~OBD_BRW_NOCACHE; | |
1355 | atomic_dec(&obd_dirty_transit_pages); | |
1356 | cli->cl_dirty_transit -= PAGE_CACHE_SIZE; | |
1357 | } | |
1358 | EXIT; | |
1359 | } | |
1360 | ||
1361 | /** | |
1362 | * To avoid sleeping with object lock held, it's good for us allocate enough | |
1363 | * grants before entering into critical section. | |
1364 | * | |
1365 | * client_obd_list_lock held by caller | |
1366 | */ | |
1367 | static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes) | |
1368 | { | |
1369 | int rc = -EDQUOT; | |
1370 | ||
1371 | if (cli->cl_avail_grant >= bytes) { | |
1372 | cli->cl_avail_grant -= bytes; | |
1373 | cli->cl_reserved_grant += bytes; | |
1374 | rc = 0; | |
1375 | } | |
1376 | return rc; | |
1377 | } | |
1378 | ||
1379 | static void __osc_unreserve_grant(struct client_obd *cli, | |
1380 | unsigned int reserved, unsigned int unused) | |
1381 | { | |
1382 | /* it's quite normal for us to get more grant than reserved. | |
1383 | * Thinking about a case that two extents merged by adding a new | |
1384 | * chunk, we can save one extent tax. If extent tax is greater than | |
1385 | * one chunk, we can save more grant by adding a new chunk */ | |
1386 | cli->cl_reserved_grant -= reserved; | |
1387 | if (unused > reserved) { | |
1388 | cli->cl_avail_grant += reserved; | |
1389 | cli->cl_lost_grant += unused - reserved; | |
1390 | } else { | |
1391 | cli->cl_avail_grant += unused; | |
1392 | } | |
1393 | } | |
1394 | ||
1395 | void osc_unreserve_grant(struct client_obd *cli, | |
1396 | unsigned int reserved, unsigned int unused) | |
1397 | { | |
1398 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1399 | __osc_unreserve_grant(cli, reserved, unused); | |
1400 | if (unused > 0) | |
1401 | osc_wake_cache_waiters(cli); | |
1402 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1403 | } | |
1404 | ||
1405 | /** | |
1406 | * Free grant after IO is finished or canceled. | |
1407 | * | |
1408 | * @lost_grant is used to remember how many grants we have allocated but not | |
1409 | * used, we should return these grants to OST. There're two cases where grants | |
1410 | * can be lost: | |
1411 | * 1. truncate; | |
1412 | * 2. blocksize at OST is less than PAGE_CACHE_SIZE and a partial page was | |
1413 | * written. In this case OST may use less chunks to serve this partial | |
1414 | * write. OSTs don't actually know the page size on the client side. so | |
1415 | * clients have to calculate lost grant by the blocksize on the OST. | |
1416 | * See filter_grant_check() for details. | |
1417 | */ | |
1418 | static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, | |
1419 | unsigned int lost_grant) | |
1420 | { | |
1421 | int grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; | |
1422 | ||
1423 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1424 | atomic_sub(nr_pages, &obd_dirty_pages); | |
1425 | cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT; | |
1426 | cli->cl_lost_grant += lost_grant; | |
1427 | if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { | |
1428 | /* borrow some grant from truncate to avoid the case that | |
1429 | * truncate uses up all avail grant */ | |
1430 | cli->cl_lost_grant -= grant; | |
1431 | cli->cl_avail_grant += grant; | |
1432 | } | |
1433 | osc_wake_cache_waiters(cli); | |
1434 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1435 | CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n", | |
1436 | lost_grant, cli->cl_lost_grant, | |
1437 | cli->cl_avail_grant, cli->cl_dirty); | |
1438 | } | |
1439 | ||
1440 | /** | |
1441 | * The companion to osc_enter_cache(), called when @oap is no longer part of | |
1442 | * the dirty accounting due to error. | |
1443 | */ | |
1444 | static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap) | |
1445 | { | |
1446 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1447 | osc_release_write_grant(cli, &oap->oap_brw_page); | |
1448 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1449 | } | |
1450 | ||
1451 | /** | |
1452 | * Non-blocking version of osc_enter_cache() that consumes grant only when it | |
1453 | * is available. | |
1454 | */ | |
1455 | static int osc_enter_cache_try(struct client_obd *cli, | |
1456 | struct osc_async_page *oap, | |
1457 | int bytes, int transient) | |
1458 | { | |
1459 | int rc; | |
1460 | ||
1461 | OSC_DUMP_GRANT(cli, "need:%d.\n", bytes); | |
1462 | ||
1463 | rc = osc_reserve_grant(cli, bytes); | |
1464 | if (rc < 0) | |
1465 | return 0; | |
1466 | ||
1467 | if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max && | |
1468 | atomic_read(&obd_unstable_pages) + 1 + | |
1469 | atomic_read(&obd_dirty_pages) <= obd_max_dirty_pages) { | |
1470 | osc_consume_write_grant(cli, &oap->oap_brw_page); | |
1471 | if (transient) { | |
1472 | cli->cl_dirty_transit += PAGE_CACHE_SIZE; | |
1473 | atomic_inc(&obd_dirty_transit_pages); | |
1474 | oap->oap_brw_flags |= OBD_BRW_NOCACHE; | |
1475 | } | |
1476 | rc = 1; | |
1477 | } else { | |
1478 | __osc_unreserve_grant(cli, bytes, bytes); | |
1479 | rc = 0; | |
1480 | } | |
1481 | return rc; | |
1482 | } | |
1483 | ||
1484 | static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) | |
1485 | { | |
1486 | int rc; | |
1487 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1488 | rc = list_empty(&ocw->ocw_entry); | |
1489 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1490 | return rc; | |
1491 | } | |
1492 | ||
/**
 * The main entry to reserve dirty page accounting. Usually the grant reserved
 * in this function will be freed in bulk in osc_free_grant() unless it fails
 * to add osc cache, in that case, it will be freed in osc_exit_cache().
 *
 * The process will be put into sleep if it's already run out of grant.
 *
 * \param env    execution environment, passed on to osc_io_unplug_async()
 * \param cli    client whose grant and dirty counters are used
 * \param oap    page that is about to enter the cache
 * \param bytes  amount of grant required for the page
 *
 * \retval 0         grant and dirty accounting were taken for @oap
 * \retval -EDQUOT   the caller has to fall back to sync IO: caching is
 *                   disabled or forced sync, or nothing is dirty or in
 *                   flight any more while grant is still missing
 * \retval negative  other error: the wait was interrupted, or the code the
 *                   waker stored in the waiter
 */
static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
			   struct osc_async_page *oap, int bytes)
{
	struct osc_object *osc = oap->oap_obj;
	struct lov_oinfo *loi = osc->oo_oinfo;
	struct osc_cache_waiter ocw;
	struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
	int rc = -EDQUOT;
	ENTRY;

	OSC_DUMP_GRANT(cli, "need:%d.\n", bytes);

	client_obd_list_lock(&cli->cl_loi_list_lock);

	/* force the caller to try sync io.  this can jump the list
	 * of queued writes and create a discontiguous rpc stream */
	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) ||
	    cli->cl_dirty_max < PAGE_CACHE_SIZE ||
	    cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync)
		GOTO(out, rc = -EDQUOT);

	/* Hopefully normal case - cache space and write credits available */
	if (osc_enter_cache_try(cli, oap, bytes, 0))
		GOTO(out, rc = 0);

	/* We can get here for two reasons: too many dirty pages in cache, or
	 * run out of grants. In both cases we should write dirty pages out.
	 * Adding a cache waiter will trigger urgent write-out no matter what
	 * RPC size will be.
	 * The exiting condition is no avail grants and no dirty pages caching,
	 * that really means there is no space on the OST. */
	init_waitqueue_head(&ocw.ocw_waitq);
	ocw.ocw_oap   = oap;
	ocw.ocw_grant = bytes;
	while (cli->cl_dirty > 0 || cli->cl_w_in_flight > 0) {
		/* queue ourselves; the waker unlinks the entry and stores
		 * the outcome in ocw_rc */
		list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
		ocw.ocw_rc = 0;
		client_obd_list_unlock(&cli->cl_loi_list_lock);

		/* kick write-out so that grant can come back */
		osc_io_unplug_async(env, cli, NULL);

		CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
		       cli->cl_import->imp_obd->obd_name, &ocw, oap);

		rc = l_wait_event(ocw.ocw_waitq, ocw_granted(cli, &ocw), &lwi);

		client_obd_list_lock(&cli->cl_loi_list_lock);

		/* l_wait_event is interrupted by signal */
		if (rc < 0) {
			/* NOTE(review): if a waker unlinked the entry and
			 * took grant for @oap between the interrupted wait
			 * and the lock above, that grant is not given back
			 * here - confirm callers handle this. */
			list_del_init(&ocw.ocw_entry);
			GOTO(out, rc);
		}

		LASSERT(list_empty(&ocw.ocw_entry));
		rc = ocw.ocw_rc;

		/* anything but -EDQUOT is final: 0 means the waker already
		 * took the grant on our behalf */
		if (rc != -EDQUOT)
			GOTO(out, rc);
		if (osc_enter_cache_try(cli, oap, bytes, 0))
			GOTO(out, rc = 0);
	}
	EXIT;
out:
	client_obd_list_unlock(&cli->cl_loi_list_lock);
	OSC_DUMP_GRANT(cli, "returned %d.\n", rc);
	RETURN(rc);
}
1568 | ||
1569 | /* caller must hold loi_list_lock */ | |
1570 | void osc_wake_cache_waiters(struct client_obd *cli) | |
1571 | { | |
1572 | struct list_head *l, *tmp; | |
1573 | struct osc_cache_waiter *ocw; | |
1574 | ||
1575 | ENTRY; | |
1576 | list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { | |
1577 | ocw = list_entry(l, struct osc_cache_waiter, ocw_entry); | |
1578 | list_del_init(&ocw->ocw_entry); | |
1579 | ||
1580 | ocw->ocw_rc = -EDQUOT; | |
1581 | /* we can't dirty more */ | |
1582 | if (cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max || | |
1583 | atomic_read(&obd_unstable_pages) + 1 + | |
1584 | atomic_read(&obd_dirty_pages) > obd_max_dirty_pages) { | |
1585 | CDEBUG(D_CACHE, "no dirty room: dirty: %ld " | |
1586 | "osc max %ld, sys max %d\n", cli->cl_dirty, | |
1587 | cli->cl_dirty_max, obd_max_dirty_pages); | |
1588 | goto wakeup; | |
1589 | } | |
1590 | ||
1591 | ocw->ocw_rc = 0; | |
1592 | if (!osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0)) | |
1593 | ocw->ocw_rc = -EDQUOT; | |
1594 | ||
1595 | wakeup: | |
1596 | CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n", | |
1597 | ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc); | |
1598 | ||
1599 | wake_up(&ocw->ocw_waitq); | |
1600 | } | |
1601 | ||
1602 | EXIT; | |
1603 | } | |
1604 | ||
1605 | static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc) | |
1606 | { | |
1607 | int hprpc = !!list_empty(&osc->oo_hp_exts); | |
1608 | return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc; | |
1609 | } | |
1610 | ||
1611 | /* This maintains the lists of pending pages to read/write for a given object | |
1612 | * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint() | |
1613 | * to quickly find objects that are ready to send an RPC. */ | |
1614 | static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc, | |
1615 | int cmd) | |
1616 | { | |
1617 | int invalid_import = 0; | |
1618 | ENTRY; | |
1619 | ||
1620 | /* if we have an invalid import we want to drain the queued pages | |
1621 | * by forcing them through rpcs that immediately fail and complete | |
1622 | * the pages. recovery relies on this to empty the queued pages | |
1623 | * before canceling the locks and evicting down the llite pages */ | |
1624 | if ((cli->cl_import == NULL || cli->cl_import->imp_invalid)) | |
1625 | invalid_import = 1; | |
1626 | ||
1627 | if (cmd & OBD_BRW_WRITE) { | |
1628 | if (atomic_read(&osc->oo_nr_writes) == 0) | |
1629 | RETURN(0); | |
1630 | if (invalid_import) { | |
1631 | CDEBUG(D_CACHE, "invalid import forcing RPC\n"); | |
1632 | RETURN(1); | |
1633 | } | |
1634 | if (!list_empty(&osc->oo_hp_exts)) { | |
1635 | CDEBUG(D_CACHE, "high prio request forcing RPC\n"); | |
1636 | RETURN(1); | |
1637 | } | |
1638 | if (!list_empty(&osc->oo_urgent_exts)) { | |
1639 | CDEBUG(D_CACHE, "urgent request forcing RPC\n"); | |
1640 | RETURN(1); | |
1641 | } | |
1642 | /* trigger a write rpc stream as long as there are dirtiers | |
1643 | * waiting for space. as they're waiting, they're not going to | |
1644 | * create more pages to coalesce with what's waiting.. */ | |
1645 | if (!list_empty(&cli->cl_cache_waiters)) { | |
1646 | CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); | |
1647 | RETURN(1); | |
1648 | } | |
1649 | if (atomic_read(&osc->oo_nr_writes) >= | |
1650 | cli->cl_max_pages_per_rpc) | |
1651 | RETURN(1); | |
1652 | } else { | |
1653 | if (atomic_read(&osc->oo_nr_reads) == 0) | |
1654 | RETURN(0); | |
1655 | if (invalid_import) { | |
1656 | CDEBUG(D_CACHE, "invalid import forcing RPC\n"); | |
1657 | RETURN(1); | |
1658 | } | |
1659 | /* all read are urgent. */ | |
1660 | if (!list_empty(&osc->oo_reading_exts)) | |
1661 | RETURN(1); | |
1662 | } | |
1663 | ||
1664 | RETURN(0); | |
1665 | } | |
1666 | ||
1667 | static void osc_update_pending(struct osc_object *obj, int cmd, int delta) | |
1668 | { | |
1669 | struct client_obd *cli = osc_cli(obj); | |
1670 | if (cmd & OBD_BRW_WRITE) { | |
1671 | atomic_add(delta, &obj->oo_nr_writes); | |
1672 | atomic_add(delta, &cli->cl_pending_w_pages); | |
1673 | LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); | |
1674 | } else { | |
1675 | atomic_add(delta, &obj->oo_nr_reads); | |
1676 | atomic_add(delta, &cli->cl_pending_r_pages); | |
1677 | LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); | |
1678 | } | |
1679 | OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); | |
1680 | } | |
1681 | ||
1682 | static int osc_makes_hprpc(struct osc_object *obj) | |
1683 | { | |
1684 | return !list_empty(&obj->oo_hp_exts); | |
1685 | } | |
1686 | ||
/**
 * Make the list membership of @item match @should_be_on: link it at the
 * tail of @list when it has to be on the list but is not linked, unlink it
 * when it is linked but must not be. Nothing is done when the membership
 * is already right.
 */
static void on_list(struct list_head *item, struct list_head *list, int should_be_on)
{
	int linked = !list_empty(item);

	if (should_be_on) {
		if (!linked)
			list_add_tail(item, list);
	} else if (linked) {
		list_del_init(item);
	}
}
1694 | ||
1695 | /* maintain the osc's cli list membership invariants so that osc_send_oap_rpc | |
1696 | * can find pages to build into rpcs quickly */ | |
1697 | static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) | |
1698 | { | |
1699 | if (osc_makes_hprpc(osc)) { | |
1700 | /* HP rpc */ | |
1701 | on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); | |
1702 | on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); | |
1703 | } else { | |
1704 | on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); | |
1705 | on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, | |
1706 | osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || | |
1707 | osc_makes_rpc(cli, osc, OBD_BRW_READ)); | |
1708 | } | |
1709 | ||
1710 | on_list(&osc->oo_write_item, &cli->cl_loi_write_list, | |
1711 | atomic_read(&osc->oo_nr_writes) > 0); | |
1712 | ||
1713 | on_list(&osc->oo_read_item, &cli->cl_loi_read_list, | |
1714 | atomic_read(&osc->oo_nr_reads) > 0); | |
1715 | ||
1716 | return osc_is_ready(osc); | |
1717 | } | |
1718 | ||
1719 | static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) | |
1720 | { | |
1721 | int is_ready; | |
1722 | ||
1723 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1724 | is_ready = __osc_list_maint(cli, osc); | |
1725 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1726 | ||
1727 | return is_ready; | |
1728 | } | |
1729 | ||
1730 | /* this is trying to propogate async writeback errors back up to the | |
1731 | * application. As an async write fails we record the error code for later if | |
1732 | * the app does an fsync. As long as errors persist we force future rpcs to be | |
1733 | * sync so that the app can get a sync error and break the cycle of queueing | |
1734 | * pages for which writeback will fail. */ | |
1735 | static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, | |
1736 | int rc) | |
1737 | { | |
1738 | if (rc) { | |
1739 | if (!ar->ar_rc) | |
1740 | ar->ar_rc = rc; | |
1741 | ||
1742 | ar->ar_force_sync = 1; | |
1743 | ar->ar_min_xid = ptlrpc_sample_next_xid(); | |
1744 | return; | |
1745 | ||
1746 | } | |
1747 | ||
1748 | if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) | |
1749 | ar->ar_force_sync = 0; | |
1750 | } | |
1751 | ||
1752 | /* Performs "unstable" page accounting. This function balances the | |
1753 | * increment operations performed in osc_inc_unstable_pages. It is | |
1754 | * registered as the RPC request callback, and is executed when the | |
1755 | * bulk RPC is committed on the server. Thus at this point, the pages | |
1756 | * involved in the bulk transfer are no longer considered unstable. */ | |
1757 | void osc_dec_unstable_pages(struct ptlrpc_request *req) | |
1758 | { | |
1759 | struct ptlrpc_bulk_desc *desc = req->rq_bulk; | |
1760 | struct client_obd *cli = &req->rq_import->imp_obd->u.cli; | |
1761 | obd_count page_count = desc->bd_iov_count; | |
1762 | int i; | |
1763 | ||
1764 | /* No unstable page tracking */ | |
1765 | if (cli->cl_cache == NULL) | |
1766 | return; | |
1767 | ||
1768 | LASSERT(page_count >= 0); | |
1769 | ||
1770 | for (i = 0; i < page_count; i++) | |
1771 | dec_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); | |
1772 | ||
1773 | atomic_sub(page_count, &cli->cl_cache->ccc_unstable_nr); | |
1774 | LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); | |
1775 | ||
1776 | atomic_sub(page_count, &obd_unstable_pages); | |
1777 | LASSERT(atomic_read(&obd_unstable_pages) >= 0); | |
1778 | ||
1779 | spin_lock(&req->rq_lock); | |
1780 | req->rq_committed = 1; | |
1781 | req->rq_unstable = 0; | |
1782 | spin_unlock(&req->rq_lock); | |
1783 | ||
1784 | wake_up_all(&cli->cl_cache->ccc_unstable_waitq); | |
1785 | } | |
1786 | ||
1787 | /* "unstable" page accounting. See: osc_dec_unstable_pages. */ | |
1788 | void osc_inc_unstable_pages(struct ptlrpc_request *req) | |
1789 | { | |
1790 | struct ptlrpc_bulk_desc *desc = req->rq_bulk; | |
1791 | struct client_obd *cli = &req->rq_import->imp_obd->u.cli; | |
1792 | obd_count page_count = desc->bd_iov_count; | |
1793 | int i; | |
1794 | ||
1795 | /* No unstable page tracking */ | |
1796 | if (cli->cl_cache == NULL) | |
1797 | return; | |
1798 | ||
1799 | LASSERT(page_count >= 0); | |
1800 | ||
1801 | for (i = 0; i < page_count; i++) | |
1802 | inc_zone_page_state(desc->bd_iov[i].kiov_page, NR_UNSTABLE_NFS); | |
1803 | ||
1804 | LASSERT(atomic_read(&cli->cl_cache->ccc_unstable_nr) >= 0); | |
1805 | atomic_add(page_count, &cli->cl_cache->ccc_unstable_nr); | |
1806 | ||
1807 | LASSERT(atomic_read(&obd_unstable_pages) >= 0); | |
1808 | atomic_add(page_count, &obd_unstable_pages); | |
1809 | ||
1810 | spin_lock(&req->rq_lock); | |
1811 | ||
1812 | /* If the request has already been committed (i.e. brw_commit | |
1813 | * called via rq_commit_cb), we need to undo the unstable page | |
1814 | * increments we just performed because rq_commit_cb wont be | |
1815 | * called again. Otherwise, just set the commit callback so the | |
1816 | * unstable page accounting is properly updated when the request | |
1817 | * is committed */ | |
1818 | if (req->rq_committed) { | |
1819 | /* Drop lock before calling osc_dec_unstable_pages */ | |
1820 | spin_unlock(&req->rq_lock); | |
1821 | osc_dec_unstable_pages(req); | |
1822 | spin_lock(&req->rq_lock); | |
1823 | } else { | |
1824 | req->rq_unstable = 1; | |
1825 | req->rq_commit_cb = osc_dec_unstable_pages; | |
1826 | } | |
1827 | ||
1828 | spin_unlock(&req->rq_lock); | |
1829 | } | |
1830 | ||
1831 | /* this must be called holding the loi list lock to give coverage to exit_cache, | |
1832 | * async_flag maintenance, and oap_request */ | |
1833 | static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, | |
1834 | struct osc_async_page *oap, int sent, int rc) | |
1835 | { | |
1836 | struct osc_object *osc = oap->oap_obj; | |
1837 | struct lov_oinfo *loi = osc->oo_oinfo; | |
1838 | __u64 xid = 0; | |
1839 | ||
1840 | ENTRY; | |
1841 | if (oap->oap_request != NULL) { | |
1842 | if (rc == 0) | |
1843 | osc_inc_unstable_pages(oap->oap_request); | |
1844 | ||
1845 | xid = ptlrpc_req_xid(oap->oap_request); | |
1846 | ptlrpc_req_finished(oap->oap_request); | |
1847 | oap->oap_request = NULL; | |
1848 | } | |
1849 | ||
1850 | /* As the transfer for this page is being done, clear the flags */ | |
1851 | spin_lock(&oap->oap_lock); | |
1852 | oap->oap_async_flags = 0; | |
1853 | spin_unlock(&oap->oap_lock); | |
1854 | oap->oap_interrupted = 0; | |
1855 | ||
1856 | if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { | |
1857 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
1858 | osc_process_ar(&cli->cl_ar, xid, rc); | |
1859 | osc_process_ar(&loi->loi_ar, xid, rc); | |
1860 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
1861 | } | |
1862 | ||
1863 | rc = osc_completion(env, oap, oap->oap_cmd, rc); | |
1864 | if (rc) | |
1865 | CERROR("completion on oap %p obj %p returns %d.\n", | |
1866 | oap, osc, rc); | |
1867 | ||
1868 | EXIT; | |
1869 | } | |
1870 | ||
1871 | /** | |
1872 | * Try to add extent to one RPC. We need to think about the following things: | |
1873 | * - # of pages must not be over max_pages_per_rpc | |
1874 | * - extent must be compatible with previous ones | |
1875 | */ | |
1876 | static int try_to_add_extent_for_io(struct client_obd *cli, | |
1877 | struct osc_extent *ext, struct list_head *rpclist, | |
1878 | int *pc, unsigned int *max_pages) | |
1879 | { | |
1880 | struct osc_extent *tmp; | |
1881 | ENTRY; | |
1882 | ||
1883 | EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE), | |
1884 | ext); | |
1885 | ||
1886 | *max_pages = max(ext->oe_mppr, *max_pages); | |
1887 | if (*pc + ext->oe_nr_pages > *max_pages) | |
1888 | RETURN(0); | |
1889 | ||
1890 | list_for_each_entry(tmp, rpclist, oe_link) { | |
1891 | EASSERT(tmp->oe_owner == current, tmp); | |
1892 | #if 0 | |
1893 | if (overlapped(tmp, ext)) { | |
1894 | OSC_EXTENT_DUMP(D_ERROR, tmp, "overlapped %p.\n", ext); | |
1895 | EASSERT(0, ext); | |
1896 | } | |
1897 | #endif | |
1898 | ||
1899 | if (tmp->oe_srvlock != ext->oe_srvlock || | |
1900 | !tmp->oe_grants != !ext->oe_grants) | |
1901 | RETURN(0); | |
1902 | ||
1903 | /* remove break for strict check */ | |
1904 | break; | |
1905 | } | |
1906 | ||
1907 | *pc += ext->oe_nr_pages; | |
1908 | list_move_tail(&ext->oe_link, rpclist); | |
1909 | ext->oe_owner = current; | |
1910 | RETURN(1); | |
1911 | } | |
1912 | ||
1913 | /** | |
1914 | * In order to prevent multiple ptlrpcd from breaking contiguous extents, | |
1915 | * get_write_extent() takes all appropriate extents in atomic. | |
1916 | * | |
1917 | * The following policy is used to collect extents for IO: | |
1918 | * 1. Add as many HP extents as possible; | |
1919 | * 2. Add the first urgent extent in urgent extent list and take it out of | |
1920 | * urgent list; | |
1921 | * 3. Add subsequent extents of this urgent extent; | |
1922 | * 4. If urgent list is not empty, goto 2; | |
1923 | * 5. Traverse the extent tree from the 1st extent; | |
1924 | * 6. Above steps exit if there is no space in this RPC. | |
1925 | */ | |
1926 | static int get_write_extents(struct osc_object *obj, struct list_head *rpclist) | |
1927 | { | |
1928 | struct client_obd *cli = osc_cli(obj); | |
1929 | struct osc_extent *ext; | |
1930 | int page_count = 0; | |
1931 | unsigned int max_pages = cli->cl_max_pages_per_rpc; | |
1932 | ||
1933 | LASSERT(osc_object_is_locked(obj)); | |
1934 | while (!list_empty(&obj->oo_hp_exts)) { | |
1935 | ext = list_entry(obj->oo_hp_exts.next, struct osc_extent, | |
1936 | oe_link); | |
1937 | LASSERT(ext->oe_state == OES_CACHE); | |
1938 | if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, | |
1939 | &max_pages)) | |
1940 | return page_count; | |
1941 | EASSERT(ext->oe_nr_pages <= max_pages, ext); | |
1942 | } | |
1943 | if (page_count == max_pages) | |
1944 | return page_count; | |
1945 | ||
1946 | while (!list_empty(&obj->oo_urgent_exts)) { | |
1947 | ext = list_entry(obj->oo_urgent_exts.next, | |
1948 | struct osc_extent, oe_link); | |
1949 | if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, | |
1950 | &max_pages)) | |
1951 | return page_count; | |
1952 | ||
1953 | if (!ext->oe_intree) | |
1954 | continue; | |
1955 | ||
1956 | while ((ext = next_extent(ext)) != NULL) { | |
1957 | if ((ext->oe_state != OES_CACHE) || | |
1958 | (!list_empty(&ext->oe_link) && | |
1959 | ext->oe_owner != NULL)) | |
1960 | continue; | |
1961 | ||
1962 | if (!try_to_add_extent_for_io(cli, ext, rpclist, | |
1963 | &page_count, &max_pages)) | |
1964 | return page_count; | |
1965 | } | |
1966 | } | |
1967 | if (page_count == max_pages) | |
1968 | return page_count; | |
1969 | ||
1970 | ext = first_extent(obj); | |
1971 | while (ext != NULL) { | |
1972 | if ((ext->oe_state != OES_CACHE) || | |
1973 | /* this extent may be already in current rpclist */ | |
1974 | (!list_empty(&ext->oe_link) && ext->oe_owner != NULL)) { | |
1975 | ext = next_extent(ext); | |
1976 | continue; | |
1977 | } | |
1978 | ||
1979 | if (!try_to_add_extent_for_io(cli, ext, rpclist, &page_count, | |
1980 | &max_pages)) | |
1981 | return page_count; | |
1982 | ||
1983 | ext = next_extent(ext); | |
1984 | } | |
1985 | return page_count; | |
1986 | } | |
1987 | ||
1988 | static int | |
1989 | osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, | |
1990 | struct osc_object *osc, pdl_policy_t pol) | |
1991 | { | |
1992 | LIST_HEAD(rpclist); | |
1993 | struct osc_extent *ext; | |
1994 | struct osc_extent *tmp; | |
1995 | struct osc_extent *first = NULL; | |
1996 | obd_count page_count = 0; | |
1997 | int srvlock = 0; | |
1998 | int rc = 0; | |
1999 | ENTRY; | |
2000 | ||
2001 | LASSERT(osc_object_is_locked(osc)); | |
2002 | ||
2003 | page_count = get_write_extents(osc, &rpclist); | |
2004 | LASSERT(equi(page_count == 0, list_empty(&rpclist))); | |
2005 | ||
2006 | if (list_empty(&rpclist)) | |
2007 | RETURN(0); | |
2008 | ||
2009 | osc_update_pending(osc, OBD_BRW_WRITE, -page_count); | |
2010 | ||
2011 | list_for_each_entry(ext, &rpclist, oe_link) { | |
2012 | LASSERT(ext->oe_state == OES_CACHE || | |
2013 | ext->oe_state == OES_LOCK_DONE); | |
2014 | if (ext->oe_state == OES_CACHE) | |
2015 | osc_extent_state_set(ext, OES_LOCKING); | |
2016 | else | |
2017 | osc_extent_state_set(ext, OES_RPC); | |
2018 | } | |
2019 | ||
2020 | /* we're going to grab page lock, so release object lock because | |
2021 | * lock order is page lock -> object lock. */ | |
2022 | osc_object_unlock(osc); | |
2023 | ||
2024 | list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) { | |
2025 | if (ext->oe_state == OES_LOCKING) { | |
2026 | rc = osc_extent_make_ready(env, ext); | |
2027 | if (unlikely(rc < 0)) { | |
2028 | list_del_init(&ext->oe_link); | |
2029 | osc_extent_finish(env, ext, 0, rc); | |
2030 | continue; | |
2031 | } | |
2032 | } | |
2033 | if (first == NULL) { | |
2034 | first = ext; | |
2035 | srvlock = ext->oe_srvlock; | |
2036 | } else { | |
2037 | LASSERT(srvlock == ext->oe_srvlock); | |
2038 | } | |
2039 | } | |
2040 | ||
2041 | if (!list_empty(&rpclist)) { | |
2042 | LASSERT(page_count > 0); | |
2043 | rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE, pol); | |
2044 | LASSERT(list_empty(&rpclist)); | |
2045 | } | |
2046 | ||
2047 | osc_object_lock(osc); | |
2048 | RETURN(rc); | |
2049 | } | |
2050 | ||
2051 | /** | |
2052 | * prepare pages for ASYNC io and put pages in send queue. | |
2053 | * | |
2054 | * \param cmd OBD_BRW_* macroses | |
2055 | * \param lop pending pages | |
2056 | * | |
2057 | * \return zero if no page added to send queue. | |
2058 | * \return 1 if pages successfully added to send queue. | |
2059 | * \return negative on errors. | |
2060 | */ | |
2061 | static int | |
2062 | osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli, | |
2063 | struct osc_object *osc, pdl_policy_t pol) | |
2064 | { | |
2065 | struct osc_extent *ext; | |
2066 | struct osc_extent *next; | |
2067 | LIST_HEAD(rpclist); | |
2068 | int page_count = 0; | |
2069 | unsigned int max_pages = cli->cl_max_pages_per_rpc; | |
2070 | int rc = 0; | |
2071 | ENTRY; | |
2072 | ||
2073 | LASSERT(osc_object_is_locked(osc)); | |
2074 | list_for_each_entry_safe(ext, next, | |
2075 | &osc->oo_reading_exts, oe_link) { | |
2076 | EASSERT(ext->oe_state == OES_LOCK_DONE, ext); | |
2077 | if (!try_to_add_extent_for_io(cli, ext, &rpclist, &page_count, | |
2078 | &max_pages)) | |
2079 | break; | |
2080 | osc_extent_state_set(ext, OES_RPC); | |
2081 | EASSERT(ext->oe_nr_pages <= max_pages, ext); | |
2082 | } | |
2083 | LASSERT(page_count <= max_pages); | |
2084 | ||
2085 | osc_update_pending(osc, OBD_BRW_READ, -page_count); | |
2086 | ||
2087 | if (!list_empty(&rpclist)) { | |
2088 | osc_object_unlock(osc); | |
2089 | ||
2090 | LASSERT(page_count > 0); | |
2091 | rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ, pol); | |
2092 | LASSERT(list_empty(&rpclist)); | |
2093 | ||
2094 | osc_object_lock(osc); | |
2095 | } | |
2096 | RETURN(rc); | |
2097 | } | |
2098 | ||
/* Unlink the first entry of @list and evaluate to the osc_object that
 * embeds it through its oo_<item> member.  @list must not be empty. */
#define list_to_obj(list, item) ({					      \
	struct list_head *__first = (list)->next;			      \
									      \
	list_del_init(__first);						      \
	list_entry(__first, struct osc_object, oo_##item);		      \
})
2104 | ||
2105 | /* This is called by osc_check_rpcs() to find which objects have pages that | |
2106 | * we could be sending. These lists are maintained by osc_makes_rpc(). */ | |
2107 | static struct osc_object *osc_next_obj(struct client_obd *cli) | |
2108 | { | |
2109 | ENTRY; | |
2110 | ||
2111 | /* First return objects that have blocked locks so that they | |
2112 | * will be flushed quickly and other clients can get the lock, | |
2113 | * then objects which have pages ready to be stuffed into RPCs */ | |
2114 | if (!list_empty(&cli->cl_loi_hp_ready_list)) | |
2115 | RETURN(list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item)); | |
2116 | if (!list_empty(&cli->cl_loi_ready_list)) | |
2117 | RETURN(list_to_obj(&cli->cl_loi_ready_list, ready_item)); | |
2118 | ||
2119 | /* then if we have cache waiters, return all objects with queued | |
2120 | * writes. This is especially important when many small files | |
2121 | * have filled up the cache and not been fired into rpcs because | |
2122 | * they don't pass the nr_pending/object threshhold */ | |
2123 | if (!list_empty(&cli->cl_cache_waiters) && | |
2124 | !list_empty(&cli->cl_loi_write_list)) | |
2125 | RETURN(list_to_obj(&cli->cl_loi_write_list, write_item)); | |
2126 | ||
2127 | /* then return all queued objects when we have an invalid import | |
2128 | * so that they get flushed */ | |
2129 | if (cli->cl_import == NULL || cli->cl_import->imp_invalid) { | |
2130 | if (!list_empty(&cli->cl_loi_write_list)) | |
2131 | RETURN(list_to_obj(&cli->cl_loi_write_list, | |
2132 | write_item)); | |
2133 | if (!list_empty(&cli->cl_loi_read_list)) | |
2134 | RETURN(list_to_obj(&cli->cl_loi_read_list, | |
2135 | read_item)); | |
2136 | } | |
2137 | RETURN(NULL); | |
2138 | } | |
2139 | ||
2140 | /* called with the loi list lock held */ | |
2141 | static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli, | |
2142 | pdl_policy_t pol) | |
2143 | { | |
2144 | struct osc_object *osc; | |
2145 | int rc = 0; | |
2146 | ENTRY; | |
2147 | ||
2148 | while ((osc = osc_next_obj(cli)) != NULL) { | |
2149 | struct cl_object *obj = osc2cl(osc); | |
2150 | struct lu_ref_link *link; | |
2151 | ||
2152 | OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); | |
2153 | ||
2154 | if (osc_max_rpc_in_flight(cli, osc)) { | |
2155 | __osc_list_maint(cli, osc); | |
2156 | break; | |
2157 | } | |
2158 | ||
2159 | cl_object_get(obj); | |
2160 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
2161 | link = lu_object_ref_add(&obj->co_lu, "check", current); | |
2162 | ||
2163 | /* attempt some read/write balancing by alternating between | |
2164 | * reads and writes in an object. The makes_rpc checks here | |
2165 | * would be redundant if we were getting read/write work items | |
2166 | * instead of objects. we don't want send_oap_rpc to drain a | |
2167 | * partial read pending queue when we're given this object to | |
2168 | * do io on writes while there are cache waiters */ | |
2169 | osc_object_lock(osc); | |
2170 | if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { | |
2171 | rc = osc_send_write_rpc(env, cli, osc, pol); | |
2172 | if (rc < 0) { | |
2173 | CERROR("Write request failed with %d\n", rc); | |
2174 | ||
2175 | /* osc_send_write_rpc failed, mostly because of | |
2176 | * memory pressure. | |
2177 | * | |
2178 | * It can't break here, because if: | |
2179 | * - a page was submitted by osc_io_submit, so | |
2180 | * page locked; | |
2181 | * - no request in flight | |
2182 | * - no subsequent request | |
2183 | * The system will be in live-lock state, | |
2184 | * because there is no chance to call | |
2185 | * osc_io_unplug() and osc_check_rpcs() any | |
2186 | * more. pdflush can't help in this case, | |
2187 | * because it might be blocked at grabbing | |
2188 | * the page lock as we mentioned. | |
2189 | * | |
2190 | * Anyway, continue to drain pages. */ | |
2191 | /* break; */ | |
2192 | } | |
2193 | } | |
2194 | if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { | |
2195 | rc = osc_send_read_rpc(env, cli, osc, pol); | |
2196 | if (rc < 0) | |
2197 | CERROR("Read request failed with %d\n", rc); | |
2198 | } | |
2199 | osc_object_unlock(osc); | |
2200 | ||
2201 | osc_list_maint(cli, osc); | |
2202 | lu_object_ref_del_at(&obj->co_lu, link, "check", current); | |
2203 | cl_object_put(env, obj); | |
2204 | ||
2205 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
2206 | } | |
2207 | } | |
2208 | ||
2209 | static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, | |
2210 | struct osc_object *osc, pdl_policy_t pol, int async) | |
2211 | { | |
2212 | int has_rpcs = 1; | |
2213 | int rc = 0; | |
2214 | ||
2215 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
2216 | if (osc != NULL) | |
2217 | has_rpcs = __osc_list_maint(cli, osc); | |
2218 | if (has_rpcs) { | |
2219 | if (!async) { | |
2220 | /* disable osc_lru_shrink() temporarily to avoid | |
2221 | * potential stack overrun problem. LU-2859 */ | |
2222 | atomic_inc(&cli->cl_lru_shrinkers); | |
2223 | osc_check_rpcs(env, cli, pol); | |
2224 | atomic_dec(&cli->cl_lru_shrinkers); | |
2225 | } else { | |
2226 | CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", | |
2227 | cli); | |
2228 | LASSERT(cli->cl_writeback_work != NULL); | |
2229 | rc = ptlrpcd_queue_work(cli->cl_writeback_work); | |
2230 | } | |
2231 | } | |
2232 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
2233 | return rc; | |
2234 | } | |
2235 | ||
2236 | static int osc_io_unplug_async(const struct lu_env *env, | |
2237 | struct client_obd *cli, struct osc_object *osc) | |
2238 | { | |
2239 | /* XXX: policy is no use actually. */ | |
2240 | return osc_io_unplug0(env, cli, osc, PDL_POLICY_ROUND, 1); | |
2241 | } | |
2242 | ||
2243 | void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, | |
2244 | struct osc_object *osc, pdl_policy_t pol) | |
2245 | { | |
2246 | (void)osc_io_unplug0(env, cli, osc, pol, 0); | |
2247 | } | |
2248 | ||
2249 | int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, | |
2250 | struct page *page, loff_t offset) | |
2251 | { | |
2252 | struct obd_export *exp = osc_export(osc); | |
2253 | struct osc_async_page *oap = &ops->ops_oap; | |
2254 | ENTRY; | |
2255 | ||
2256 | if (!page) | |
2257 | return cfs_size_round(sizeof(*oap)); | |
2258 | ||
2259 | oap->oap_magic = OAP_MAGIC; | |
2260 | oap->oap_cli = &exp->exp_obd->u.cli; | |
2261 | oap->oap_obj = osc; | |
2262 | ||
2263 | oap->oap_page = page; | |
2264 | oap->oap_obj_off = offset; | |
2265 | LASSERT(!(offset & ~CFS_PAGE_MASK)); | |
2266 | ||
2267 | if (!client_is_remote(exp) && cfs_capable(CFS_CAP_SYS_RESOURCE)) | |
2268 | oap->oap_brw_flags = OBD_BRW_NOQUOTA; | |
2269 | ||
2270 | INIT_LIST_HEAD(&oap->oap_pending_item); | |
2271 | INIT_LIST_HEAD(&oap->oap_rpc_item); | |
2272 | ||
2273 | spin_lock_init(&oap->oap_lock); | |
2274 | CDEBUG(D_INFO, "oap %p page %p obj off "LPU64"\n", | |
2275 | oap, page, oap->oap_obj_off); | |
2276 | RETURN(0); | |
2277 | } | |
2278 | ||
2279 | int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, | |
2280 | struct osc_page *ops) | |
2281 | { | |
2282 | struct osc_io *oio = osc_env_io(env); | |
2283 | struct osc_extent *ext = NULL; | |
2284 | struct osc_async_page *oap = &ops->ops_oap; | |
2285 | struct client_obd *cli = oap->oap_cli; | |
2286 | struct osc_object *osc = oap->oap_obj; | |
2287 | pgoff_t index; | |
2288 | int grants = 0; | |
2289 | int brw_flags = OBD_BRW_ASYNC; | |
2290 | int cmd = OBD_BRW_WRITE; | |
2291 | int need_release = 0; | |
2292 | int rc = 0; | |
2293 | ENTRY; | |
2294 | ||
2295 | if (oap->oap_magic != OAP_MAGIC) | |
2296 | RETURN(-EINVAL); | |
2297 | ||
2298 | if (cli->cl_import == NULL || cli->cl_import->imp_invalid) | |
2299 | RETURN(-EIO); | |
2300 | ||
2301 | if (!list_empty(&oap->oap_pending_item) || | |
2302 | !list_empty(&oap->oap_rpc_item)) | |
2303 | RETURN(-EBUSY); | |
2304 | ||
2305 | /* Set the OBD_BRW_SRVLOCK before the page is queued. */ | |
2306 | brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; | |
2307 | if (!client_is_remote(osc_export(osc)) && | |
2308 | cfs_capable(CFS_CAP_SYS_RESOURCE)) { | |
2309 | brw_flags |= OBD_BRW_NOQUOTA; | |
2310 | cmd |= OBD_BRW_NOQUOTA; | |
2311 | } | |
2312 | ||
2313 | /* check if the file's owner/group is over quota */ | |
2314 | if (!(cmd & OBD_BRW_NOQUOTA)) { | |
2315 | struct cl_object *obj; | |
2316 | struct cl_attr *attr; | |
2317 | unsigned int qid[MAXQUOTAS]; | |
2318 | ||
2319 | obj = cl_object_top(&osc->oo_cl); | |
2320 | attr = &osc_env_info(env)->oti_attr; | |
2321 | ||
2322 | cl_object_attr_lock(obj); | |
2323 | rc = cl_object_attr_get(env, obj, attr); | |
2324 | cl_object_attr_unlock(obj); | |
2325 | ||
2326 | qid[USRQUOTA] = attr->cat_uid; | |
2327 | qid[GRPQUOTA] = attr->cat_gid; | |
2328 | if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) | |
2329 | rc = -EDQUOT; | |
2330 | if (rc) | |
2331 | RETURN(rc); | |
2332 | } | |
2333 | ||
2334 | oap->oap_cmd = cmd; | |
2335 | oap->oap_page_off = ops->ops_from; | |
2336 | oap->oap_count = ops->ops_to - ops->ops_from; | |
2337 | oap->oap_async_flags = 0; | |
2338 | oap->oap_brw_flags = brw_flags; | |
2339 | ||
2340 | OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", | |
2341 | oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); | |
2342 | ||
2343 | index = oap2cl_page(oap)->cp_index; | |
2344 | ||
2345 | /* Add this page into extent by the following steps: | |
2346 | * 1. if there exists an active extent for this IO, mostly this page | |
2347 | * can be added to the active extent and sometimes we need to | |
2348 | * expand extent to accomodate this page; | |
2349 | * 2. otherwise, a new extent will be allocated. */ | |
2350 | ||
2351 | ext = oio->oi_active; | |
2352 | if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) { | |
2353 | /* one chunk plus extent overhead must be enough to write this | |
2354 | * page */ | |
2355 | grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; | |
2356 | if (ext->oe_end >= index) | |
2357 | grants = 0; | |
2358 | ||
2359 | /* it doesn't need any grant to dirty this page */ | |
2360 | client_obd_list_lock(&cli->cl_loi_list_lock); | |
2361 | rc = osc_enter_cache_try(cli, oap, grants, 0); | |
2362 | client_obd_list_unlock(&cli->cl_loi_list_lock); | |
2363 | if (rc == 0) { /* try failed */ | |
2364 | grants = 0; | |
2365 | need_release = 1; | |
2366 | } else if (ext->oe_end < index) { | |
2367 | int tmp = grants; | |
2368 | /* try to expand this extent */ | |
2369 | rc = osc_extent_expand(ext, index, &tmp); | |
2370 | if (rc < 0) { | |
2371 | need_release = 1; | |
2372 | /* don't free reserved grant */ | |
2373 | } else { | |
2374 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2375 | "expanded for %lu.\n", index); | |
2376 | osc_unreserve_grant(cli, grants, tmp); | |
2377 | grants = 0; | |
2378 | } | |
2379 | } | |
2380 | rc = 0; | |
2381 | } else if (ext != NULL) { | |
2382 | /* index is located outside of active extent */ | |
2383 | need_release = 1; | |
2384 | } | |
2385 | if (need_release) { | |
2386 | osc_extent_release(env, ext); | |
2387 | oio->oi_active = NULL; | |
2388 | ext = NULL; | |
2389 | } | |
2390 | ||
2391 | if (ext == NULL) { | |
2392 | int tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; | |
2393 | ||
2394 | /* try to find new extent to cover this page */ | |
2395 | LASSERT(oio->oi_active == NULL); | |
2396 | /* we may have allocated grant for this page if we failed | |
2397 | * to expand the previous active extent. */ | |
2398 | LASSERT(ergo(grants > 0, grants >= tmp)); | |
2399 | ||
2400 | rc = 0; | |
2401 | if (grants == 0) { | |
2402 | /* we haven't allocated grant for this page. */ | |
2403 | rc = osc_enter_cache(env, cli, oap, tmp); | |
2404 | if (rc == 0) | |
2405 | grants = tmp; | |
2406 | } | |
2407 | ||
2408 | tmp = grants; | |
2409 | if (rc == 0) { | |
2410 | ext = osc_extent_find(env, osc, index, &tmp); | |
2411 | if (IS_ERR(ext)) { | |
2412 | LASSERT(tmp == grants); | |
2413 | osc_exit_cache(cli, oap); | |
2414 | rc = PTR_ERR(ext); | |
2415 | ext = NULL; | |
2416 | } else { | |
2417 | oio->oi_active = ext; | |
2418 | } | |
2419 | } | |
2420 | if (grants > 0) | |
2421 | osc_unreserve_grant(cli, grants, tmp); | |
2422 | } | |
2423 | ||
2424 | LASSERT(ergo(rc == 0, ext != NULL)); | |
2425 | if (ext != NULL) { | |
2426 | EASSERTF(ext->oe_end >= index && ext->oe_start <= index, | |
2427 | ext, "index = %lu.\n", index); | |
2428 | LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); | |
2429 | ||
2430 | osc_object_lock(osc); | |
2431 | if (ext->oe_nr_pages == 0) | |
2432 | ext->oe_srvlock = ops->ops_srvlock; | |
2433 | else | |
2434 | LASSERT(ext->oe_srvlock == ops->ops_srvlock); | |
2435 | ++ext->oe_nr_pages; | |
2436 | list_add_tail(&oap->oap_pending_item, &ext->oe_pages); | |
2437 | osc_object_unlock(osc); | |
2438 | } | |
2439 | RETURN(rc); | |
2440 | } | |
2441 | ||
2442 | int osc_teardown_async_page(const struct lu_env *env, | |
2443 | struct osc_object *obj, struct osc_page *ops) | |
2444 | { | |
2445 | struct osc_async_page *oap = &ops->ops_oap; | |
2446 | struct osc_extent *ext = NULL; | |
2447 | int rc = 0; | |
2448 | ENTRY; | |
2449 | ||
2450 | LASSERT(oap->oap_magic == OAP_MAGIC); | |
2451 | ||
2452 | CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", | |
2453 | oap, ops, oap2cl_page(oap)->cp_index); | |
2454 | ||
2455 | osc_object_lock(obj); | |
2456 | if (!list_empty(&oap->oap_rpc_item)) { | |
2457 | CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); | |
2458 | rc = -EBUSY; | |
2459 | } else if (!list_empty(&oap->oap_pending_item)) { | |
2460 | ext = osc_extent_lookup(obj, oap2cl_page(oap)->cp_index); | |
2461 | /* only truncated pages are allowed to be taken out. | |
2462 | * See osc_extent_truncate() and osc_cache_truncate_start() | |
2463 | * for details. */ | |
2464 | if (ext != NULL && ext->oe_state != OES_TRUNC) { | |
2465 | OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", | |
2466 | oap2cl_page(oap)->cp_index); | |
2467 | rc = -EBUSY; | |
2468 | } | |
2469 | } | |
2470 | osc_object_unlock(obj); | |
2471 | if (ext != NULL) | |
2472 | osc_extent_put(env, ext); | |
2473 | RETURN(rc); | |
2474 | } | |
2475 | ||
2476 | /** | |
2477 | * This is called when a page is picked up by kernel to write out. | |
2478 | * | |
2479 | * We should find out the corresponding extent and add the whole extent | |
2480 | * into urgent list. The extent may be being truncated or used, handle it | |
2481 | * carefully. | |
2482 | */ | |
2483 | int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, | |
2484 | struct osc_page *ops) | |
2485 | { | |
2486 | struct osc_extent *ext = NULL; | |
2487 | struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); | |
2488 | struct cl_page *cp = ops->ops_cl.cpl_page; | |
2489 | pgoff_t index = cp->cp_index; | |
2490 | struct osc_async_page *oap = &ops->ops_oap; | |
2491 | bool unplug = false; | |
2492 | int rc = 0; | |
2493 | ENTRY; | |
2494 | ||
2495 | osc_object_lock(obj); | |
2496 | ext = osc_extent_lookup(obj, index); | |
2497 | if (ext == NULL) { | |
2498 | osc_extent_tree_dump(D_ERROR, obj); | |
2499 | LASSERTF(0, "page index %lu is NOT covered.\n", index); | |
2500 | } | |
2501 | ||
2502 | switch (ext->oe_state) { | |
2503 | case OES_RPC: | |
2504 | case OES_LOCK_DONE: | |
2505 | CL_PAGE_DEBUG(D_ERROR, env, cl_page_top(cp), | |
2506 | "flush an in-rpc page?\n"); | |
2507 | LASSERT(0); | |
2508 | break; | |
2509 | case OES_LOCKING: | |
2510 | /* If we know this extent is being written out, we should abort | |
2511 | * so that the writer can make this page ready. Otherwise, there | |
2512 | * exists a deadlock problem because other process can wait for | |
2513 | * page writeback bit holding page lock; and meanwhile in | |
2514 | * vvp_page_make_ready(), we need to grab page lock before | |
2515 | * really sending the RPC. */ | |
2516 | case OES_TRUNC: | |
2517 | /* race with truncate, page will be redirtied */ | |
2518 | GOTO(out, rc = -EAGAIN); | |
2519 | default: | |
2520 | break; | |
2521 | } | |
2522 | ||
2523 | rc = cl_page_prep(env, io, cl_page_top(cp), CRT_WRITE); | |
2524 | if (rc) | |
2525 | GOTO(out, rc); | |
2526 | ||
2527 | spin_lock(&oap->oap_lock); | |
2528 | oap->oap_async_flags |= ASYNC_READY|ASYNC_URGENT; | |
2529 | spin_unlock(&oap->oap_lock); | |
2530 | ||
2531 | if (memory_pressure_get()) | |
2532 | ext->oe_memalloc = 1; | |
2533 | ||
2534 | ext->oe_urgent = 1; | |
2535 | if (ext->oe_state == OES_CACHE) { | |
2536 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2537 | "flush page %p make it urgent.\n", oap); | |
2538 | if (list_empty(&ext->oe_link)) | |
2539 | list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); | |
2540 | unplug = true; | |
2541 | } | |
2542 | rc = 0; | |
2543 | EXIT; | |
2544 | ||
2545 | out: | |
2546 | osc_object_unlock(obj); | |
2547 | osc_extent_put(env, ext); | |
2548 | if (unplug) | |
2549 | osc_io_unplug_async(env, osc_cli(obj), obj); | |
2550 | return rc; | |
2551 | } | |
2552 | ||
2553 | /** | |
2554 | * this is called when a sync waiter receives an interruption. Its job is to | |
2555 | * get the caller woken as soon as possible. If its page hasn't been put in an | |
2556 | * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as | |
2557 | * desiring interruption which will forcefully complete the rpc once the rpc | |
2558 | * has timed out. | |
2559 | */ | |
2560 | int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) | |
2561 | { | |
2562 | struct osc_async_page *oap = &ops->ops_oap; | |
2563 | struct osc_object *obj = oap->oap_obj; | |
2564 | struct client_obd *cli = osc_cli(obj); | |
2565 | struct osc_extent *ext; | |
2566 | struct osc_extent *found = NULL; | |
2567 | struct list_head *plist; | |
2568 | pgoff_t index = oap2cl_page(oap)->cp_index; | |
2569 | int rc = -EBUSY; | |
2570 | int cmd; | |
2571 | ENTRY; | |
2572 | ||
2573 | LASSERT(!oap->oap_interrupted); | |
2574 | oap->oap_interrupted = 1; | |
2575 | ||
2576 | /* Find out the caching extent */ | |
2577 | osc_object_lock(obj); | |
2578 | if (oap->oap_cmd & OBD_BRW_WRITE) { | |
2579 | plist = &obj->oo_urgent_exts; | |
2580 | cmd = OBD_BRW_WRITE; | |
2581 | } else { | |
2582 | plist = &obj->oo_reading_exts; | |
2583 | cmd = OBD_BRW_READ; | |
2584 | } | |
2585 | list_for_each_entry(ext, plist, oe_link) { | |
2586 | if (ext->oe_start <= index && ext->oe_end >= index) { | |
2587 | LASSERT(ext->oe_state == OES_LOCK_DONE); | |
2588 | /* For OES_LOCK_DONE state extent, it has already held | |
2589 | * a refcount for RPC. */ | |
2590 | found = osc_extent_get(ext); | |
2591 | break; | |
2592 | } | |
2593 | } | |
2594 | if (found != NULL) { | |
2595 | list_del_init(&found->oe_link); | |
2596 | osc_update_pending(obj, cmd, -found->oe_nr_pages); | |
2597 | osc_object_unlock(obj); | |
2598 | ||
2599 | osc_extent_finish(env, found, 0, -EINTR); | |
2600 | osc_extent_put(env, found); | |
2601 | rc = 0; | |
2602 | } else { | |
2603 | osc_object_unlock(obj); | |
2604 | /* ok, it's been put in an rpc. only one oap gets a request | |
2605 | * reference */ | |
2606 | if (oap->oap_request != NULL) { | |
2607 | ptlrpc_mark_interrupted(oap->oap_request); | |
2608 | ptlrpcd_wake(oap->oap_request); | |
2609 | ptlrpc_req_finished(oap->oap_request); | |
2610 | oap->oap_request = NULL; | |
2611 | } | |
2612 | } | |
2613 | ||
2614 | osc_list_maint(cli, obj); | |
2615 | RETURN(rc); | |
2616 | } | |
2617 | ||
2618 | int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, | |
2619 | struct list_head *list, int cmd, int brw_flags) | |
2620 | { | |
2621 | struct client_obd *cli = osc_cli(obj); | |
2622 | struct osc_extent *ext; | |
2623 | struct osc_async_page *oap; | |
2624 | int page_count = 0; | |
2625 | int mppr = cli->cl_max_pages_per_rpc; | |
2626 | pgoff_t start = CL_PAGE_EOF; | |
2627 | pgoff_t end = 0; | |
2628 | ENTRY; | |
2629 | ||
2630 | list_for_each_entry(oap, list, oap_pending_item) { | |
2631 | struct cl_page *cp = oap2cl_page(oap); | |
2632 | if (cp->cp_index > end) | |
2633 | end = cp->cp_index; | |
2634 | if (cp->cp_index < start) | |
2635 | start = cp->cp_index; | |
2636 | ++page_count; | |
2637 | mppr <<= (page_count > mppr); | |
2638 | } | |
2639 | ||
2640 | ext = osc_extent_alloc(obj); | |
2641 | if (ext == NULL) { | |
2642 | list_for_each_entry(oap, list, oap_pending_item) { | |
2643 | list_del_init(&oap->oap_pending_item); | |
2644 | osc_ap_completion(env, cli, oap, 0, -ENOMEM); | |
2645 | } | |
2646 | RETURN(-ENOMEM); | |
2647 | } | |
2648 | ||
2649 | ext->oe_rw = !!(cmd & OBD_BRW_READ); | |
2650 | ext->oe_urgent = 1; | |
2651 | ext->oe_start = start; | |
2652 | ext->oe_end = ext->oe_max_end = end; | |
2653 | ext->oe_obj = obj; | |
2654 | ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); | |
2655 | ext->oe_nr_pages = page_count; | |
2656 | ext->oe_mppr = mppr; | |
2657 | list_splice_init(list, &ext->oe_pages); | |
2658 | ||
2659 | osc_object_lock(obj); | |
2660 | /* Reuse the initial refcount for RPC, don't drop it */ | |
2661 | osc_extent_state_set(ext, OES_LOCK_DONE); | |
2662 | if (cmd & OBD_BRW_WRITE) { | |
2663 | list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); | |
2664 | osc_update_pending(obj, OBD_BRW_WRITE, page_count); | |
2665 | } else { | |
2666 | list_add_tail(&ext->oe_link, &obj->oo_reading_exts); | |
2667 | osc_update_pending(obj, OBD_BRW_READ, page_count); | |
2668 | } | |
2669 | osc_object_unlock(obj); | |
2670 | ||
2671 | osc_io_unplug(env, cli, obj, PDL_POLICY_ROUND); | |
2672 | RETURN(0); | |
2673 | } | |
2674 | ||
2675 | /** | |
2676 | * Called by osc_io_setattr_start() to freeze and destroy covering extents. | |
2677 | */ | |
2678 | int osc_cache_truncate_start(const struct lu_env *env, struct osc_io *oio, | |
2679 | struct osc_object *obj, __u64 size) | |
2680 | { | |
2681 | struct client_obd *cli = osc_cli(obj); | |
2682 | struct osc_extent *ext; | |
2683 | struct osc_extent *waiting = NULL; | |
2684 | pgoff_t index; | |
2685 | LIST_HEAD(list); | |
2686 | int result = 0; | |
2687 | bool partial; | |
2688 | ENTRY; | |
2689 | ||
2690 | /* pages with index greater or equal to index will be truncated. */ | |
2691 | index = cl_index(osc2cl(obj), size); | |
2692 | partial = size > cl_offset(osc2cl(obj), index); | |
2693 | ||
2694 | again: | |
2695 | osc_object_lock(obj); | |
2696 | ext = osc_extent_search(obj, index); | |
2697 | if (ext == NULL) | |
2698 | ext = first_extent(obj); | |
2699 | else if (ext->oe_end < index) | |
2700 | ext = next_extent(ext); | |
2701 | while (ext != NULL) { | |
2702 | EASSERT(ext->oe_state != OES_TRUNC, ext); | |
2703 | ||
2704 | if (ext->oe_state > OES_CACHE || ext->oe_urgent) { | |
2705 | /* if ext is in urgent state, it means there must exist | |
2706 | * a page already having been flushed by write_page(). | |
2707 | * We have to wait for this extent because we can't | |
2708 | * truncate that page. */ | |
2709 | LASSERT(!ext->oe_hp); | |
2710 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2711 | "waiting for busy extent\n"); | |
2712 | waiting = osc_extent_get(ext); | |
2713 | break; | |
2714 | } | |
2715 | ||
2716 | OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:"LPU64".\n", size); | |
2717 | ||
2718 | osc_extent_get(ext); | |
2719 | if (ext->oe_state == OES_ACTIVE) { | |
2720 | /* though we grab inode mutex for write path, but we | |
2721 | * release it before releasing extent(in osc_io_end()), | |
2722 | * so there is a race window that an extent is still | |
2723 | * in OES_ACTIVE when truncate starts. */ | |
2724 | LASSERT(!ext->oe_trunc_pending); | |
2725 | ext->oe_trunc_pending = 1; | |
2726 | } else { | |
2727 | EASSERT(ext->oe_state == OES_CACHE, ext); | |
2728 | osc_extent_state_set(ext, OES_TRUNC); | |
2729 | osc_update_pending(obj, OBD_BRW_WRITE, | |
2730 | -ext->oe_nr_pages); | |
2731 | } | |
2732 | EASSERT(list_empty(&ext->oe_link), ext); | |
2733 | list_add_tail(&ext->oe_link, &list); | |
2734 | ||
2735 | ext = next_extent(ext); | |
2736 | } | |
2737 | osc_object_unlock(obj); | |
2738 | ||
2739 | osc_list_maint(cli, obj); | |
2740 | ||
2741 | while (!list_empty(&list)) { | |
2742 | int rc; | |
2743 | ||
2744 | ext = list_entry(list.next, struct osc_extent, oe_link); | |
2745 | list_del_init(&ext->oe_link); | |
2746 | ||
2747 | /* extent may be in OES_ACTIVE state because inode mutex | |
2748 | * is released before osc_io_end() in file write case */ | |
2749 | if (ext->oe_state != OES_TRUNC) | |
2750 | osc_extent_wait(env, ext, OES_TRUNC); | |
2751 | ||
2752 | rc = osc_extent_truncate(ext, index, partial); | |
2753 | if (rc < 0) { | |
2754 | if (result == 0) | |
2755 | result = rc; | |
2756 | ||
2757 | OSC_EXTENT_DUMP(D_ERROR, ext, | |
2758 | "truncate error %d\n", rc); | |
2759 | } else if (ext->oe_nr_pages == 0) { | |
2760 | osc_extent_remove(ext); | |
2761 | } else { | |
2762 | /* this must be an overlapped extent which means only | |
2763 | * part of pages in this extent have been truncated. | |
2764 | */ | |
2765 | EASSERTF(ext->oe_start <= index, ext, | |
2766 | "trunc index = %lu/%d.\n", index, partial); | |
2767 | /* fix index to skip this partially truncated extent */ | |
2768 | index = ext->oe_end + 1; | |
2769 | partial = false; | |
2770 | ||
2771 | /* we need to hold this extent in OES_TRUNC state so | |
2772 | * that no writeback will happen. This is to avoid | |
2773 | * BUG 17397. */ | |
2774 | LASSERT(oio->oi_trunc == NULL); | |
2775 | oio->oi_trunc = osc_extent_get(ext); | |
2776 | OSC_EXTENT_DUMP(D_CACHE, ext, | |
2777 | "trunc at "LPU64"\n", size); | |
2778 | } | |
2779 | osc_extent_put(env, ext); | |
2780 | } | |
2781 | if (waiting != NULL) { | |
2782 | int rc; | |
2783 | ||
2784 | /* ignore the result of osc_extent_wait the write initiator | |
2785 | * should take care of it. */ | |
2786 | rc = osc_extent_wait(env, waiting, OES_INV); | |
2787 | if (rc < 0) | |
2788 | OSC_EXTENT_DUMP(D_CACHE, ext, "wait error: %d.\n", rc); | |
2789 | ||
2790 | osc_extent_put(env, waiting); | |
2791 | waiting = NULL; | |
2792 | goto again; | |
2793 | } | |
2794 | RETURN(result); | |
2795 | } | |
2796 | ||
2797 | /** | |
2798 | * Called after osc_io_setattr_end to add oio->oi_trunc back to cache. | |
2799 | */ | |
2800 | void osc_cache_truncate_end(const struct lu_env *env, struct osc_io *oio, | |
2801 | struct osc_object *obj) | |
2802 | { | |
2803 | struct osc_extent *ext = oio->oi_trunc; | |
2804 | ||
2805 | oio->oi_trunc = NULL; | |
2806 | if (ext != NULL) { | |
2807 | bool unplug = false; | |
2808 | ||
2809 | EASSERT(ext->oe_nr_pages > 0, ext); | |
2810 | EASSERT(ext->oe_state == OES_TRUNC, ext); | |
2811 | EASSERT(!ext->oe_urgent, ext); | |
2812 | ||
2813 | OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n"); | |
2814 | osc_object_lock(obj); | |
2815 | osc_extent_state_set(ext, OES_CACHE); | |
2816 | if (ext->oe_fsync_wait && !ext->oe_urgent) { | |
2817 | ext->oe_urgent = 1; | |
2818 | list_move_tail(&ext->oe_link, &obj->oo_urgent_exts); | |
2819 | unplug = true; | |
2820 | } | |
2821 | osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); | |
2822 | osc_object_unlock(obj); | |
2823 | osc_extent_put(env, ext); | |
2824 | ||
2825 | if (unplug) | |
2826 | osc_io_unplug_async(env, osc_cli(obj), obj); | |
2827 | } | |
2828 | } | |
2829 | ||
2830 | /** | |
2831 | * Wait for extents in a specific range to be written out. | |
2832 | * The caller must have called osc_cache_writeback_range() to issue IO | |
2833 | * otherwise it will take a long time for this function to finish. | |
2834 | * | |
2835 | * Caller must hold inode_mutex , or cancel exclusive dlm lock so that | |
2836 | * nobody else can dirty this range of file while we're waiting for | |
2837 | * extents to be written. | |
2838 | */ | |
2839 | int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, | |
2840 | pgoff_t start, pgoff_t end) | |
2841 | { | |
2842 | struct osc_extent *ext; | |
2843 | pgoff_t index = start; | |
2844 | int result = 0; | |
2845 | ENTRY; | |
2846 | ||
2847 | again: | |
2848 | osc_object_lock(obj); | |
2849 | ext = osc_extent_search(obj, index); | |
2850 | if (ext == NULL) | |
2851 | ext = first_extent(obj); | |
2852 | else if (ext->oe_end < index) | |
2853 | ext = next_extent(ext); | |
2854 | while (ext != NULL) { | |
2855 | int rc; | |
2856 | ||
2857 | if (ext->oe_start > end) | |
2858 | break; | |
2859 | ||
2860 | if (!ext->oe_fsync_wait) { | |
2861 | ext = next_extent(ext); | |
2862 | continue; | |
2863 | } | |
2864 | ||
2865 | EASSERT(ergo(ext->oe_state == OES_CACHE, | |
2866 | ext->oe_hp || ext->oe_urgent), ext); | |
2867 | EASSERT(ergo(ext->oe_state == OES_ACTIVE, | |
2868 | !ext->oe_hp && ext->oe_urgent), ext); | |
2869 | ||
2870 | index = ext->oe_end + 1; | |
2871 | osc_extent_get(ext); | |
2872 | osc_object_unlock(obj); | |
2873 | ||
2874 | rc = osc_extent_wait(env, ext, OES_INV); | |
2875 | if (result == 0) | |
2876 | result = rc; | |
2877 | osc_extent_put(env, ext); | |
2878 | goto again; | |
2879 | } | |
2880 | osc_object_unlock(obj); | |
2881 | ||
2882 | OSC_IO_DEBUG(obj, "sync file range.\n"); | |
2883 | RETURN(result); | |
2884 | } | |
2885 | ||
2886 | /** | |
2887 | * Called to write out a range of osc object. | |
2888 | * | |
2889 | * @hp : should be set this is caused by lock cancel; | |
2890 | * @discard: is set if dirty pages should be dropped - file will be deleted or | |
2891 | * truncated, this implies there is no partially discarding extents. | |
2892 | * | |
2893 | * Return how many pages will be issued, or error code if error occurred. | |
2894 | */ | |
2895 | int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, | |
2896 | pgoff_t start, pgoff_t end, int hp, int discard) | |
2897 | { | |
2898 | struct osc_extent *ext; | |
2899 | LIST_HEAD(discard_list); | |
2900 | bool unplug = false; | |
2901 | int result = 0; | |
2902 | ENTRY; | |
2903 | ||
2904 | osc_object_lock(obj); | |
2905 | ext = osc_extent_search(obj, start); | |
2906 | if (ext == NULL) | |
2907 | ext = first_extent(obj); | |
2908 | else if (ext->oe_end < start) | |
2909 | ext = next_extent(ext); | |
2910 | while (ext != NULL) { | |
2911 | if (ext->oe_start > end) | |
2912 | break; | |
2913 | ||
2914 | ext->oe_fsync_wait = 1; | |
2915 | switch (ext->oe_state) { | |
2916 | case OES_CACHE: | |
2917 | result += ext->oe_nr_pages; | |
2918 | if (!discard) { | |
2919 | struct list_head *list = NULL; | |
2920 | if (hp) { | |
2921 | EASSERT(!ext->oe_hp, ext); | |
2922 | ext->oe_hp = 1; | |
2923 | list = &obj->oo_hp_exts; | |
2924 | } else if (!ext->oe_urgent) { | |
2925 | ext->oe_urgent = 1; | |
2926 | list = &obj->oo_urgent_exts; | |
2927 | } | |
2928 | if (list != NULL) | |
2929 | list_move_tail(&ext->oe_link, list); | |
2930 | unplug = true; | |
2931 | } else { | |
2932 | /* the only discarder is lock cancelling, so | |
2933 | * [start, end] must contain this extent */ | |
2934 | EASSERT(ext->oe_start >= start && | |
2935 | ext->oe_max_end <= end, ext); | |
2936 | osc_extent_state_set(ext, OES_LOCKING); | |
2937 | ext->oe_owner = current; | |
2938 | list_move_tail(&ext->oe_link, | |
2939 | &discard_list); | |
2940 | osc_update_pending(obj, OBD_BRW_WRITE, | |
2941 | -ext->oe_nr_pages); | |
2942 | } | |
2943 | break; | |
2944 | case OES_ACTIVE: | |
2945 | /* It's pretty bad to wait for ACTIVE extents, because | |
2946 | * we don't know how long we will wait for it to be | |
2947 | * flushed since it may be blocked at awaiting more | |
2948 | * grants. We do this for the correctness of fsync. */ | |
2949 | LASSERT(hp == 0 && discard == 0); | |
2950 | ext->oe_urgent = 1; | |
2951 | break; | |
2952 | case OES_TRUNC: | |
2953 | /* this extent is being truncated, can't do anything | |
2954 | * for it now. it will be set to urgent after truncate | |
2955 | * is finished in osc_cache_truncate_end(). */ | |
2956 | default: | |
2957 | break; | |
2958 | } | |
2959 | ext = next_extent(ext); | |
2960 | } | |
2961 | osc_object_unlock(obj); | |
2962 | ||
2963 | LASSERT(ergo(!discard, list_empty(&discard_list))); | |
2964 | if (!list_empty(&discard_list)) { | |
2965 | struct osc_extent *tmp; | |
2966 | int rc; | |
2967 | ||
2968 | osc_list_maint(osc_cli(obj), obj); | |
2969 | list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { | |
2970 | list_del_init(&ext->oe_link); | |
2971 | EASSERT(ext->oe_state == OES_LOCKING, ext); | |
2972 | ||
2973 | /* Discard caching pages. We don't actually write this | |
2974 | * extent out but we complete it as if we did. */ | |
2975 | rc = osc_extent_make_ready(env, ext); | |
2976 | if (unlikely(rc < 0)) { | |
2977 | OSC_EXTENT_DUMP(D_ERROR, ext, | |
2978 | "make_ready returned %d\n", rc); | |
2979 | if (result >= 0) | |
2980 | result = rc; | |
2981 | } | |
2982 | ||
2983 | /* finish the extent as if the pages were sent */ | |
2984 | osc_extent_finish(env, ext, 0, 0); | |
2985 | } | |
2986 | } | |
2987 | ||
2988 | if (unplug) | |
2989 | osc_io_unplug(env, osc_cli(obj), obj, PDL_POLICY_ROUND); | |
2990 | ||
2991 | if (hp || discard) { | |
2992 | int rc; | |
2993 | rc = osc_cache_wait_range(env, obj, start, end); | |
2994 | if (result >= 0 && rc < 0) | |
2995 | result = rc; | |
2996 | } | |
2997 | ||
2998 | OSC_IO_DEBUG(obj, "cache page out.\n"); | |
2999 | RETURN(result); | |
3000 | } | |
3001 | ||
3002 | /** @} osc */ |