/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */
#ifndef _LUSTRE_CL_OBJECT_H
#define _LUSTRE_CL_OBJECT_H

/** \defgroup clio clio
 *
 * Client objects implement io operations and cache pages.
 *
 * Examples: lov and osc are implementations of the cl interface.
 *
 * Big Theory Statement.
 *
 * Layered objects.
 *
 * The client implementation is based on the following data-types:
 *
 * - cl_object
 *
 * - cl_page
 *
 * - cl_lock represents an extent lock on an object.
 *
 * - cl_io represents a high-level i/o activity such as a whole read/write
 *   system call, or the write-out of pages from under a lock being
 *   canceled. cl_io has sub-ios that can be stopped and resumed
 *   independently, thus achieving a high degree of transfer
 *   parallelism. A single cl_io can be advanced forward by
 *   multiple threads (although in the most usual case of a
 *   read/write system call it is associated with the single user
 *   thread that issued the system call).
 *
 * Terminology
 *
 * - to avoid confusion, a high-level I/O operation like a read or write
 *   system call is referred to as "an io", whereas a low-level I/O
 *   operation, like an RPC, is referred to as "a transfer"
 *
 * - "generic code" means generic (not file system specific) code in the
 *   hosting environment. "cl-code" means code (mostly in cl_*.c files) that
 *   is not layer specific.
 *
 * Locking.
 *
 * - i_mutex
 * - PG_locked
 * - cl_object_header::coh_page_guard
 * - lu_site::ls_guard
 *
 * See the top comment in cl_object.c for the description of overall locking
 * and reference-counting design.
 *
 * See comments below for the description of i/o, page, and dlm-locking
 * design.
 *
 * @{
 */

/*
 * super-class definitions.
 */
#include "lu_object.h"
#include "lustre_compat.h"
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct inode;

struct cl_device;

struct cl_object;

struct cl_page;
struct cl_page_slice;
struct cl_lock;
struct cl_lock_slice;

struct cl_lock_operations;
struct cl_page_operations;

struct cl_io;
struct cl_io_slice;

struct cl_req_attr;

/**
 * Device in the client stack.
 *
 * \see vvp_device, lov_device, lovsub_device, osc_device
 */
struct cl_device {
	/** Super-class. */
	struct lu_device	cd_lu_dev;
};

/** \addtogroup cl_object cl_object
 * @{
 */
/**
 * "Data attributes" of cl_object. Data attributes can be updated
 * independently for a sub-object, and top-object's attributes are calculated
 * from sub-objects' ones.
 */
struct cl_attr {
	/** Object size, in bytes */
	loff_t		cat_size;
	/**
	 * Known minimal size, in bytes.
	 *
	 * This is only valid when at least one DLM lock is held.
	 */
	loff_t		cat_kms;
	/** Modification time. Measured in seconds since epoch. */
	time64_t	cat_mtime;
	/** Access time. Measured in seconds since epoch. */
	time64_t	cat_atime;
	/** Change time. Measured in seconds since epoch. */
	time64_t	cat_ctime;
	/**
	 * Blocks allocated to this cl_object on the server file system.
	 *
	 * \todo XXX An interface for block size is needed.
	 */
	__u64		cat_blocks;
	/**
	 * User identifier for quota purposes.
	 */
	uid_t		cat_uid;
	/**
	 * Group identifier for quota purposes.
	 */
	gid_t		cat_gid;

	/* nlink of the directory */
	__u64		cat_nlink;
};

/**
 * Fields in cl_attr that are being set.
 */
enum cl_attr_valid {
	CAT_SIZE   = 1 << 0,
	CAT_KMS    = 1 << 1,
	CAT_MTIME  = 1 << 3,
	CAT_ATIME  = 1 << 4,
	CAT_CTIME  = 1 << 5,
	CAT_BLOCKS = 1 << 6,
	CAT_UID    = 1 << 7,
	CAT_GID    = 1 << 8
};

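/*
 * Usage sketch (illustrative only, error handling elided): updating times
 * on an object under the attribute guard of the top-object, with a bitmask
 * saying which fields are valid; \a attr typically points at scratch space
 * obtained from the lu_env.
 *
 *	attr->cat_mtime = attr->cat_ctime = ktime_get_real_seconds();
 *	cl_object_attr_lock(obj);
 *	rc = cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME);
 *	cl_object_attr_unlock(obj);
 */
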
/**
 * Sub-class of lu_object with methods common for objects on the client
 * stacks.
 *
 * cl_object: represents a regular file system object, both a file and a
 * stripe. cl_object is based on lu_object: it is identified by a fid,
 * layered, cached, hashed, and lrued. An important distinction from the
 * server side, where md_object and dt_object are used, is that cl_object
 * "fans out" at the lov/sns level: depending on the file layout, a single
 * file is represented as a set of "sub-objects" (stripes). At the
 * implementation level, struct lov_object contains an array of cl_objects.
 * Each sub-object is a full-fledged cl_object, having its own fid and
 * living in the lru and hash table.
 *
 * This leads to the next important difference with the server side: on the
 * client, it's quite usual to have objects with different sequences of
 * layers. For example, a typical top-object is composed of the following
 * layers:
 *
 * - vvp
 * - lov
 *
 * whereas its sub-objects are composed of
 *
 * - lovsub
 * - osc
 *
 * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
 * track of the object-subobject relationship.
 *
 * Sub-objects are not cached independently: when a top-object is about to
 * be discarded from memory, all its sub-objects are torn down and destroyed
 * too.
 *
 * \see vvp_object, lov_object, lovsub_object, osc_object
 */
struct cl_object {
	/** super class */
	struct lu_object	co_lu;
	/** per-object-layer operations */
	const struct cl_object_operations *co_ops;
	/** offset of page slice in cl_page buffer */
	int			co_slice_off;
};

/**
 * Description of the client object configuration. This is used for the
 * creation of a new client object that is identified by more state than
 * just a fid.
 */
struct cl_object_conf {
	/** Super-class. */
	struct lu_object_conf	coc_lu;
	union {
		/**
		 * Object layout. This is consumed by lov.
		 */
		struct lu_buf	coc_layout;
		/**
		 * Description of particular stripe location in the
		 * cluster. This is consumed by osc.
		 */
		struct lov_oinfo *coc_oinfo;
	} u;
	/**
	 * VFS inode. This is consumed by vvp.
	 */
	struct inode		*coc_inode;
	/**
	 * Layout lock handle.
	 */
	struct ldlm_lock	*coc_lock;
	/**
	 * Operation to handle layout, OBJECT_CONF_XYZ.
	 */
	int			coc_opc;
};

enum {
	/** configure layout, set up a new stripe, must be called while
	 * holding layout lock.
	 */
	OBJECT_CONF_SET = 0,
	/** invalidate the current stripe configuration due to losing
	 * layout lock.
	 */
	OBJECT_CONF_INVALIDATE = 1,
	/** wait for old layout to go away so that new layout can be set up. */
	OBJECT_CONF_WAIT = 2
};

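/*
 * Usage sketch (illustrative, loosely modelled on the llite layout code;
 * the exact callers differ): dropping the stripe configuration when the
 * layout lock is lost.
 *
 *	struct cl_object_conf conf = {
 *		.coc_opc   = OBJECT_CONF_INVALIDATE,
 *		.coc_inode = inode,
 *	};
 *	rc = cl_conf_set(env, obj, &conf);
 */
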
enum {
	CL_LAYOUT_GEN_NONE  = (u32)-2,	/* layout lock was cancelled */
	CL_LAYOUT_GEN_EMPTY = (u32)-1,	/* for empty layout */
};

struct cl_layout {
	/** the buffer to return the layout in lov_mds_md format. */
	struct lu_buf	cl_buf;
	/** size of layout in lov_mds_md format. */
	size_t		cl_size;
	/** Layout generation. */
	u32		cl_layout_gen;
};

/**
 * Operations implemented for each cl object layer.
 *
 * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
 */
struct cl_object_operations {
	/**
	 * Initialize page slice for this layer. Called top-to-bottom through
	 * every object layer when a new cl_page is instantiated. A layer
	 * keeping private per-page data, or requiring its own page operations
	 * vector, should allocate this data here and attach it to the page
	 * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
	 * sense). Optional.
	 *
	 * \retval NULL success.
	 *
	 * \retval ERR_PTR(errno) failure code.
	 *
	 * \retval valid-pointer pointer to already existing referenced page
	 *	   to be used instead of newly created.
	 */
	int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
			     struct cl_page *page, pgoff_t index);
	/**
	 * Initialize lock slice for this layer. Called top-to-bottom through
	 * every object layer when a new cl_lock is instantiated. A layer
	 * keeping private per-lock data, or requiring its own lock operations
	 * vector, should allocate this data here and attach it to the lock
	 * by calling cl_lock_slice_add(). Mandatory.
	 */
	int (*coo_lock_init)(const struct lu_env *env,
			     struct cl_object *obj, struct cl_lock *lock,
			     const struct cl_io *io);
	/**
	 * Initialize io state for a given layer.
	 *
	 * Called top-to-bottom once per io existence to initialize io
	 * state. If a layer wants to keep some state for this type of io, it
	 * has to embed struct cl_io_slice in lu_env::le_ses, and register
	 * the slice with cl_io_slice_add(). It is guaranteed that all
	 * threads participating in this io share the same session.
	 */
	int (*coo_io_init)(const struct lu_env *env,
			   struct cl_object *obj, struct cl_io *io);
	/**
	 * Fill portion of \a attr that this layer controls. This method is
	 * called top-to-bottom through all object layers.
	 *
	 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
	 *
	 * \return 0: to continue
	 * \return +ve: to stop iterating through layers (but 0 is returned
	 *	   from enclosing cl_object_attr_get())
	 * \return -ve: to signal error
	 */
	int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
			    struct cl_attr *attr);
	/**
	 * Update attributes.
	 *
	 * \a valid is a bitmask composed from enum #cl_attr_valid, and
	 * indicating what attributes are to be set.
	 *
	 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
	 *
	 * \return the same convention as for
	 * cl_object_operations::coo_attr_get() is used.
	 */
	int (*coo_attr_update)(const struct lu_env *env, struct cl_object *obj,
			       const struct cl_attr *attr, unsigned int valid);
	/**
	 * Update object configuration. Called top-to-bottom to modify object
	 * configuration.
	 *
	 * XXX error conditions and handling.
	 */
	int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
			    const struct cl_object_conf *conf);
	/**
	 * Glimpse ast. Executed when glimpse ast arrives for a lock on this
	 * object. Layers are supposed to fill parts of \a lvb that will be
	 * shipped to the glimpse originator as a glimpse result.
	 *
	 * \see vvp_object_glimpse(), lovsub_object_glimpse(),
	 * \see osc_object_glimpse()
	 */
	int (*coo_glimpse)(const struct lu_env *env,
			   const struct cl_object *obj, struct ost_lvb *lvb);
	/**
	 * Object prune method. Called when the layout is going to change on
	 * this object, therefore each layer has to clean up its cache,
	 * mainly pages and locks.
	 */
	int (*coo_prune)(const struct lu_env *env, struct cl_object *obj);
	/**
	 * Object getstripe method.
	 */
	int (*coo_getstripe)(const struct lu_env *env, struct cl_object *obj,
			     struct lov_user_md __user *lum);
	/**
	 * Get FIEMAP mapping from the object.
	 */
	int (*coo_fiemap)(const struct lu_env *env, struct cl_object *obj,
			  struct ll_fiemap_info_key *fmkey,
			  struct fiemap *fiemap, size_t *buflen);
	/**
	 * Get layout and generation of the object.
	 */
	int (*coo_layout_get)(const struct lu_env *env, struct cl_object *obj,
			      struct cl_layout *layout);
	/**
	 * Get maximum size of the object.
	 */
	loff_t (*coo_maxbytes)(struct cl_object *obj);
	/**
	 * Set request attributes.
	 */
	void (*coo_req_attr_set)(const struct lu_env *env,
				 struct cl_object *obj,
				 struct cl_req_attr *attr);
};

/**
 * Extended header for client object.
 */
struct cl_object_header {
	/** Standard lu_object_header. cl_object::co_lu::lo_header points
	 * here.
	 */
	struct lu_object_header	coh_lu;

	/**
	 * Parent object. It is assumed that an object has a well-defined
	 * parent, but not a well-defined child (there may be multiple
	 * sub-objects, for the same top-object). cl_object_header::coh_parent
	 * field allows certain code to be written generically, without
	 * limiting possible cl_object layouts unduly.
	 */
	struct cl_object_header	*coh_parent;
	/**
	 * Protects consistency between cl_attr of parent object and
	 * attributes of sub-objects, that the former is calculated ("merged")
	 * from.
	 *
	 * \todo XXX this can be read/write lock if needed.
	 */
	spinlock_t		coh_attr_guard;
	/**
	 * Size of cl_page + page slices
	 */
	unsigned short		coh_page_bufsize;
	/**
	 * Number of objects above this one: 0 for a top-object, 1 for its
	 * sub-object, etc.
	 */
	unsigned char		coh_nesting;
};

/**
 * Helper macro: iterate over all layers of the object \a obj, assigning every
 * layer top-to-bottom to \a slice.
 */
#define cl_object_for_each(slice, obj)					\
	list_for_each_entry((slice),					\
			    &(obj)->co_lu.lo_header->loh_layers,	\
			    co_lu.lo_linkage)
/**
 * Helper macro: iterate over all layers of the object \a obj, assigning every
 * layer bottom-to-top to \a slice.
 */
#define cl_object_for_each_reverse(slice, obj)				\
	list_for_each_entry_reverse((slice),				\
				    &(obj)->co_lu.lo_header->loh_layers, \
				    co_lu.lo_linkage)
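
/*
 * Usage sketch (illustrative, modelled on the layer iteration in
 * cl_object.c): call a method on every layer, top-to-bottom, stopping on
 * the first non-zero result.
 *
 *	struct cl_object *slice;
 *	int rc = 0;
 *
 *	cl_object_for_each(slice, obj) {
 *		if (slice->co_ops->coo_attr_get) {
 *			rc = slice->co_ops->coo_attr_get(env, slice, attr);
 *			if (rc != 0)
 *				break;
 *		}
 *	}
 */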
/** @} cl_object */

#define CL_PAGE_EOF ((pgoff_t)~0ull)

/** \addtogroup cl_page cl_page
 * @{
 */

/** \struct cl_page
 * Layered client page.
 *
 * cl_page: represents a portion of a file, cached in memory. All pages
 * of the given file are of the same size, and are kept in the radix tree
 * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
 * of the top-level file object are first class cl_objects, they have their
 * own radix trees of pages and hence page is implemented as a sequence of
 * struct cl_page's, linked into a doubly-linked list through
 * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
 * corresponding radix tree at the corresponding logical offset.
 *
 * cl_page is associated with a VM page of the hosting environment (struct
 * page in the Linux kernel, for example). It is assumed that this
 * association is implemented by one of cl_page layers (top layer in the
 * current design) that
 *
 * - intercepts per-VM-page call-backs made by the environment (e.g.,
 *   memory pressure),
 *
 * - translates state (page flag bits) and locking between lustre and
 *   environment.
 *
 * The association between cl_page and struct page is immutable and
 * established when cl_page is created.
 *
 * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
 * this io exclusive access to this page w.r.t. other io attempts and
 * various events changing page state (such as transfer completion, or
 * eviction of the page from memory). Note that, in general, a cl_io
 * cannot be identified with a particular thread, and page ownership is not
 * exactly equal to the current thread holding a lock on the page. The layer
 * implementing the association between cl_page and struct page has to
 * implement ownership on top of available synchronization mechanisms.
 *
 * While the lustre client maintains the notion of page ownership by io,
 * the hosting MM/VM usually has its own page concurrency control
 * mechanisms. For example, in Linux, page access is synchronized by the
 * per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
 * takes care to acquire and release such locks as necessary around the
 * calls to the file system methods (->readpage(), ->prepare_write(),
 * ->commit_write(), etc.). This leads to the situation when there are two
 * different ways to own a page in the client (both are sketched after this
 * comment):
 *
 * - client code explicitly and voluntarily owns the page
 *   (cl_page_own());
 *
 * - the VM locks a page and then calls the client, which has to "assume"
 *   ownership from the VM (cl_page_assume()).
 *
 * Dual methods to release ownership are cl_page_disown() and
 * cl_page_unassume().
 *
 * cl_page is reference counted (cl_page::cp_ref). When the reference
 * counter drops to 0, the page is returned to the cache, unless it is in
 * cl_page_state::CPS_FREEING state, in which case it is immediately
 * destroyed.
 *
 * The general logic guaranteeing the absence of "existential races" for
 * pages is the following:
 *
 * - there are fixed known ways for a thread to obtain a new reference
 *   to a page:
 *
 *     - by doing a lookup in the cl_object radix tree, protected by the
 *       spin-lock;
 *
 *     - by starting from VM-locked struct page and following some
 *       hosting environment method (e.g., following ->private pointer in
 *       the case of Linux kernel), see cl_vmpage_page();
 *
 * - when the page enters cl_page_state::CPS_FREEING state, all these
 *   ways are severed with the proper synchronization
 *   (cl_page_delete());
 *
 * - entry into cl_page_state::CPS_FREEING is serialized by the VM page
 *   lock;
 *
 * - no new references to the page in cl_page_state::CPS_FREEING state
 *   are allowed (checked in cl_page_get()).
 *
 * Together this guarantees that when the last reference to a
 * cl_page_state::CPS_FREEING page is released, it is safe to destroy the
 * page, as no new references to it can be acquired at that point, and none
 * remain.
 *
 * cl_page is a state machine. States are enumerated in enum
 * cl_page_state. Possible state transitions are enumerated in
 * cl_page_state_set(). The state transition process (i.e., actual changing
 * of the cl_page::cp_state field) is protected by the lock on the
 * underlying VM page.
 *
 * Linux Kernel implementation.
 *
 * Binding between cl_page and struct page is implemented in the vvp
 * layer. cl_page is attached to the ->private pointer of the struct page,
 * together with the setting of the PG_private bit in page->flags, and
 * acquiring an additional reference on the struct page (much like struct
 * buffer_head, or any similar file system private data structures).
 *
 * PG_locked lock is used to implement both ownership and transfer
 * synchronization, that is, the page is VM-locked in
 * CPS_{OWNED,PAGE{IN,OUT}} states. No additional references are acquired
 * for the duration of the transfer.
 *
 * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
 * write-out is "protected" by the special PG_writeback bit.
 */

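/*
 * Usage sketch of the two ownership paths (illustrative only; error
 * handling elided):
 *
 * Voluntary ownership, e.g. by an io scanning cached pages:
 *
 *	if (cl_page_own(env, io, page) == 0) {
 *		... exclusive access to the page ...
 *		cl_page_disown(env, io, page);
 *	}
 *
 * Assuming ownership from a vmpage already locked by the VM, e.g. in a
 * ->readpage() style path:
 *
 *	cl_page_assume(env, io, page);
 *	... exclusive access to the page ...
 *	cl_page_unassume(env, io, page);
 */
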
/**
 * States of cl_page. cl_page.c assumes particular order here.
 *
 * The page state machine is rather crude, as it doesn't recognize finer page
 * states like "dirty" or "up to date". This is because such states are not
 * always well defined for the whole stack (see, for example, the
 * implementation of the read-ahead, that hides page up-to-dateness to track
 * cache hits accurately). Such sub-states are maintained by the layers that
 * are interested in them.
 */
enum cl_page_state {
	/**
	 * Page is in the cache, un-owned. Page leaves cached state in the
	 * following cases:
	 *
	 *     - [cl_page_state::CPS_OWNED] io comes across the page and
	 *       owns it;
	 *
	 *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
	 *       req-formation engine decides that it wants to include this
	 *       page into an RPC being constructed, and yanks it from the
	 *       cache;
	 *
	 *     - [cl_page_state::CPS_FREEING] VM callback is executed to
	 *       evict the page from memory;
	 *
	 * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
	 */
	CPS_CACHED,
	/**
	 * Page is exclusively owned by some cl_io. Page may end up in this
	 * state as a result of
	 *
	 *     - io creating new page and immediately owning it;
	 *
	 *     - [cl_page_state::CPS_CACHED] io finding existing cached page
	 *       and owning it;
	 *
	 *     - [cl_page_state::CPS_OWNED] io finding existing owned page
	 *       and waiting for owner to release the page;
	 *
	 * Page leaves owned state in the following cases:
	 *
	 *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
	 *       the cache, doing nothing;
	 *
	 *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
	 *       this page;
	 *
	 *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
	 *       transfer for this page;
	 *
	 *     - [cl_page_state::CPS_FREEING] io decides to destroy this
	 *       page (e.g., as part of truncate or extent lock cancellation).
	 *
	 * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
	 */
	CPS_OWNED,
	/**
	 * Page is being written out, as a part of a transfer. This state is
	 * entered when req-formation logic decided that it wants this page to
	 * be sent through the wire _now_. Specifically, it means that once
	 * this state is achieved, transfer completion handler (with either
	 * success or failure indication) is guaranteed to be executed against
	 * this page independently of any locks and any scheduling decisions
	 * made by the hosting environment (that effectively means that the
	 * page is never put into cl_page_state::CPS_PAGEOUT state "in
	 * advance". This property is mentioned, because it is important when
	 * reasoning about possible dead-locks in the system). The page can
	 * enter this state as a result of
	 *
	 *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
	 *       write-out of this page, or
	 *
	 *     - [cl_page_state::CPS_CACHED] req-forming engine deciding
	 *       that it has enough dirty pages cached to issue a "good"
	 *       transfer.
	 *
	 * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
	 * is completed---it is moved into cl_page_state::CPS_CACHED state.
	 *
	 * Underlying VM page is locked for the duration of transfer.
	 *
	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
	 */
	CPS_PAGEOUT,
	/**
	 * Page is being read in, as a part of a transfer. This is quite
	 * similar to the cl_page_state::CPS_PAGEOUT state, except that
	 * read-in is always "immediate"---there is no such thing as a sudden
	 * construction of a read request from cached, presumably not up to
	 * date, pages.
	 *
	 * Underlying VM page is locked for the duration of transfer.
	 *
	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
	 */
	CPS_PAGEIN,
	/**
	 * Page is being destroyed. This state is entered when client decides
	 * that page has to be deleted from its host object, as, e.g., a part
	 * of truncate.
	 *
	 * Once this state is reached, there is no way to escape it.
	 *
	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
	 */
	CPS_FREEING,
	CPS_NR
};
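
/*
 * Transition summary, derived from the state descriptions above (the
 * authoritative transition table is cl_page_state_set() in cl_page.c):
 *
 *	CPS_CACHED  -> CPS_OWNED	own/assume by an io
 *	CPS_CACHED  -> CPS_PAGEOUT	opportunistic write-out by req-formation
 *	CPS_CACHED  -> CPS_FREEING	eviction via a VM callback
 *	CPS_OWNED   -> CPS_CACHED	unassume/disown, page left in cache
 *	CPS_OWNED   -> CPS_PAGEIN	read transfer starts
 *	CPS_OWNED   -> CPS_PAGEOUT	immediate write transfer starts
 *	CPS_OWNED   -> CPS_FREEING	truncate, extent lock cancellation
 *	CPS_PAGEIN  -> CPS_CACHED	transfer completion
 *	CPS_PAGEOUT -> CPS_CACHED	transfer completion
 */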

enum cl_page_type {
	/** Host page, the page is from the host inode which the cl_page
	 * belongs to.
	 */
	CPT_CACHEABLE = 1,

	/** Transient page, the transient cl_page is used to bind a cl_page
	 * to a vmpage which does not belong to the same object as the
	 * cl_page. It is used in Direct IO and lockless IO.
	 */
	CPT_TRANSIENT,
};

/**
 * Fields are protected by the lock on struct page, except for atomics and
 * immutables.
 *
 * \invariant Data type invariants are in cl_page_invariant(). Basically:
 * cl_page::cp_parent and cl_page::cp_child are a well-formed doubly-linked
 * list, consistent with the parent/child pointers in the cl_page::cp_obj and
 * cl_page::cp_owner (when set).
 */
struct cl_page {
	/** Reference counter. */
	atomic_t		cp_ref;
	/** An object this page is a part of. Immutable after creation. */
	struct cl_object	*cp_obj;
	/** vmpage */
	struct page		*cp_vmpage;
	/** Linkage of pages within group. Pages must be owned */
	struct list_head	cp_batch;
	/** List of slices. Immutable after creation. */
	struct list_head	cp_layers;
	/**
	 * Page state. This field is const to avoid accidental update, it is
	 * modified only internally within cl_page.c. Protected by a VM lock.
	 */
	const enum cl_page_state cp_state;
	/**
	 * Page type. Only CPT_TRANSIENT is used so far. Immutable after
	 * creation.
	 */
	enum cl_page_type	cp_type;

	/**
	 * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
	 * by sub-io. Protected by a VM lock.
	 */
	struct cl_io		*cp_owner;
	/** List of references to this page, for debugging. */
	struct lu_ref		cp_reference;
	/** Link to an object, for debugging. */
	struct lu_ref_link	cp_obj_ref;
	/** Link to a queue, for debugging. */
	struct lu_ref_link	cp_queue_ref;
	/** Assigned if doing a sync_io */
	struct cl_sync_io	*cp_sync_io;
};

/**
 * Per-layer part of cl_page.
 *
 * \see vvp_page, lov_page, osc_page
 */
struct cl_page_slice {
	struct cl_page		*cpl_page;
	pgoff_t			cpl_index;
	/**
	 * Object slice corresponding to this page slice. Immutable after
	 * creation.
	 */
	struct cl_object	*cpl_obj;
	const struct cl_page_operations *cpl_ops;
	/** Linkage into cl_page::cp_layers. Immutable after creation. */
	struct list_head	cpl_linkage;
};

/**
 * Lock mode. For the client extent locks.
 *
 * \ingroup cl_lock
 */
enum cl_lock_mode {
	CLM_READ,
	CLM_WRITE,
	CLM_GROUP
};

/**
 * Requested transfer type.
 */
enum cl_req_type {
	CRT_READ,
	CRT_WRITE,
	CRT_NR
};

/**
 * Per-layer page operations.
 *
 * Methods taking an \a io argument are for the activity happening in the
 * context of given \a io. Page is assumed to be owned by that io, except for
 * the obvious cases (like cl_page_operations::cpo_own()).
 *
 * \see vvp_page_ops, lov_page_ops, osc_page_ops
 */
struct cl_page_operations {
	/**
	 * cl_page<->struct page methods. Only one layer in the stack has to
	 * implement these. Current code assumes that this functionality is
	 * provided by the topmost layer, see cl_page_disown0() as an example.
	 */

	/**
	 * Called when \a io acquires this page into the exclusive
	 * ownership. When this method returns, it is guaranteed that the
	 * page is not owned by another io, and no transfer is going on
	 * against it. Optional.
	 *
	 * \see cl_page_own()
	 * \see vvp_page_own(), lov_page_own()
	 */
	int  (*cpo_own)(const struct lu_env *env,
			const struct cl_page_slice *slice,
			struct cl_io *io, int nonblock);
	/** Called when ownership is yielded. Optional.
	 *
	 * \see cl_page_disown()
	 * \see vvp_page_disown()
	 */
	void (*cpo_disown)(const struct lu_env *env,
			   const struct cl_page_slice *slice,
			   struct cl_io *io);
	/**
	 * Called for a page that is already "owned" by \a io from the VM
	 * point of view. Optional.
	 *
	 * \see cl_page_assume()
	 * \see vvp_page_assume(), lov_page_assume()
	 */
	void (*cpo_assume)(const struct lu_env *env,
			   const struct cl_page_slice *slice,
			   struct cl_io *io);
	/** Dual to cl_page_operations::cpo_assume(). Optional. Called
	 * bottom-to-top when IO releases a page without actually unlocking
	 * it.
	 *
	 * \see cl_page_unassume()
	 * \see vvp_page_unassume()
	 */
	void (*cpo_unassume)(const struct lu_env *env,
			     const struct cl_page_slice *slice,
			     struct cl_io *io);
	/**
	 * Announces whether the page contains valid data or not by
	 * \a uptodate.
	 *
	 * \see cl_page_export()
	 * \see vvp_page_export()
	 */
	void (*cpo_export)(const struct lu_env *env,
			   const struct cl_page_slice *slice, int uptodate);
	/**
	 * Checks whether underlying VM page is locked (in the suitable
	 * sense). Used for assertions.
	 *
	 * \retval -EBUSY: page is protected by a lock of a given mode;
	 * \retval -ENODATA: page is not protected by a lock;
	 * \retval 0: this layer cannot decide. (Should never happen.)
	 */
	int (*cpo_is_vmlocked)(const struct lu_env *env,
			       const struct cl_page_slice *slice);
	/**
	 * Page destruction.
	 */

	/**
	 * Called when page is truncated from the object. Optional.
	 *
	 * \see cl_page_discard()
	 * \see vvp_page_discard(), osc_page_discard()
	 */
	void (*cpo_discard)(const struct lu_env *env,
			    const struct cl_page_slice *slice,
			    struct cl_io *io);
	/**
	 * Called when page is removed from the cache, and is about to be
	 * destroyed. Optional.
	 *
	 * \see cl_page_delete()
	 * \see vvp_page_delete(), osc_page_delete()
	 */
	void (*cpo_delete)(const struct lu_env *env,
			   const struct cl_page_slice *slice);
	/** Destructor. Frees resources and slice itself. */
	void (*cpo_fini)(const struct lu_env *env,
			 struct cl_page_slice *slice);
	/**
	 * Optional debugging helper. Prints given page slice.
	 *
	 * \see cl_page_print()
	 */
	int (*cpo_print)(const struct lu_env *env,
			 const struct cl_page_slice *slice,
			 void *cookie, lu_printer_t p);
	/**
	 * \name transfer
	 *
	 * Transfer methods.
	 *
	 * @{
	 */
	/**
	 * Request type dependent vector of operations.
	 *
	 * Transfer operations depend on transfer mode (cl_req_type). To avoid
	 * passing transfer mode to each and every of these methods, and to
	 * avoid branching on request type inside of the methods, separate
	 * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are
	 * provided. That is, method invocation usually looks like
	 *
	 *	slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...);
	 */
	struct {
		/**
		 * Called when a page is submitted for a transfer as a part of
		 * cl_page_list.
		 *
		 * \return 0	     : page is eligible for submission;
		 * \return -EALREADY : skip this page;
		 * \return -ve	     : error.
		 *
		 * \see cl_page_prep()
		 */
		int  (*cpo_prep)(const struct lu_env *env,
				 const struct cl_page_slice *slice,
				 struct cl_io *io);
		/**
		 * Completion handler. This is guaranteed to be eventually
		 * fired after cl_page_operations::cpo_prep() or
		 * cl_page_operations::cpo_make_ready() call.
		 *
		 * This method can be called in a non-blocking context. It is
		 * guaranteed however, that the page involved and its object
		 * are pinned in memory (and, hence, calling cl_page_put() is
		 * safe).
		 *
		 * \see cl_page_completion()
		 */
		void (*cpo_completion)(const struct lu_env *env,
				       const struct cl_page_slice *slice,
				       int ioret);
		/**
		 * Called when cached page is about to be added to the
		 * ptlrpc request as a part of req formation.
		 *
		 * \return 0	   : proceed with this page;
		 * \return -EAGAIN : skip this page;
		 * \return -ve	   : error.
		 *
		 * \see cl_page_make_ready()
		 */
		int  (*cpo_make_ready)(const struct lu_env *env,
				       const struct cl_page_slice *slice);
	} io[CRT_NR];
	/**
	 * Tell transfer engine that only [from, to] part of a page should be
	 * transmitted.
	 *
	 * This is used for immediate transfers.
	 *
	 * \todo XXX this is not very good interface. It would be much better
	 * if all transfer parameters were supplied as arguments to
	 * cl_io_operations::cio_submit() call, but it is not clear how to do
	 * this for page queues.
	 *
	 * \see cl_page_clip()
	 */
	void (*cpo_clip)(const struct lu_env *env,
			 const struct cl_page_slice *slice,
			 int from, int to);
	/**
	 * \pre the page was queued for transferring.
	 * \post page is removed from client's pending list, or -EBUSY
	 *	 is returned if it has already been in transferring.
	 *
	 * This is one of the few page operations that:
	 * 0. is called from the top level;
	 * 1. doesn't have the vmpage locked;
	 * 2. requires every layer to synchronize execution of its
	 *    ->cpo_cancel() with completion handlers. Osc uses the client
	 *    obd lock for this purpose. Since there are no vvp_page_cancel()
	 *    and lov_page_cancel(), cpo_cancel() is de facto protected by
	 *    the client obd lock.
	 *
	 * \see osc_page_cancel().
	 */
	int (*cpo_cancel)(const struct lu_env *env,
			  const struct cl_page_slice *slice);
	/**
	 * Write out a page by the kernel. This is only called by ll_writepage
	 * right now.
	 *
	 * \see cl_page_flush()
	 */
	int (*cpo_flush)(const struct lu_env *env,
			 const struct cl_page_slice *slice,
			 struct cl_io *io);
	/** @} transfer */
};

/**
 * Helper macro, dumping detailed information about \a page into a log.
 */
#define CL_PAGE_DEBUG(mask, env, page, format, ...)			  \
do {									  \
	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			  \
		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);	  \
		cl_page_print(env, &msgdata, lu_cdebug_printer, page);	  \
		CDEBUG(mask, format, ## __VA_ARGS__);			  \
	}								  \
} while (0)

/**
 * Helper macro, dumping shorter information about \a page into a log.
 */
#define CL_PAGE_HEADER(mask, env, page, format, ...)			      \
do {									      \
	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			      \
		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);	      \
		cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
		CDEBUG(mask, format, ## __VA_ARGS__);			      \
	}								      \
} while (0)
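
/*
 * Usage sketch: dump a page when something unexpected is observed; D_PAGE
 * is the customary debug mask for page-related messages.
 *
 *	CL_PAGE_DEBUG(D_PAGE, env, page, "unexpected state: %d\n",
 *		      page->cp_state);
 */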

static inline struct page *cl_page_vmpage(struct cl_page *page)
{
	LASSERT(page->cp_vmpage);
	return page->cp_vmpage;
}

/**
 * Check if a cl_page is in use.
 *
 * Client cache holds a refcount, this refcount will be dropped when
 * the page is taken out of cache, see vvp_page_delete().
 */
static inline bool __page_in_use(const struct cl_page *page, int refc)
{
	return (atomic_read(&page->cp_ref) > refc + 1);
}

/**
 * Caller itself holds a refcount of cl_page.
 */
#define cl_page_in_use(pg)	 __page_in_use(pg, 1)
/**
 * Caller doesn't hold a refcount.
 */
#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)

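/*
 * Usage sketch: a caller that holds its own reference asking whether
 * anybody beyond the cache (and itself) is using the page:
 *
 *	if (!cl_page_in_use(page))
 *		... only the cache and this caller reference the page ...
 */
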
/** @} cl_page */

/** \addtogroup cl_lock cl_lock
 * @{
 */
/** \struct cl_lock
 *
 * Extent locking on the client.
 *
 * LAYERING
 *
 * The locking model of the new client code is built around
 *
 *	struct cl_lock
 *
 * data-type representing an extent lock on a regular file. cl_lock is a
 * layered object (much like cl_object and cl_page), it consists of a header
 * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
 * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
 *
 * Typical cl_lock consists of the two layers:
 *
 * - vvp_lock (vvp specific data), and
 * - lov_lock (lov specific data).
 *
 * lov_lock contains an array of sub-locks. Each of these sub-locks is a
 * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
 *
 * - lovsub_lock, and
 * - osc_lock
 *
 * Each sub-lock is associated with a cl_object (representing stripe
 * sub-object or the file to which top-level cl_lock is associated to), and is
 * linked into that cl_object::coh_locks. In this respect cl_lock is similar to
 * cl_object (that at lov layer also fans out into multiple sub-objects), and
 * is different from cl_page, which doesn't fan out (there is usually exactly
 * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock
 * a "top-lock" and its lovsub-osc portion a "sub-lock".
 *
 * LIFE CYCLE
 *
 * cl_lock is a cacheless data container for the requirements of locks to
 * complete the IO. cl_lock is created before I/O starts and destroyed when
 * the I/O is complete.
 *
 * cl_lock depends on LDLM lock to fulfill lock semantics. The LDLM lock is
 * attached to cl_lock at the OSC layer. The LDLM lock is still cacheable.
 *
 * INTERFACE AND USAGE
 *
 * Two major methods are supported for cl_lock: clo_enqueue and clo_cancel. A
 * cl_lock is enqueued by cl_lock_request(), which will call clo_enqueue()
 * methods for each layer to enqueue the lock. At the LOV layer, if a cl_lock
 * consists of multiple sub cl_locks, each sub-lock will be enqueued
 * correspondingly. At the OSC layer, the lock enqueue request will tend to
 * reuse a cached LDLM lock; otherwise a new LDLM lock will have to be
 * requested from the OST side.
 *
 * cl_lock_cancel() must be called to release a cl_lock after use.
 * The clo_cancel() method will be called for each layer to release the
 * resource held by this lock. At the OSC layer, the reference count of the
 * LDLM lock, which is held at clo_enqueue time, is released.
 *
 * An LDLM lock can only be canceled if there is no cl_lock using it.
 *
 * The overall process of locking during an IO operation is as follows:
 *
 * - once parameters for IO are set up in cl_io, cl_io_operations::cio_lock()
 *   is called on each layer. Responsibility of this method is to add locks,
 *   needed by a given layer into cl_io.ci_lockset.
 *
 * - once locks for all layers were collected, they are sorted to avoid
 *   dead-locks (cl_io_locks_sort()), and enqueued.
 *
 * - when all locks are acquired, IO is performed;
 *
 * - locks are released after IO is complete.
 *
 * Striping introduces major additional complexity into locking. The
 * fundamental problem is that it is generally unsafe to actively use (hold)
 * two locks on the different OST servers at the same time, as this introduces
 * inter-server dependency and can lead to cascading evictions.
 *
 * Basic solution is to sub-divide large read/write IOs into smaller pieces so
 * that no multi-stripe locks are taken (note that this design abandons POSIX
 * read/write semantics). Such pieces ideally can be executed concurrently. At
 * the same time, certain types of IO cannot be sub-divided, without
 * sacrificing correctness. This includes:
 *
 * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
 *   atomicity;
 *
 * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
 *
 * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
 * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
 * has to be held together with the usual lock on [offset, offset + count].
 *
 * Interaction with DLM
 *
 * In the expected setup, cl_lock is ultimately backed up by a collection of
 * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
 * implemented in osc layer, that also matches DLM events (ASTs, cancellation,
 * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed
 * description of interaction with DLM.
 */

/**
 * Lock description.
 */
struct cl_lock_descr {
	/** Object this lock is granted for. */
	struct cl_object	*cld_obj;
	/** Index of the first page protected by this lock. */
	pgoff_t			cld_start;
	/** Index of the last page (inclusive) protected by this lock. */
	pgoff_t			cld_end;
	/** Group ID, for group lock */
	__u64			cld_gid;
	/** Lock mode. */
	enum cl_lock_mode	cld_mode;
	/**
	 * flags to enqueue lock. A combination of bit-flags from
	 * enum cl_enq_flags.
	 */
	__u32			cld_enq_flags;
};
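
/*
 * Usage sketch (illustrative, loosely modelled on glimpse locking; error
 * handling elided, and the lock is assumed to be embedded in per-thread io
 * state): fill a descriptor covering the whole file and request/release the
 * lock through all layers.
 *
 *	lock->cll_descr.cld_obj       = obj;
 *	lock->cll_descr.cld_start     = 0;
 *	lock->cll_descr.cld_end       = CL_PAGE_EOF;
 *	lock->cll_descr.cld_mode      = CLM_READ;
 *	lock->cll_descr.cld_enq_flags = 0;
 *	if (cl_lock_request(env, io, lock) == 0) {
 *		... extent is protected ...
 *		cl_lock_release(env, lock);
 *	}
 */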

#define DDESCR "%s(%d):[%lu, %lu]:%x"
#define PDESCR(descr)							\
	cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode,	\
	(descr)->cld_start, (descr)->cld_end, (descr)->cld_enq_flags

const char *cl_lock_mode_name(const enum cl_lock_mode mode);

/**
 * Layered client lock.
 */
struct cl_lock {
	/** List of slices. Immutable after creation. */
	struct list_head	cll_layers;
	/** lock attribute, extent, cl_object, etc. */
	struct cl_lock_descr	cll_descr;
};

/**
 * Per-layer part of cl_lock
 *
 * \see vvp_lock, lov_lock, lovsub_lock, osc_lock
 */
struct cl_lock_slice {
	struct cl_lock		*cls_lock;
	/** Object slice corresponding to this lock slice. Immutable after
	 * creation.
	 */
	struct cl_object	*cls_obj;
	const struct cl_lock_operations *cls_ops;
	/** Linkage into cl_lock::cll_layers. Immutable after creation. */
	struct list_head	cls_linkage;
};

/**
 *
 * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
 */
struct cl_lock_operations {
	/** @{ */
	/**
	 * Attempts to enqueue the lock. Called top-to-bottom.
	 *
	 * \retval 0	this layer has enqueued the lock successfully
	 * \retval >0	this layer has enqueued the lock, but needs to wait on
	 *		@anchor for resources
	 * \retval -ve	failure
	 *
	 * \see vvp_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
	 * \see osc_lock_enqueue()
	 */
	int  (*clo_enqueue)(const struct lu_env *env,
			    const struct cl_lock_slice *slice,
			    struct cl_io *io, struct cl_sync_io *anchor);
	/**
	 * Cancel a lock and release its DLM lock reference, without
	 * cancelling the DLM lock itself.
	 */
	void (*clo_cancel)(const struct lu_env *env,
			   const struct cl_lock_slice *slice);
	/** @} */
	/**
	 * Destructor. Frees resources and the slice.
	 *
	 * \see vvp_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
	 * \see osc_lock_fini()
	 */
	void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
	/**
	 * Optional debugging helper. Prints given lock slice.
	 */
	int (*clo_print)(const struct lu_env *env,
			 void *cookie, lu_printer_t p,
			 const struct cl_lock_slice *slice);
};

#define CL_LOCK_DEBUG(mask, env, lock, format, ...)			\
do {									\
	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		\
									\
	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			\
		cl_lock_print(env, &msgdata, lu_cdebug_printer, lock);	\
		CDEBUG(mask, format, ## __VA_ARGS__);			\
	}								\
} while (0)

#define CL_LOCK_ASSERT(expr, env, lock) do {				\
	if (likely(expr))						\
		break;							\
									\
	CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr);	\
	LBUG();								\
} while (0)

/** @} cl_lock */

/** \addtogroup cl_page_list cl_page_list
 * Page list used to perform collective operations on a group of pages.
 *
 * Pages are added to the list one by one. cl_page_list acquires a reference
 * for every page in it. Page list is used to perform collective operations on
 * pages:
 *
 * - submit pages for an immediate transfer,
 *
 * - own pages on behalf of certain io (waiting for each page in turn),
 *
 * - discard pages.
 *
 * When list is finalized, it releases references on all pages it still has.
 *
 * \todo XXX concurrency control.
 *
 * @{
 */
struct cl_page_list {
	unsigned int		pl_nr;
	struct list_head	pl_pages;
	struct task_struct	*pl_owner;
};

/**
 * A 2-queue of pages. A convenience data-type for common use case, 2-queue
 * contains an incoming page list and an outgoing page list.
 */
struct cl_2queue {
	struct cl_page_list	c2_qin;
	struct cl_page_list	c2_qout;
};
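
/*
 * Usage sketch (illustrative, loosely modelled on the direct-IO path; the
 * exact helper signatures may differ): pages queued on c2_qin are
 * submitted, and pages accepted by the transfer engine migrate to c2_qout.
 *
 *	struct cl_2queue *queue = &io->ci_queue;
 *
 *	cl_2queue_init(queue);
 *	cl_2queue_add(queue, page);
 *	rc = cl_io_submit_sync(env, io, CRT_WRITE, queue, 0);
 *	cl_2queue_disown(env, io, queue);
 *	cl_2queue_fini(env, queue);
 */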

/** @} cl_page_list */

/** \addtogroup cl_io cl_io
 * @{
 */
/** \struct cl_io
 * I/O
 *
 * cl_io represents a high level I/O activity like a
 * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
 * lock.
 *
 * cl_io is a layered object, much like cl_{object,page,lock} but with one
 * important distinction. We want to minimize the number of calls to the
 * allocator in the fast path, e.g., in the case of read(2) when everything is
 * cached: the client already owns the lock over the region being read, and
 * data are cached due to read-ahead. To avoid allocation of cl_io layers in
 * such situations, per-layer io state is stored in the session, associated
 * with the io, see struct {vvp,lov,osc}_io for example. Session allocation is
 * amortized by using free-lists, see cl_env_get().
 *
 * There is a small predefined number of possible io types, enumerated in enum
 * cl_io_type.
 *
 * cl_io is a state machine, that can be advanced concurrently by multiple
 * threads. It is up to these threads to control the concurrency and,
 * specifically, to detect when io is done, and its state can be safely
 * released.
 *
 * For read/write io the overall execution plan is as follows (a code sketch
 * is given after this comment):
 *
 *     (0) initialize io state through all layers;
 *
 *     (1) loop: prepare chunk of work to do
 *
 *     (2) call all layers to collect locks they need to process current chunk
 *
 *     (3) sort all locks to avoid dead-locks, and acquire them
 *
 *     (4) process the chunk: call per-page methods
 *	   (cl_io_operations::cio_prepare_write(),
 *	   cl_io_operations::cio_commit_write() for write)
 *
 *     (5) release locks
 *
 *     (6) repeat loop.
 *
 * To implement the "parallel IO mode", the lov layer creates sub-ios (lazily
 * to address the allocation efficiency issues mentioned above), and returns
 * with a special error condition from the per-page method when the current
 * sub-io has to block. This causes the io loop to be repeated, and lov
 * switches to the next sub-io in its cl_io_operations::cio_iter_init()
 * implementation.
 */

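/*
 * Sketch of the driver loop (simplified from cl_io_loop() in cl_io.c; error
 * handling and result propagation elided):
 *
 *	do {
 *		cl_io_iter_init(env, io);	(1) prepare a chunk
 *		cl_io_lock(env, io);		(2)-(3) collect, sort, acquire locks
 *		cl_io_start(env, io);		(4) process the chunk
 *		cl_io_end(env, io);
 *		cl_io_unlock(env, io);		(5) release locks
 *		cl_io_iter_fini(env, io);
 *	} while (io->ci_continue);		(6) repeat
 */
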
1358 | /** IO types */ | |
1359 | enum cl_io_type { | |
1360 | /** read system call */ | |
1361 | CIT_READ, | |
1362 | /** write system call */ | |
1363 | CIT_WRITE, | |
1364 | /** truncate, utime system calls */ | |
1365 | CIT_SETATTR, | |
f0cf21ab JH |
1366 | /** get data version */ |
1367 | CIT_DATA_VERSION, | |
d7e09d03 PT |
1368 | /** |
1369 | * page fault handling | |
1370 | */ | |
1371 | CIT_FAULT, | |
1372 | /** | |
1373 | * fsync system call handling | |
1374 | * To write out a range of file | |
1375 | */ | |
1376 | CIT_FSYNC, | |
1377 | /** | |
1378 | * Miscellaneous io. This is used for occasional io activity that | |
1379 | * doesn't fit into other types. Currently this is used for: | |
1380 | * | |
1381 | * - cancellation of an extent lock. This io exists as a context | |
1382 | * to write dirty pages from under the lock being canceled back | |
1383 | * to the server; | |
1384 | * | |
1385 | * - VM induced page write-out. An io context for writing page out | |
1386 | * for memory cleansing; | |
1387 | * | |
1388 | * - glimpse. An io context to acquire glimpse lock. | |
1389 | * | |
1390 | * - grouplock. An io context to acquire group lock. | |
1391 | * | |
1392 | * CIT_MISC io is used simply as a context in which locks and pages | |
1393 | * are manipulated. Such io has no internal "process", that is, | |
1394 | * cl_io_loop() is never called for it. | |
1395 | */ | |
1396 | CIT_MISC, | |
1397 | CIT_OP_NR | |
1398 | }; | |

/**
 * States of the cl_io state machine
 */
enum cl_io_state {
	/** Not initialized. */
	CIS_ZERO,
	/** Initialized. */
	CIS_INIT,
	/** IO iteration started. */
	CIS_IT_STARTED,
	/** Locks taken. */
	CIS_LOCKED,
	/** Actual IO is in progress. */
	CIS_IO_GOING,
	/** IO for the current iteration finished. */
	CIS_IO_FINISHED,
	/** Locks released. */
	CIS_UNLOCKED,
	/** Iteration completed. */
	CIS_IT_ENDED,
	/** cl_io finalized. */
	CIS_FINI
};

/**
 * IO state private to a layer.
 *
 * This is usually embedded into layer session data, rather than allocated
 * dynamically.
 *
 * \see vvp_io, lov_io, osc_io
 */
struct cl_io_slice {
	struct cl_io			*cis_io;
	/** corresponding object slice. Immutable after creation. */
	struct cl_object		*cis_obj;
	/** io operations. Immutable after creation. */
	const struct cl_io_operations	*cis_iop;
	/**
	 * linkage into a list of all slices for a given cl_io, hanging off
	 * cl_io::ci_layers. Immutable after creation.
	 */
	struct list_head		 cis_linkage;
};

typedef void (*cl_commit_cbt)(const struct lu_env *, struct cl_io *,
			      struct cl_page *);

struct cl_read_ahead {
	/*
	 * Maximum page index at which the readahead window ends.
	 * This is determined by DLM lock coverage, RPC and stripe boundaries.
	 * cra_end is inclusive.
	 */
	pgoff_t		cra_end;
	/* optimal RPC size for this read, in pages */
	unsigned long	cra_rpc_size;
	/*
	 * Release callback. If readahead holds resources underneath, this
	 * function should be called to release them.
	 */
	void (*cra_release)(const struct lu_env *env, void *cbdata);
	/* Callback data for the cra_release routine */
	void	       *cra_cbdata;
};

static inline void cl_read_ahead_release(const struct lu_env *env,
					 struct cl_read_ahead *ra)
{
	if (ra->cra_release)
		ra->cra_release(env, ra->cra_cbdata);
	memset(ra, 0, sizeof(*ra));
}

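/*
 * Illustrative sketch (not part of the API): how a layer's cio_read_ahead()
 * method might bound the readahead window and attach a release callback.
 * "struct my_lock", my_lock_lookup(), my_lock_put() and the constants used
 * below are hypothetical, invented for the example only.
 *
 * \code
 *	static void my_ra_release(const struct lu_env *env, void *cbdata)
 *	{
 *		my_lock_put(cbdata);
 *	}
 *
 *	static int my_io_read_ahead(const struct lu_env *env,
 *				    const struct cl_io_slice *ios,
 *				    pgoff_t start, struct cl_read_ahead *ra)
 *	{
 *		struct my_lock *lock = my_lock_lookup(env, ios, start);
 *
 *		if (!lock)
 *			return -ENODATA;
 *		ra->cra_end	 = lock->last_index;
 *		ra->cra_rpc_size = 256;
 *		ra->cra_release	 = my_ra_release;
 *		ra->cra_cbdata	 = lock;
 *		return 0;
 *	}
 * \endcode
 */
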
/**
 * Per-layer io operations.
 * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
 */
struct cl_io_operations {
	/**
	 * Vector of io state transition methods for every io type.
	 *
	 * \see cl_page_operations::io
	 */
	struct {
		/**
		 * Prepare io iteration at a given layer.
		 *
		 * Called top-to-bottom at the beginning of each iteration of
		 * the "io loop" (if it makes sense for this type of io). Here
		 * the layer selects what work it will do during this
		 * iteration.
		 *
		 * \see cl_io_operations::cio_iter_fini()
		 */
		int (*cio_iter_init)(const struct lu_env *env,
				     const struct cl_io_slice *slice);
		/**
		 * Finalize io iteration.
		 *
		 * Called bottom-to-top at the end of each iteration of the
		 * "io loop". Here layers can decide whether the IO has to be
		 * continued.
		 *
		 * \see cl_io_operations::cio_iter_init()
		 */
		void (*cio_iter_fini)(const struct lu_env *env,
				      const struct cl_io_slice *slice);
		/**
		 * Collect locks for the current iteration of io.
		 *
		 * Called top-to-bottom to collect all locks necessary for
		 * this iteration. This method shouldn't actually enqueue
		 * anything; instead it should post a lock through
		 * cl_io_lock_add(). Once all locks are collected, they are
		 * sorted and enqueued in the proper order.
		 */
		int (*cio_lock)(const struct lu_env *env,
				const struct cl_io_slice *slice);
		/**
		 * Finalize unlocking.
		 *
		 * Called bottom-to-top to finish layer specific unlocking
		 * functionality, after generic code released all locks
		 * acquired by cl_io_operations::cio_lock().
		 */
		void (*cio_unlock)(const struct lu_env *env,
				   const struct cl_io_slice *slice);
		/**
		 * Start io iteration.
		 *
		 * Once all locks are acquired, called top-to-bottom to
		 * commence actual IO. In the current implementation,
		 * top-level vvp_io_{read,write}_start() does all the work
		 * synchronously by calling generic_file_*(), so other layers
		 * are called when everything is done.
		 */
		int (*cio_start)(const struct lu_env *env,
				 const struct cl_io_slice *slice);
		/**
		 * Called top-to-bottom at the end of the io loop. Here a
		 * layer might wait for an unfinished asynchronous io.
		 */
		void (*cio_end)(const struct lu_env *env,
				const struct cl_io_slice *slice);
		/**
		 * Called bottom-to-top to notify layers that a read/write IO
		 * iteration finished, with \a nob bytes transferred.
		 */
		void (*cio_advance)(const struct lu_env *env,
				    const struct cl_io_slice *slice,
				    size_t nob);
		/**
		 * Called once per io, bottom-to-top to release io resources.
		 */
		void (*cio_fini)(const struct lu_env *env,
				 const struct cl_io_slice *slice);
	} op[CIT_OP_NR];

	/**
	 * Submit pages from \a queue->c2_qin for IO, and move
	 * successfully submitted pages into \a queue->c2_qout. Returns
	 * non-zero if it failed to submit even a single page. If
	 * submission failed after some pages were moved into \a
	 * queue->c2_qout, the completion callback is executed on them with
	 * a non-zero ioret.
	 */
	int  (*cio_submit)(const struct lu_env *env,
			   const struct cl_io_slice *slice,
			   enum cl_req_type crt,
			   struct cl_2queue *queue);
	/**
	 * Queue async pages for write.
	 * The difference between cio_submit and cio_commit_async is that
	 * cio_submit is for urgent requests.
	 */
	int  (*cio_commit_async)(const struct lu_env *env,
				 const struct cl_io_slice *slice,
				 struct cl_page_list *queue, int from, int to,
				 cl_commit_cbt cb);
	/**
	 * Decide the maximum readahead extent.
	 *
	 * \pre io->ci_type == CIT_READ
	 */
	int (*cio_read_ahead)(const struct lu_env *env,
			      const struct cl_io_slice *slice,
			      pgoff_t start, struct cl_read_ahead *ra);
	/**
	 * Optional debugging helper. Print given io slice.
	 */
	int (*cio_print)(const struct lu_env *env, void *cookie,
			 lu_printer_t p, const struct cl_io_slice *slice);
};
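
/*
 * Illustrative sketch (not part of the API): the shape of a per-layer
 * operations vector. The "my_" identifiers are hypothetical; real instances
 * are vvp_io_ops, lov_io_ops and osc_io_ops. Methods left NULL are skipped
 * by the generic code.
 *
 * \code
 *	static const struct cl_io_operations my_io_ops = {
 *		.op = {
 *			[CIT_READ] = {
 *				.cio_iter_init = my_io_iter_init,
 *				.cio_lock      = my_io_lock,
 *				.cio_start     = my_io_read_start,
 *				.cio_fini      = my_io_fini,
 *			},
 *			[CIT_WRITE] = {
 *				.cio_start     = my_io_write_start,
 *				.cio_fini      = my_io_fini,
 *			},
 *		},
 *		.cio_submit = my_io_submit,
 *	};
 * \endcode
 */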

/**
 * Flags for the lock enqueue procedure.
 * \ingroup cl_lock
 */
enum cl_enq_flags {
	/**
	 * instruct the server not to block if a conflicting lock is found.
	 * Instead -EWOULDBLOCK is returned immediately.
	 */
	CEF_NONBLOCK	 = 0x00000001,
	/**
	 * take the lock asynchronously (out of order), as it cannot
	 * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
	 */
	CEF_ASYNC	 = 0x00000002,
	/**
	 * tell the server to instruct (through a flag in the blocking ast)
	 * the owner of the conflicting lock that it can drop dirty pages
	 * protected by this lock, without sending them to the server.
	 */
	CEF_DISCARD_DATA = 0x00000004,
	/**
	 * tell the sub layers that it must be a `real' lock. This is used
	 * for mmapped-buffer locks and glimpse locks that must never be
	 * converted into lockless mode.
	 *
	 * \see vvp_mmap_locks(), cl_glimpse_lock().
	 */
	CEF_MUST	 = 0x00000008,
	/**
	 * tell the sub layers to never request a `real' lock. This flag is
	 * not used currently.
	 *
	 * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
	 * conversion policy: ci_lockreq describes generic information of lock
	 * requirement for this IO, especially for locks which belong to the
	 * object doing IO; however, the lock itself may have precise
	 * requirements that are described by the enqueue flags.
	 */
	CEF_NEVER	 = 0x00000010,
	/**
	 * for async glimpse lock.
	 */
	CEF_AGL		 = 0x00000020,
	/**
	 * enqueue a lock to test DLM lock existence.
	 */
	CEF_PEEK	 = 0x00000040,
	/**
	 * Lock match only. Used by group lock in I/O as the group lock
	 * is known to exist.
	 */
	CEF_LOCK_MATCH	 = BIT(7),
	/**
	 * mask of enq_flags.
	 */
	CEF_MASK	 = 0x000000ff,
};

/**
 * Link between lock and io. An intermediate structure is needed, because the
 * same lock can be part of multiple io's simultaneously.
 */
struct cl_io_lock_link {
	/** linkage into one of cl_lockset lists. */
	struct list_head	cill_linkage;
	struct cl_lock		cill_lock;
	/** optional destructor */
	void (*cill_fini)(const struct lu_env *env,
			  struct cl_io_lock_link *link);
};
#define cill_descr cill_lock.cll_descr

/**
 * A lock-set represents a collection of locks that an io needs at a
 * time. Generally speaking, the client tries to avoid holding multiple locks
 * when possible, because
 *
 *      - holding extent locks over multiple ost's introduces the danger of
 *        "cascading timeouts";
 *
 *      - holding multiple locks over the same ost is still dead-lock prone,
 *        see comment in osc_lock_enqueue(),
 *
 * but there are certain situations where this is unavoidable:
 *
 *      - O_APPEND writes have to take a [0, EOF] lock for correctness;
 *
 *      - truncate has to take a [new-size, EOF] lock for correctness;
 *
 *      - SNS has to take locks across the full stripe for correctness;
 *
 *      - when a user level buffer supplied to {read,write}(file0) is part
 *        of a memory mapped lustre file, the client has to take dlm locks
 *        on file0 and on all files that back the buffer (or the part of
 *        the buffer that is being processed in the current chunk); in any
 *        case, there are situations where at least 2 locks are necessary.
 *
 * In such cases we at least try to take locks in the same consistent
 * order. To this end, all locks are first collected, then sorted, and then
 * enqueued.
 */
struct cl_lockset {
	/** locks to be acquired. */
	struct list_head	cls_todo;
	/** locks acquired. */
	struct list_head	cls_done;
};

/**
 * Lock requirements (demand) for IO. It should be cl_io_lock_req,
 * but 'req' is always to be thought of as 'request' :-)
 */
enum cl_io_lock_dmd {
	/** Always lock data (e.g., O_APPEND). */
	CILR_MANDATORY = 0,
	/** Layers are free to decide between local and global locking. */
	CILR_MAYBE,
	/** Never lock: there is no cache (e.g., lockless IO). */
	CILR_NEVER
};

enum cl_fsync_mode {
	/** start writeback, do not wait for it to finish */
	CL_FSYNC_NONE	 = 0,
	/** start writeback and wait for it to finish */
	CL_FSYNC_LOCAL	 = 1,
	/** discard all dirty pages in a specific file range */
	CL_FSYNC_DISCARD = 2,
	/** start writeback and make sure the pages have reached storage
	 * before returning. An OST_SYNC RPC must be issued and completed.
	 */
	CL_FSYNC_ALL	 = 3
};

struct cl_io_rw_common {
	loff_t	crw_pos;
	size_t	crw_count;
	int	crw_nonblock;
};

/**
 * State for io.
 *
 * cl_io is shared by all threads participating in this IO (in the current
 * implementation only one thread advances IO, but parallel IO design and
 * concurrent copy_*_user() require multiple threads acting on the same IO).
 * It is up to these threads to serialize their activities, including updates
 * to mutable cl_io fields.
 */
struct cl_io {
	/** type of this IO. Immutable after creation. */
	enum cl_io_type		ci_type;
	/** current state of the cl_io state machine. */
	enum cl_io_state	ci_state;
	/** main object this io is against. Immutable after creation. */
	struct cl_object       *ci_obj;
	/**
	 * Upper layer io, of which this io is a part. Immutable after
	 * creation.
	 */
	struct cl_io	       *ci_parent;
	/** List of slices. Immutable after creation. */
	struct list_head	ci_layers;
	/** list of locks (to be) acquired by this io. */
	struct cl_lockset	ci_lockset;
	/** lock requirements, this is just a help info for sublayers. */
	enum cl_io_lock_dmd	ci_lockreq;
	union {
		struct cl_rd_io {
			struct cl_io_rw_common rd;
		} ci_rd;
		struct cl_wr_io {
			struct cl_io_rw_common wr;
			int		       wr_append;
			int		       wr_sync;
		} ci_wr;
		struct cl_io_rw_common ci_rw;
		struct cl_setattr_io {
			struct ost_lvb	 sa_attr;
			unsigned int	 sa_attr_flags;
			unsigned int	 sa_valid;
			int		 sa_stripe_index;
			const struct lu_fid *sa_parent_fid;
		} ci_setattr;
		struct cl_data_version_io {
			u64 dv_data_version;
			int dv_flags;
		} ci_data_version;
		struct cl_fault_io {
			/** page index within file. */
			pgoff_t		ft_index;
			/** number of valid bytes on the faulted page. */
			size_t		ft_nob;
			/** writable page? for nopage() only */
			int		ft_writable;
			/** page of an executable? */
			int		ft_executable;
			/** page_mkwrite() */
			int		ft_mkwrite;
			/** resulting page */
			struct cl_page *ft_page;
		} ci_fault;
		struct cl_fsync_io {
			loff_t		   fi_start;
			loff_t		   fi_end;
			/** file system level fid */
			struct lu_fid	  *fi_fid;
			enum cl_fsync_mode fi_mode;
			/* how many pages were written/discarded */
			unsigned int	   fi_nr_written;
		} ci_fsync;
	} u;
	struct cl_2queue	ci_queue;
	size_t			ci_nob;
	int			ci_result;
	unsigned int		ci_continue:1,
	/**
	 * This io holds a grouplock, to inform sublayers that they must
	 * not do lockless i/o.
	 */
				ci_no_srvlock:1,
	/**
	 * The whole IO needs to be restarted because the layout has changed.
	 */
				ci_need_restart:1,
	/**
	 * Do not refresh the layout - the IO issuer knows that the layout
	 * won't change (page operations; a layout change causes all pages
	 * to be discarded), or it doesn't matter if it changes (sync).
	 */
				ci_ignore_layout:1,
	/**
	 * Check if the layout changed after the IO finishes. Mainly for HSM
	 * requirements. IO against open files doesn't need to verify the
	 * layout because HSM won't release open files.
	 * Right now, only two operations need to verify the layout: glimpse
	 * and setattr.
	 */
				ci_verify_layout:1,
	/**
	 * file is released, restore has to be triggered by the vvp layer
	 */
				ci_restore_needed:1,
	/**
	 * O_NOATIME
	 */
				ci_noatime:1;
	/**
	 * Number of pages owned by this IO. For invariant checking.
	 */
	unsigned int		ci_owned_nr;
};

/** @} cl_io */

/**
 * Per-transfer attributes.
 */
struct cl_req_attr {
	enum cl_req_type cra_type;
	u64		 cra_flags;
	struct cl_page	*cra_page;

	/** Generic attributes for server consumption. */
	struct obdo	*cra_oa;
	/** Jobid */
	char		 cra_jobid[LUSTRE_JOBID_SIZE];
};

enum cache_stats_item {
	/** how many cache lookups were performed */
	CS_lookup = 0,
	/** how many times a cache lookup resulted in a hit */
	CS_hit,
	/** how many entities are in the cache right now */
	CS_total,
	/** how many entities in the cache are actively used (and cannot be
	 * evicted) right now
	 */
	CS_busy,
	/** how many entities were created at all */
	CS_create,
	CS_NR
};

#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }

/**
 * Stats for a generic cache (similar to inode, lu_object, etc. caches).
 */
struct cache_stats {
	const char *cs_name;
	atomic_t    cs_stats[CS_NR];
};

/** These are not exported so far */
void cache_stats_init(struct cache_stats *cs, const char *name);

/**
 * Client-side site. This represents a particular client stack. "Global"
 * variables should (directly or indirectly) be added here to allow multiple
 * clients to co-exist in a single address space.
 */
struct cl_site {
	struct lu_site		cs_lu;
	/**
	 * Statistical counters. Atomics do not scale, something better like
	 * per-cpu counters is needed.
	 *
	 * These are exported as /sys/kernel/debug/lustre/llite/.../site
	 *
	 * When interpreting keep in mind that both sub-locks (and sub-pages)
	 * and top-locks (and top-pages) are accounted here.
	 */
	struct cache_stats	cs_pages;
	atomic_t		cs_pages_state[CPS_NR];
};

int  cl_site_init(struct cl_site *s, struct cl_device *top);
void cl_site_fini(struct cl_site *s);
void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);

/**
 * Output client site statistical counters into a buffer. Suitable for
 * ll_rd_*()-style functions.
 */
int cl_site_stats_print(const struct cl_site *site, struct seq_file *m);

/**
 * \name helpers
 *
 * Type conversion and accessory functions.
 */
/** @{ */

static inline struct cl_site *lu2cl_site(const struct lu_site *site)
{
	return container_of(site, struct cl_site, cs_lu);
}

static inline int lu_device_is_cl(const struct lu_device *d)
{
	return d->ld_type->ldt_tags & LU_DEVICE_CL;
}

static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
{
	LASSERT(!d || IS_ERR(d) || lu_device_is_cl(d));
	return container_of0(d, struct cl_device, cd_lu_dev);
}

static inline struct lu_device *cl2lu_dev(struct cl_device *d)
{
	return &d->cd_lu_dev;
}

static inline struct cl_object *lu2cl(const struct lu_object *o)
{
	LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
	return container_of0(o, struct cl_object, co_lu);
}

static inline const struct cl_object_conf *
lu2cl_conf(const struct lu_object_conf *conf)
{
	return container_of0(conf, struct cl_object_conf, coc_lu);
}

static inline struct cl_object *cl_object_next(const struct cl_object *obj)
{
	return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL;
}

static inline struct cl_device *cl_object_device(const struct cl_object *o)
{
	LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev));
	return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
}

static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
{
	return container_of0(h, struct cl_object_header, coh_lu);
}

static inline struct cl_site *cl_object_site(const struct cl_object *obj)
{
	return lu2cl_site(obj->co_lu.lo_dev->ld_site);
}

static inline
struct cl_object_header *cl_object_header(const struct cl_object *obj)
{
	return luh2coh(obj->co_lu.lo_header);
}

static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
{
	return lu_device_init(&d->cd_lu_dev, t);
}

static inline void cl_device_fini(struct cl_device *d)
{
	lu_device_fini(&d->cd_lu_dev);
}

void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
		       struct cl_object *obj, pgoff_t index,
		       const struct cl_page_operations *ops);
void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
		       struct cl_object *obj,
		       const struct cl_lock_operations *ops);
void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
		     struct cl_object *obj, const struct cl_io_operations *ops);
/** @} helpers */

/** \defgroup cl_object cl_object
 * @{
 */
struct cl_object *cl_object_top(struct cl_object *o);
struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
				 const struct lu_fid *fid,
				 const struct cl_object_conf *c);

int  cl_object_header_init(struct cl_object_header *h);
void cl_object_put(const struct lu_env *env, struct cl_object *o);
void cl_object_get(struct cl_object *o);
void cl_object_attr_lock(struct cl_object *o);
void cl_object_attr_unlock(struct cl_object *o);
int  cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
			struct cl_attr *attr);
int  cl_object_attr_update(const struct lu_env *env, struct cl_object *obj,
			   const struct cl_attr *attr, unsigned int valid);
int  cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
		       struct ost_lvb *lvb);
int  cl_conf_set(const struct lu_env *env, struct cl_object *obj,
		 const struct cl_object_conf *conf);
int  cl_object_prune(const struct lu_env *env, struct cl_object *obj);
void cl_object_kill(const struct lu_env *env, struct cl_object *obj);
int  cl_object_getstripe(const struct lu_env *env, struct cl_object *obj,
			 struct lov_user_md __user *lum);
int  cl_object_fiemap(const struct lu_env *env, struct cl_object *obj,
		      struct ll_fiemap_info_key *fmkey, struct fiemap *fiemap,
		      size_t *buflen);
int  cl_object_layout_get(const struct lu_env *env, struct cl_object *obj,
			  struct cl_layout *cl);
loff_t cl_object_maxbytes(struct cl_object *obj);

/**
 * Returns true, iff \a o0 and \a o1 are slices of the same object.
 */
static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
{
	return cl_object_header(o0) == cl_object_header(o1);
}

static inline void cl_object_page_init(struct cl_object *clob, int size)
{
	clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
	cl_object_header(clob)->coh_page_bufsize += cfs_size_round(size);
	WARN_ON(cl_object_header(clob)->coh_page_bufsize > 512);
}

static inline void *cl_object_page_slice(struct cl_object *clob,
					 struct cl_page *page)
{
	return (void *)((char *)page + clob->co_slice_off);
}

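/*
 * Illustrative sketch (not part of the API): how the per-page slice offset
 * is used. A layer calls cl_object_page_init() once at object initialization
 * time to reserve \a size bytes in every cl_page buffer; later,
 * cl_object_page_slice() maps a cl_page to that layer's reserved slice.
 * "struct my_page" is hypothetical.
 *
 * \code
 *	cl_object_page_init(clob, sizeof(struct my_page));
 *	...
 *	struct my_page *mp = cl_object_page_slice(clob, page);
 * \endcode
 */
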
/**
 * Return refcount of cl_object.
 */
static inline int cl_object_refc(struct cl_object *clob)
{
	struct lu_object_header *header = clob->co_lu.lo_header;

	return atomic_read(&header->loh_ref);
}

/** @} cl_object */

/** \defgroup cl_page cl_page
 * @{
 */
enum {
	CLP_GANG_OKAY = 0,
	CLP_GANG_RESCHED,
	CLP_GANG_AGAIN,
	CLP_GANG_ABORT
};

struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *obj,
			     pgoff_t idx, struct page *vmpage,
			     enum cl_page_type type);
struct cl_page *cl_page_alloc(const struct lu_env *env,
			      struct cl_object *o, pgoff_t ind,
			      struct page *vmpage,
			      enum cl_page_type type);
void cl_page_get(struct cl_page *page);
void cl_page_put(const struct lu_env *env, struct cl_page *page);
void cl_page_print(const struct lu_env *env, void *cookie, lu_printer_t printer,
		   const struct cl_page *pg);
void cl_page_header_print(const struct lu_env *env, void *cookie,
			  lu_printer_t printer, const struct cl_page *pg);
struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj);

const struct cl_page_slice *cl_page_at(const struct cl_page *page,
				       const struct lu_device_type *dtype);

/**
 * \name ownership
 *
 * Functions dealing with the ownership of a page by an io.
 */
/** @{ */

int  cl_page_own(const struct lu_env *env,
		 struct cl_io *io, struct cl_page *page);
int  cl_page_own_try(const struct lu_env *env,
		     struct cl_io *io, struct cl_page *page);
void cl_page_assume(const struct lu_env *env,
		    struct cl_io *io, struct cl_page *page);
void cl_page_unassume(const struct lu_env *env,
		      struct cl_io *io, struct cl_page *pg);
void cl_page_disown(const struct lu_env *env,
		    struct cl_io *io, struct cl_page *page);
void cl_page_disown0(const struct lu_env *env,
		     struct cl_io *io, struct cl_page *pg);
int  cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io);
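
/*
 * Illustrative sketch (not part of the API): the typical page ownership
 * round-trip. An io takes exclusive ownership of a page before operating on
 * it and releases it afterwards. Assumes \a env, \a io and \a page were set
 * up by the caller.
 *
 * \code
 *	if (cl_page_own(env, io, page) == 0) {
 *		LASSERT(cl_page_is_owned(page, io));
 *		...
 *		cl_page_disown(env, io, page);
 *	}
 * \endcode
 */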

/** @} ownership */

/**
 * \name transfer
 *
 * Functions dealing with the preparation of a page for a transfer, and
 * tracking transfer state.
 */
/** @{ */
int  cl_page_prep(const struct lu_env *env, struct cl_io *io,
		  struct cl_page *pg, enum cl_req_type crt);
void cl_page_completion(const struct lu_env *env,
			struct cl_page *pg, enum cl_req_type crt, int ioret);
int  cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
			enum cl_req_type crt);
int  cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
		       struct cl_page *pg, enum cl_req_type crt);
void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
		  int from, int to);
int  cl_page_cancel(const struct lu_env *env, struct cl_page *page);
int  cl_page_flush(const struct lu_env *env, struct cl_io *io,
		   struct cl_page *pg);

/** @} transfer */

/**
 * \name helper routines
 * Functions to discard, delete and export a cl_page.
 */
/** @{ */
void cl_page_discard(const struct lu_env *env, struct cl_io *io,
		     struct cl_page *pg);
void cl_page_delete(const struct lu_env *env, struct cl_page *pg);
int  cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg);
void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate);
loff_t cl_offset(const struct cl_object *obj, pgoff_t idx);
pgoff_t cl_index(const struct cl_object *obj, loff_t offset);
size_t cl_page_size(const struct cl_object *obj);
int  cl_pages_prune(const struct lu_env *env, struct cl_object *obj);

void cl_lock_print(const struct lu_env *env, void *cookie,
		   lu_printer_t printer, const struct cl_lock *lock);
void cl_lock_descr_print(const struct lu_env *env, void *cookie,
			 lu_printer_t printer,
			 const struct cl_lock_descr *descr);
/** @} helper */

/**
 * Data structure managing a client's cached pages. A count of
 * "unstable" pages and an LRU of clean pages are maintained.
 * "Unstable" pages are pages pinned by the ptlrpc layer for recovery
 * purposes.
 */
struct cl_client_cache {
	/**
	 * Client cache reference count:
	 * # of users (OSCs) + 2 (held by llite and lov)
	 */
	atomic_t		ccc_users;
	/**
	 * # of threads doing shrinking
	 */
	unsigned int		ccc_lru_shrinkers;
	/**
	 * # of LRU entries available
	 */
	atomic_long_t		ccc_lru_left;
	/**
	 * List of entities (OSCs) for this LRU cache
	 */
	struct list_head	ccc_lru;
	/**
	 * Max # of LRU entries
	 */
	unsigned long		ccc_lru_max;
	/**
	 * Lock to protect the ccc_lru list
	 */
	spinlock_t		ccc_lru_lock;
	/**
	 * Set if unstable check is enabled
	 */
	unsigned int		ccc_unstable_check:1;
	/**
	 * # of unstable pages for this mount point
	 */
	atomic_long_t		ccc_unstable_nr;
	/**
	 * Waitq for awaiting unstable pages to reach zero.
	 * Used at umounting time and signaled on BRW commit
	 */
	wait_queue_head_t	ccc_unstable_waitq;
};

/**
 * cl_cache functions
 */
struct cl_client_cache *cl_cache_init(unsigned long lru_page_max);
void cl_cache_incref(struct cl_client_cache *cache);
void cl_cache_decref(struct cl_client_cache *cache);

/** @} cl_page */

/** \defgroup cl_lock cl_lock
 * @{
 */

int cl_lock_request(const struct lu_env *env, struct cl_io *io,
		    struct cl_lock *lock);
int cl_lock_init(const struct lu_env *env, struct cl_lock *lock,
		 const struct cl_io *io);
void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock);
const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
				       const struct lu_device_type *dtype);
void cl_lock_release(const struct lu_env *env, struct cl_lock *lock);
int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io,
		    struct cl_lock *lock, struct cl_sync_io *anchor);
void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);

/** @} cl_lock */

/** \defgroup cl_io cl_io
 * @{
 */

int  cl_io_init(const struct lu_env *env, struct cl_io *io,
		enum cl_io_type iot, struct cl_object *obj);
int  cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
		    enum cl_io_type iot, struct cl_object *obj);
int  cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
		   enum cl_io_type iot, loff_t pos, size_t count);
int  cl_io_loop(const struct lu_env *env, struct cl_io *io);

void cl_io_fini(const struct lu_env *env, struct cl_io *io);
int  cl_io_iter_init(const struct lu_env *env, struct cl_io *io);
void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io);
int  cl_io_lock(const struct lu_env *env, struct cl_io *io);
void cl_io_unlock(const struct lu_env *env, struct cl_io *io);
int  cl_io_start(const struct lu_env *env, struct cl_io *io);
void cl_io_end(const struct lu_env *env, struct cl_io *io);
int  cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
		    struct cl_io_lock_link *link);
int  cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
			  struct cl_lock_descr *descr);
int  cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
		     enum cl_req_type iot, struct cl_2queue *queue);
int  cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
		       enum cl_req_type iot, struct cl_2queue *queue,
		       long timeout);
int  cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
			struct cl_page_list *queue, int from, int to,
			cl_commit_cbt cb);
int  cl_io_read_ahead(const struct lu_env *env, struct cl_io *io,
		      pgoff_t start, struct cl_read_ahead *ra);
int  cl_io_is_going(const struct lu_env *env);
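
/*
 * Illustrative sketch (not part of the API): a minimal read io driven
 * through the generic entry points above. Assumes \a env, \a io and \a obj
 * were set up by the caller (e.g., a per-thread io slot from the cl env
 * session); ci_need_restart handling and error paths are elided.
 *
 * \code
 *	io->ci_obj = obj;
 *	rc = cl_io_rw_init(env, io, CIT_READ, pos, count);
 *	if (rc == 0)
 *		rc = cl_io_loop(env, io);
 *	cl_io_fini(env, io);
 * \endcode
 */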

/**
 * True, iff \a io is an O_APPEND write(2).
 */
static inline int cl_io_is_append(const struct cl_io *io)
{
	return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
}

static inline int cl_io_is_sync_write(const struct cl_io *io)
{
	return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync;
}

static inline int cl_io_is_mkwrite(const struct cl_io *io)
{
	return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
}

/**
 * True, iff \a io is a truncate(2).
 */
static inline int cl_io_is_trunc(const struct cl_io *io)
{
	return io->ci_type == CIT_SETATTR &&
	       (io->u.ci_setattr.sa_valid & ATTR_SIZE);
}

struct cl_io *cl_io_top(struct cl_io *io);

#define CL_IO_SLICE_CLEAN(foo_io, base)					\
do {									\
	typeof(foo_io) __foo_io = (foo_io);				\
									\
	BUILD_BUG_ON(offsetof(typeof(*__foo_io), base) != 0);		\
	memset(&__foo_io->base + 1, 0,					\
	       sizeof(*__foo_io) - sizeof(__foo_io->base));		\
} while (0)
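
/*
 * Illustrative sketch (not part of the API): CL_IO_SLICE_CLEAN() zeroes
 * everything in a layer's io structure that follows the embedded slice,
 * which the BUILD_BUG_ON() requires to be the first member. "struct my_io"
 * and its fields are hypothetical.
 *
 * \code
 *	struct my_io {
 *		struct cl_io_slice mio_cl;
 *		int		   mio_state;
 *	};
 *
 *	struct my_io *mio = ...;
 *	CL_IO_SLICE_CLEAN(mio, mio_cl);
 * \endcode
 */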

/** @} cl_io */

/** \defgroup cl_page_list cl_page_list
 * @{
 */

/**
 * Last page in the page list.
 */
static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
{
	LASSERT(plist->pl_nr > 0);
	return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch);
}

static inline struct cl_page *cl_page_list_first(struct cl_page_list *plist)
{
	LASSERT(plist->pl_nr > 0);
	return list_entry(plist->pl_pages.next, struct cl_page, cp_batch);
}

/**
 * Iterate over pages in a page list.
 */
#define cl_page_list_for_each(page, list)				 \
	list_for_each_entry((page), &(list)->pl_pages, cp_batch)

/**
 * Iterate over pages in a page list, taking possible removals into account.
 */
#define cl_page_list_for_each_safe(page, temp, list)			 \
	list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
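
/*
 * Illustrative sketch (not part of the API): draining a page list with the
 * _safe iterator, which tolerates removal of the current page. Assumes
 * \a env and \a plist were set up by the caller.
 *
 * \code
 *	struct cl_page *page;
 *	struct cl_page *temp;
 *
 *	cl_page_list_for_each_safe(page, temp, plist)
 *		cl_page_list_del(env, plist, page);
 * \endcode
 */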

void cl_page_list_init(struct cl_page_list *plist);
void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page);
void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
		       struct cl_page *page);
void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src,
			    struct cl_page *page);
void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head);
void cl_page_list_del(const struct lu_env *env, struct cl_page_list *plist,
		      struct cl_page *page);
void cl_page_list_disown(const struct lu_env *env,
			 struct cl_io *io, struct cl_page_list *plist);
void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist);

void cl_2queue_init(struct cl_2queue *queue);
void cl_2queue_disown(const struct lu_env *env,
		      struct cl_io *io, struct cl_2queue *queue);
void cl_2queue_discard(const struct lu_env *env,
		       struct cl_io *io, struct cl_2queue *queue);
void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue);
void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);

/** @} cl_page_list */

void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj,
		     struct cl_req_attr *attr);

/** \defgroup cl_sync_io cl_sync_io
 * @{
 */

/**
 * Anchor for synchronous transfer. This is allocated on a stack by a thread
 * doing a synchronous transfer, and a pointer to this structure is set up in
 * every page submitted for transfer. The transfer completion routine updates
 * the anchor and wakes up the waiting thread when the transfer is complete.
 */
struct cl_sync_io {
	/** number of pages yet to be transferred. */
	atomic_t		csi_sync_nr;
	/** error code. */
	int			csi_sync_rc;
	/** barrier for destroying this structure. */
	atomic_t		csi_barrier;
	/** completion to be signaled when transfer is complete. */
	wait_queue_head_t	csi_waitq;
	/** callback to invoke when this IO is finished */
	void			(*csi_end_io)(const struct lu_env *,
					      struct cl_sync_io *);
};

void cl_sync_io_init(struct cl_sync_io *anchor, int nr,
		     void (*end)(const struct lu_env *, struct cl_sync_io *));
int  cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor,
		     long timeout);
void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor,
		     int ioret);
void cl_sync_io_end(const struct lu_env *env, struct cl_sync_io *anchor);
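
/*
 * Illustrative sketch (not part of the API): waiting for nr_pages pages
 * submitted for transfer. Each completed page leads to a cl_sync_io_note()
 * call; cl_sync_io_end(), declared above, is used here as the end-io
 * callback. Submission itself is elided.
 *
 * \code
 *	struct cl_sync_io anchor;
 *	int rc;
 *
 *	cl_sync_io_init(&anchor, nr_pages, cl_sync_io_end);
 *	... submit pages; each completion calls cl_sync_io_note() ...
 *	rc = cl_sync_io_wait(env, &anchor, timeout);
 * \endcode
 */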

/** @} cl_sync_io */

/** \defgroup cl_env cl_env
 *
 * lu_env handling for a client.
 *
 * lu_env is an environment within which lustre code executes. Its major part
 * is lu_context---a fast memory allocation mechanism that is used to conserve
 * precious kernel stack space. Originally lu_env was designed for a server,
 * where
 *
 *     - there is a (mostly) fixed number of threads, and
 *
 *     - call chains have no non-lustre portions inserted between lustre code.
 *
 * On a client both these assumptions fail, because every user thread can
 * potentially execute lustre code as part of a system call, and lustre calls
 * into VFS or MM that call back into lustre.
 *
 * To deal with that, cl_env wrapper functions implement the following
 * optimizations:
 *
 *     - allocation and destruction of environments is amortized by caching
 *       no longer used environments instead of destroying them;
 *
 * \see lu_env, lu_context, lu_context_key
 * @{
 */

struct lu_env *cl_env_get(u16 *refcheck);
struct lu_env *cl_env_alloc(u16 *refcheck, __u32 tags);
void cl_env_put(struct lu_env *env, u16 *refcheck);
unsigned int cl_env_cache_purge(unsigned int nr);
struct lu_env *cl_env_percpu_get(void);
void cl_env_percpu_put(struct lu_env *env);
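
/*
 * Illustrative sketch (not part of the API): the standard get/put pattern
 * for borrowing an environment on a client thread. The refcheck cookie pairs
 * the get with the put.
 *
 * \code
 *	struct lu_env *env;
 *	u16 refcheck;
 *
 *	env = cl_env_get(&refcheck);
 *	if (IS_ERR(env))
 *		return PTR_ERR(env);
 *	... use env ...
 *	cl_env_put(env, &refcheck);
 * \endcode
 */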

/** @} cl_env */

/*
 * Misc
 */
void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);

struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
				struct lu_device_type *ldt,
				struct lu_device *next);
/** @} clio */

int  cl_global_init(void);
void cl_global_fini(void);

#endif /* _LINUX_CL_OBJECT_H */