1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
20 #include "common/config.h"
21 #include "include/counter.h"
22 #include "include/elist.h"
23 #include "include/types.h"
24 #include "include/lru.h"
25 #include "include/compact_set.h"
27 #include "MDSCacheObject.h"
31 #include "SimpleLock.h"
32 #include "ScatterLock.h"
33 #include "LocalLock.h"
34 #include "Capability.h"
35 #include "SnapRealm.h"
42 #define dout_context g_ceph_context
54 struct ObjectOperation
;
58 ostream
& operator<<(ostream
& out
, const CInode
& in
);
60 struct cinode_lock_info_t
{
65 extern cinode_lock_info_t cinode_lock_info
[];
66 extern int num_cinode_locks
;
70 * Base class for CInode, containing the backing store data and
71 * serialization methods. This exists so that we can read and
72 * handle CInodes from the backing store without hitting all
73 * the business logic in CInode proper.
75 class InodeStoreBase
{
77 inode_t inode
; // the inode itself
78 std::string symlink
; // symlink dest, if symlink
79 std::map
<std::string
, bufferptr
> xattrs
;
80 fragtree_t dirfragtree
; // dir frag tree, if any. always consistent with our dirfrag map.
81 compact_map
<snapid_t
, old_inode_t
> old_inodes
; // key = last, value.first = first
83 damage_flags_t damage_flags
;
85 InodeStoreBase() : oldest_snap(CEPH_NOSNAP
), damage_flags(0) { }
88 bool is_file() const { return inode
.is_file(); }
89 bool is_symlink() const { return inode
.is_symlink(); }
90 bool is_dir() const { return inode
.is_dir(); }
91 static object_t
get_object_name(inodeno_t ino
, frag_t fg
, const char *suffix
);
93 /* Full serialization for use in ".inode" root inode objects */
94 void encode(bufferlist
&bl
, uint64_t features
, const bufferlist
*snap_blob
=NULL
) const;
95 void decode(bufferlist::iterator
&bl
, bufferlist
& snap_blob
);
97 /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
98 void encode_bare(bufferlist
&bl
, uint64_t features
, const bufferlist
*snap_blob
=NULL
) const;
99 void decode_bare(bufferlist::iterator
&bl
, bufferlist
&snap_blob
, __u8 struct_v
=5);
101 /* For test/debug output */
102 void dump(Formatter
*f
) const;
104 /* For use by offline tools */
105 __u32
hash_dentry_name(const std::string
&dn
);
106 frag_t
pick_dirfrag(const std::string
&dn
);
109 class InodeStore
: public InodeStoreBase
{
111 bufferlist snap_blob
; // Encoded copy of SnapRealm, because we can't
112 // rehydrate it without full MDCache
113 void encode(bufferlist
&bl
, uint64_t features
) const {
114 InodeStoreBase::encode(bl
, features
, &snap_blob
);
116 void decode(bufferlist::iterator
&bl
) {
117 InodeStoreBase::decode(bl
, snap_blob
);
119 void encode_bare(bufferlist
&bl
, uint64_t features
) const {
120 InodeStoreBase::encode_bare(bl
, features
, &snap_blob
);
122 void decode_bare(bufferlist::iterator
&bl
) {
123 InodeStoreBase::decode_bare(bl
, snap_blob
);
126 static void generate_test_instances(std::list
<InodeStore
*>& ls
);
128 WRITE_CLASS_ENCODER_FEATURES(InodeStore
)
130 // cached inode wrapper
131 class CInode
: public MDSCacheObject
, public InodeStoreBase
, public Counter
<CInode
> {
133 MEMPOOL_CLASS_HELPERS();
// -- pins --
// Pin ids specific to CInode. pin_name() translates each id into a short
// label. The ids are sparse and some are negative.
// NOTE(review): the meaning of a negative id is defined outside this view
// (presumably by the MDSCacheObject ref-counting code) -- confirm before
// assigning new values.
static const int PIN_DIRFRAG            =  -1;
static const int PIN_CAPS               =   2;  // client caps
static const int PIN_IMPORTING          =  -4;  // importing
static const int PIN_OPENINGDIR         =   7;
static const int PIN_REMOTEPARENT       =   8;
static const int PIN_BATCHOPENJOURNAL   =   9;
static const int PIN_SCATTERED          =  10;
static const int PIN_STICKYDIRS         =  11;
//static const int PIN_PURGING          = -12;
static const int PIN_FREEZING           =  13;
static const int PIN_FROZEN             =  14;
static const int PIN_IMPORTINGCAPS      = -15;
static const int PIN_PASTSNAPPARENT     = -16;
static const int PIN_OPENINGSNAPPARENTS =  17;
static const int PIN_TRUNCATING         =  18;
static const int PIN_STRAY              =  19;  // we pin our stray inode while active
static const int PIN_NEEDSNAPFLUSH      =  20;
static const int PIN_DIRTYRSTAT         =  21;
static const int PIN_EXPORTINGCAPS      =  22;
static const int PIN_DIRTYPARENT        =  23;
static const int PIN_DIRWAITER          =  24;
static const int PIN_SCRUBQUEUE         =  25;
158 const char *pin_name(int p
) const override
{
160 case PIN_DIRFRAG
: return "dirfrag";
161 case PIN_CAPS
: return "caps";
162 case PIN_IMPORTING
: return "importing";
163 case PIN_OPENINGDIR
: return "openingdir";
164 case PIN_REMOTEPARENT
: return "remoteparent";
165 case PIN_BATCHOPENJOURNAL
: return "batchopenjournal";
166 case PIN_SCATTERED
: return "scattered";
167 case PIN_STICKYDIRS
: return "stickydirs";
168 //case PIN_PURGING: return "purging";
169 case PIN_FREEZING
: return "freezing";
170 case PIN_FROZEN
: return "frozen";
171 case PIN_IMPORTINGCAPS
: return "importingcaps";
172 case PIN_EXPORTINGCAPS
: return "exportingcaps";
173 case PIN_PASTSNAPPARENT
: return "pastsnapparent";
174 case PIN_OPENINGSNAPPARENTS
: return "openingsnapparents";
175 case PIN_TRUNCATING
: return "truncating";
176 case PIN_STRAY
: return "stray";
177 case PIN_NEEDSNAPFLUSH
: return "needsnapflush";
178 case PIN_DIRTYRSTAT
: return "dirtyrstat";
179 case PIN_DIRTYPARENT
: return "dirtyparent";
180 case PIN_DIRWAITER
: return "dirwaiter";
181 case PIN_SCRUBQUEUE
: return "scrubqueue";
182 default: return generic_pin_name(p
);
// -- state --
// Single-bit state flags, queried and modified through state_test(),
// state_set() and state_clear() elsewhere in this class.
// NOTE(review): bit positions 0, 1, 3, 4 and 6 are not assigned here;
// presumably they belong to the base class or are retired -- confirm before
// reusing them.
static const int STATE_EXPORTING       = 1 << 2;   // on nonauth bystander.
static const int STATE_OPENINGDIR      = 1 << 5;
static const int STATE_FREEZING        = 1 << 7;
static const int STATE_FROZEN          = 1 << 8;
static const int STATE_AMBIGUOUSAUTH   = 1 << 9;
static const int STATE_EXPORTINGCAPS   = 1 << 10;
static const int STATE_NEEDSRECOVER    = 1 << 11;
static const int STATE_RECOVERING      = 1 << 12;
static const int STATE_PURGING         = 1 << 13;
static const int STATE_DIRTYPARENT     = 1 << 14;
static const int STATE_DIRTYRSTAT      = 1 << 15;
static const int STATE_STRAYPINNED     = 1 << 16;
static const int STATE_FROZENAUTHPIN   = 1 << 17;
static const int STATE_DIRTYPOOL       = 1 << 18;
static const int STATE_REPAIRSTATS     = 1 << 19;
static const int STATE_MISSINGOBJS     = 1 << 20;
static const int STATE_EVALSTALECAPS   = 1 << 21;
static const int STATE_QUEUEDEXPORTPIN = 1 << 22;
205 // orphan inode needs notification of releasing reference
206 static const int STATE_ORPHAN
= STATE_NOTIFYREF
;
208 static const int MASK_STATE_EXPORTED
=
209 (STATE_DIRTY
|STATE_NEEDSRECOVER
|STATE_DIRTYPARENT
|STATE_DIRTYPOOL
);
210 static const int MASK_STATE_EXPORT_KEPT
=
211 (STATE_FROZEN
|STATE_AMBIGUOUSAUTH
|STATE_EXPORTINGCAPS
|STATE_QUEUEDEXPORTPIN
);
// -- waiters --
// Wait-mask bits; they share the uint64_t type of the `tag` argument taken
// by add_waiter() and take_waiting().
static const uint64_t WAIT_DIR    = 1 << 0;
static const uint64_t WAIT_FROZEN = 1 << 1;
static const uint64_t WAIT_TRUNC  = 1 << 2;
static const uint64_t WAIT_FLOCK  = 1 << 3;

// Every bit set.
static const uint64_t WAIT_ANY_MASK = static_cast<uint64_t>(-1);

// misc
static const unsigned EXPORT_NONCE = 1;  // nonce given to replicas created by export
224 ostream
& print_db_line_prefix(ostream
& out
) override
;
229 SnapRealm
*snaprealm
;
230 SnapRealm
*containing_realm
;
231 snapid_t first
, last
;
232 compact_set
<snapid_t
> dirty_old_rstats
;
234 class scrub_stamp_info_t
{
236 /// version we started our latest scrub (whether in-progress or finished)
237 version_t scrub_start_version
;
238 /// time we started our latest scrub (whether in-progress or finished)
239 utime_t scrub_start_stamp
;
240 /// version we started our most recent finished scrub
241 version_t last_scrub_version
;
242 /// time we started our most recent finished scrub
243 utime_t last_scrub_stamp
;
244 scrub_stamp_info_t() : scrub_start_version(0), last_scrub_version(0) {}
246 scrub_start_version
= last_scrub_version
= 0;
247 scrub_start_stamp
= last_scrub_stamp
= utime_t();
251 class scrub_info_t
: public scrub_stamp_info_t
{
253 CDentry
*scrub_parent
;
254 MDSInternalContextBase
*on_finish
;
256 bool last_scrub_dirty
; /// are our stamps dirty with respect to disk state?
257 bool scrub_in_progress
; /// are we currently scrubbing?
258 bool children_scrubbed
;
260 /// my own (temporary) stamps and versions for each dirfrag we have
261 std::map
<frag_t
, scrub_stamp_info_t
> dirfrag_stamps
;
263 ScrubHeaderRef header
;
265 scrub_info_t() : scrub_stamp_info_t(),
266 scrub_parent(NULL
), on_finish(NULL
),
267 last_scrub_dirty(false), scrub_in_progress(false),
268 children_scrubbed(false) {}
271 const scrub_info_t
*scrub_info() const{
277 ScrubHeaderRef
get_scrub_header() {
278 if (scrub_infop
== nullptr) {
281 return scrub_infop
->header
;
285 bool scrub_is_in_progress() const {
286 return (scrub_infop
&& scrub_infop
->scrub_in_progress
);
289 * Start scrubbing on this inode. That could be very short if it's
290 * a file, or take a long time if we're recursively scrubbing a directory.
291 * @pre It is not currently scrubbing
292 * @post it has set up internal scrubbing state
293 * @param scrub_version What version are we scrubbing at (usually, parent
294 * directory's get_projected_version())
296 void scrub_initialize(CDentry
*scrub_parent
,
297 ScrubHeaderRef
& header
,
298 MDSInternalContextBase
*f
);
300 * Get the next dirfrag to scrub. Gives you a frag_t in output param which
301 * you must convert to a CDir (and possibly load off disk).
302 * @param dir A pointer to frag_t, will be filled in with the next dirfrag to
303 * scrub if there is one.
304 * @returns 0 on success, you should scrub the passed-out frag_t right now;
305 * ENOENT: There are no remaining dirfrags to scrub
306 * <0 There was some other error (It will return -ENOTDIR if not a directory)
308 int scrub_dirfrag_next(frag_t
* out_dirfrag
);
310 * Get the currently scrubbing dirfrags. When returned, the
311 * passed-in list will be filled in with all frag_ts which have
312 * been returned from scrub_dirfrag_next but not sent back
313 * via scrub_dirfrag_finished.
315 void scrub_dirfrags_scrubbing(list
<frag_t
> *out_dirfrags
);
317 * Report to the CInode that a dirfrag it owns has been scrubbed. Call
318 * this for every frag_t returned from scrub_dirfrag_next().
319 * @param dirfrag The frag_t that was scrubbed
321 void scrub_dirfrag_finished(frag_t dirfrag
);
323 * Call this once the scrub has been completed, whether it's a full
324 * recursive scrub on a directory or simply the data on a file (or
325 * anything in between).
326 * @param c An out param which is filled in with a Context* that must
329 void scrub_finished(MDSInternalContextBase
**c
);
331 * Report to the CInode that all dirfrags it owns have been scrubbed.
333 void scrub_children_finished() {
334 scrub_infop
->children_scrubbed
= true;
336 void scrub_set_finisher(MDSInternalContextBase
*c
) {
337 assert(!scrub_infop
->on_finish
);
338 scrub_infop
->on_finish
= c
;
343 * Create a scrub_info_t struct for the scrub_infop pointer.
345 void scrub_info_create() const;
347 * Delete the scrub_info_t struct if it does not hold any useful data
349 void scrub_maybe_delete_info();
352 bool is_multiversion() const {
353 return snaprealm
|| // other snaprealms will link to me
354 inode
.is_dir() || // links to me in other snaps
355 inode
.nlink
> 1 || // there are remote links, possibly snapped, that will need to find me
356 !old_inodes
.empty(); // once multiversion, always multiversion. until old_inodes gets cleaned out.
358 snapid_t
get_oldest_snap();
360 uint64_t last_journaled
; // log offset for the last time i was journaled
361 //loff_t last_open_journaled; // log offset for the last journaled EOpen
362 utime_t last_dirstat_prop
;
365 // list item node for when we have unpropagated rstat data
366 elist
<CInode
*>::item dirty_rstat_item
;
368 bool is_dirty_rstat() {
369 return state_test(STATE_DIRTYRSTAT
);
371 void mark_dirty_rstat();
372 void clear_dirty_rstat();
374 //bool hack_accessed;
375 //utime_t hack_load_stamp;
378 * Projection methods, used to store inode changes until they have been journaled,
379 * at which point they are popped.
381 * project_inode as needed. If you're also projecting xattrs, pass
382 * in an xattr map (by pointer), then edit the map.
383 * If you're also projecting the snaprealm, call project_snaprealm after
384 * calling project_inode, and modify the snaprealm as necessary.
386 * Then, journal. Once journaling is done, pop_and_dirty_projected_inode.
387 * This function will take care of the inode itself, the xattrs, and the snaprealm.
390 struct projected_inode_t
{
392 std::map
<std::string
,bufferptr
> *xattrs
;
396 : inode(NULL
), xattrs(NULL
), snapnode(NULL
) {}
397 projected_inode_t(inode_t
*in
, sr_t
*sn
)
398 : inode(in
), xattrs(NULL
), snapnode(sn
) {}
399 projected_inode_t(inode_t
*in
, std::map
<std::string
, bufferptr
> *xp
= NULL
, sr_t
*sn
= NULL
)
400 : inode(in
), xattrs(xp
), snapnode(sn
) {}
402 std::list
<projected_inode_t
*> projected_nodes
; // projected values (only defined while dirty)
403 int num_projected_xattrs
;
404 int num_projected_srnodes
;
406 inode_t
*project_inode(std::map
<std::string
,bufferptr
> *px
=0);
407 void pop_and_dirty_projected_inode(LogSegment
*ls
);
409 projected_inode_t
*get_projected_node() {
410 if (projected_nodes
.empty())
413 return projected_nodes
.back();
416 version_t
get_projected_version() const {
417 if (projected_nodes
.empty())
418 return inode
.version
;
420 return projected_nodes
.back()->inode
->version
;
422 bool is_projected() const {
423 return !projected_nodes
.empty();
426 const inode_t
*get_projected_inode() const {
427 if (projected_nodes
.empty())
430 return projected_nodes
.back()->inode
;
432 inode_t
*get_projected_inode() {
433 if (projected_nodes
.empty())
436 return projected_nodes
.back()->inode
;
438 inode_t
*get_previous_projected_inode() {
439 assert(!projected_nodes
.empty());
440 std::list
<projected_inode_t
*>::reverse_iterator p
= projected_nodes
.rbegin();
442 if (p
!= projected_nodes
.rend())
448 std::map
<std::string
,bufferptr
> *get_projected_xattrs() {
449 if (num_projected_xattrs
> 0) {
450 for (std::list
<projected_inode_t
*>::reverse_iterator p
= projected_nodes
.rbegin();
451 p
!= projected_nodes
.rend();
458 std::map
<std::string
,bufferptr
> *get_previous_projected_xattrs() {
459 std::list
<projected_inode_t
*>::reverse_iterator p
= projected_nodes
.rbegin();
460 for (++p
; // skip the most recent projected value
461 p
!= projected_nodes
.rend();
468 sr_t
*project_snaprealm(snapid_t snapid
=0);
469 const sr_t
*get_projected_srnode() const {
470 if (num_projected_srnodes
> 0) {
471 for (std::list
<projected_inode_t
*>::const_reverse_iterator p
= projected_nodes
.rbegin();
472 p
!= projected_nodes
.rend();
475 return (*p
)->snapnode
;
478 return &snaprealm
->srnode
;
482 sr_t
*get_projected_srnode() {
483 if (num_projected_srnodes
> 0) {
484 for (std::list
<projected_inode_t
*>::reverse_iterator p
= projected_nodes
.rbegin();
485 p
!= projected_nodes
.rend();
488 return (*p
)->snapnode
;
491 return &snaprealm
->srnode
;
495 void project_past_snaprealm_parent(SnapRealm
*newparent
);
498 void pop_projected_snaprealm(sr_t
*next_snaprealm
);
501 old_inode_t
& cow_old_inode(snapid_t follows
, bool cow_head
);
502 void split_old_inode(snapid_t snap
);
503 old_inode_t
*pick_old_inode(snapid_t last
);
504 void pre_cow_old_inode();
505 void purge_stale_snap_data(const std::set
<snapid_t
>& snaps
);
507 // -- cache infrastructure --
509 compact_map
<frag_t
,CDir
*> dirfrags
; // cached dir fragments under this Inode
511 scrub_info_t
*scrub_infop
;
514 bool has_dirfrags() { return !dirfrags
.empty(); }
515 CDir
* get_dirfrag(frag_t fg
) {
516 if (dirfrags
.count(fg
)) {
517 //assert(g_conf->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME
522 bool get_dirfrags_under(frag_t fg
, std::list
<CDir
*>& ls
);
523 CDir
* get_approx_dirfrag(frag_t fg
);
524 void get_dirfrags(std::list
<CDir
*>& ls
);
525 void get_nested_dirfrags(std::list
<CDir
*>& ls
);
526 void get_subtree_dirfrags(std::list
<CDir
*>& ls
);
527 CDir
*get_or_open_dirfrag(MDCache
*mdcache
, frag_t fg
);
528 CDir
*add_dirfrag(CDir
*dir
);
529 void close_dirfrag(frag_t fg
);
530 void close_dirfrags();
531 bool has_subtree_root_dirfrag(int auth
=-1);
532 bool has_subtree_or_exporting_dirfrag();
534 void force_dirfrags();
535 void verify_dirfrags();
537 void get_stickydirs();
538 void put_stickydirs();
541 // parent dentries in cache
542 CDentry
*parent
; // primary link
543 compact_set
<CDentry
*> remote_parents
; // if hard linked
545 std::list
<CDentry
*> projected_parent
; // for in-progress rename, (un)link, etc.
547 mds_authority_t inode_auth
;
549 // -- distributed state --
552 std::map
<client_t
, Capability
*> client_caps
; // client -> caps
553 compact_map
<int32_t, int32_t> mds_caps_wanted
; // [auth] mds -> caps wanted
554 int replica_caps_wanted
; // [replica] what i've requested from auth
557 compact_map
<int, std::set
<client_t
> > client_snap_caps
; // [auth] [snap] dirty metadata we still need from the head
558 compact_map
<snapid_t
, std::set
<client_t
> > client_need_snapflush
;
560 void add_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
);
561 void remove_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
);
562 bool split_need_snapflush(CInode
*cowin
, CInode
*in
);
566 ceph_lock_state_t
*fcntl_locks
;
567 ceph_lock_state_t
*flock_locks
;
569 ceph_lock_state_t
*get_fcntl_lock_state() {
571 fcntl_locks
= new ceph_lock_state_t(g_ceph_context
, CEPH_LOCK_FCNTL
);
574 void clear_fcntl_lock_state() {
578 ceph_lock_state_t
*get_flock_lock_state() {
580 flock_locks
= new ceph_lock_state_t(g_ceph_context
, CEPH_LOCK_FLOCK
);
583 void clear_flock_lock_state() {
587 void clear_file_locks() {
588 clear_fcntl_lock_state();
589 clear_flock_lock_state();
591 void _encode_file_locks(bufferlist
& bl
) const {
592 bool has_fcntl_locks
= fcntl_locks
&& !fcntl_locks
->empty();
593 ::encode(has_fcntl_locks
, bl
);
595 ::encode(*fcntl_locks
, bl
);
596 bool has_flock_locks
= flock_locks
&& !flock_locks
->empty();
597 ::encode(has_flock_locks
, bl
);
599 ::encode(*flock_locks
, bl
);
601 void _decode_file_locks(bufferlist::iterator
& p
) {
602 bool has_fcntl_locks
;
603 ::decode(has_fcntl_locks
, p
);
605 ::decode(*get_fcntl_lock_state(), p
);
607 clear_fcntl_lock_state();
608 bool has_flock_locks
;
609 ::decode(has_flock_locks
, p
);
611 ::decode(*get_flock_lock_state(), p
);
613 clear_flock_lock_state();
616 // LogSegment lists i (may) belong to
618 elist
<CInode
*>::item item_dirty
;
619 elist
<CInode
*>::item item_caps
;
620 elist
<CInode
*>::item item_open_file
;
621 elist
<CInode
*>::item item_dirty_parent
;
622 elist
<CInode
*>::item item_dirty_dirfrag_dir
;
623 elist
<CInode
*>::item item_dirty_dirfrag_nest
;
624 elist
<CInode
*>::item item_dirty_dirfrag_dirfragtree
;
625 elist
<CInode
*>::item item_scrub
;
627 // also update RecoveryQueue::RecoveryQueue() if you change this
628 elist
<CInode
*>::item
& item_recover_queue
= item_dirty_dirfrag_dir
;
629 elist
<CInode
*>::item
& item_recover_queue_front
= item_dirty_dirfrag_nest
;
632 int auth_pin_freeze_allowance
;
634 inode_load_vec_t pop
;
639 friend class Migrator
;
640 friend class MDCache
;
641 friend class StrayManager
;
643 friend class CInodeExport
;
645 // ---------------------------
646 CInode(MDCache
*c
, bool auth
=true, snapid_t f
=2, snapid_t l
=CEPH_NOSNAP
) :
648 snaprealm(0), containing_realm(0),
650 last_journaled(0), //last_open_journaled(0),
651 //hack_accessed(true),
652 num_projected_xattrs(0),
653 num_projected_srnodes(0),
657 inode_auth(CDIR_AUTH_DEFAULT
),
658 replica_caps_wanted(0),
659 fcntl_locks(0), flock_locks(0),
660 item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this),
661 item_dirty_dirfrag_dir(this),
662 item_dirty_dirfrag_nest(this),
663 item_dirty_dirfrag_dirfragtree(this),
664 auth_pin_freeze_allowance(0),
665 pop(ceph_clock_now()),
666 versionlock(this, &versionlock_type
),
667 authlock(this, &authlock_type
),
668 linklock(this, &linklock_type
),
669 dirfragtreelock(this, &dirfragtreelock_type
),
670 filelock(this, &filelock_type
),
671 xattrlock(this, &xattrlock_type
),
672 snaplock(this, &snaplock_type
),
673 nestlock(this, &nestlock_type
),
674 flocklock(this, &flocklock_type
),
675 policylock(this, &policylock_type
),
676 loner_cap(-1), want_loner_cap(-1)
679 if (auth
) state_set(STATE_AUTH
);
685 assert(num_projected_xattrs
== 0);
686 assert(num_projected_srnodes
== 0);
691 bool is_root() const { return inode
.ino
== MDS_INO_ROOT
; }
692 bool is_stray() const { return MDS_INO_IS_STRAY(inode
.ino
); }
693 mds_rank_t
get_stray_owner() const {
694 return (mds_rank_t
)MDS_INO_STRAY_OWNER(inode
.ino
);
696 bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode
.ino
); }
697 bool is_base() const { return is_root() || is_mdsdir(); }
698 bool is_system() const { return inode
.ino
< MDS_INO_SYSTEM_BASE
; }
699 bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
701 bool is_head() const { return last
== CEPH_NOSNAP
; }
703 // note: this overloads MDSCacheObject
704 bool is_ambiguous_auth() const {
705 return state_test(STATE_AMBIGUOUSAUTH
) ||
706 MDSCacheObject::is_ambiguous_auth();
708 void set_ambiguous_auth() {
709 state_set(STATE_AMBIGUOUSAUTH
);
711 void clear_ambiguous_auth(std::list
<MDSInternalContextBase
*>& finished
);
712 void clear_ambiguous_auth();
714 inodeno_t
ino() const { return inode
.ino
; }
715 vinodeno_t
vino() const { return vinodeno_t(inode
.ino
, last
); }
716 int d_type() const { return IFTODT(inode
.mode
); }
718 inode_t
& get_inode() { return inode
; }
719 CDentry
* get_parent_dn() { return parent
; }
720 const CDentry
* get_parent_dn() const { return parent
; }
721 const CDentry
* get_projected_parent_dn() const { return !projected_parent
.empty() ? projected_parent
.back() : parent
; }
722 CDentry
* get_projected_parent_dn() { return !projected_parent
.empty() ? projected_parent
.back() : parent
; }
723 CDir
*get_parent_dir();
724 const CDir
*get_projected_parent_dir() const;
725 CDir
*get_projected_parent_dir();
726 CInode
*get_parent_inode();
728 bool is_lt(const MDSCacheObject
*r
) const override
{
729 const CInode
*o
= static_cast<const CInode
*>(r
);
730 return ino() < o
->ino() ||
731 (ino() == o
->ino() && last
< o
->last
);
735 bool is_projected_ancestor_of(CInode
*other
);
737 void make_path_string(std::string
& s
, bool projected
=false, const CDentry
*use_parent
=NULL
) const;
738 void make_path(filepath
& s
, bool projected
=false) const;
739 void name_stray_dentry(std::string
& dname
);
742 version_t
get_version() const { return inode
.version
; }
744 version_t
pre_dirty();
745 void _mark_dirty(LogSegment
*ls
);
746 void mark_dirty(version_t projected_dirv
, LogSegment
*ls
);
749 void store(MDSInternalContextBase
*fin
);
750 void _stored(int r
, version_t cv
, Context
*fin
);
752 * Flush a CInode to disk. This includes the backtrace, the parent
753 * directory's link, and the Inode object itself (if a base directory).
754 * @pre is_auth() on both the inode and its containing directory
755 * @pre can_auth_pin()
756 * @param fin The Context to call when the flush is completed.
758 void flush(MDSInternalContextBase
*fin
);
759 void fetch(MDSInternalContextBase
*fin
);
760 void _fetched(bufferlist
& bl
, bufferlist
& bl2
, Context
*fin
);
763 void build_backtrace(int64_t pool
, inode_backtrace_t
& bt
);
764 void store_backtrace(MDSInternalContextBase
*fin
, int op_prio
=-1);
765 void _stored_backtrace(int r
, version_t v
, Context
*fin
);
766 void fetch_backtrace(Context
*fin
, bufferlist
*backtrace
);
769 * Return the pool ID where we currently write backtraces for
770 * this inode (in addition to inode.old_pools)
772 * @returns a pool ID >=0
774 int64_t get_backtrace_pool() const;
776 void _mark_dirty_parent(LogSegment
*ls
, bool dirty_pool
=false);
777 void clear_dirty_parent();
778 void verify_diri_backtrace(bufferlist
&bl
, int err
);
779 bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT
); }
780 bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL
); }
782 void encode_snap_blob(bufferlist
&bl
);
783 void decode_snap_blob(bufferlist
&bl
);
784 void encode_store(bufferlist
& bl
, uint64_t features
);
785 void decode_store(bufferlist::iterator
& bl
);
787 void encode_replica(mds_rank_t rep
, bufferlist
& bl
, uint64_t features
, bool need_recover
) {
791 if (!is_replicated())
792 replicate_relax_locks();
794 __u32 nonce
= add_replica(rep
);
797 _encode_base(bl
, features
);
798 _encode_locks_state_for_replica(bl
, need_recover
);
800 void decode_replica(bufferlist::iterator
& p
, bool is_new
) {
803 replica_nonce
= nonce
;
806 _decode_locks_state(p
, is_new
);
811 compact_map
<frag_t
, std::list
<MDSInternalContextBase
*> > waiting_on_dir
;
813 void add_dir_waiter(frag_t fg
, MDSInternalContextBase
*c
);
814 void take_dir_waiting(frag_t fg
, std::list
<MDSInternalContextBase
*>& ls
);
815 bool is_waiting_for_dir(frag_t fg
) {
816 return waiting_on_dir
.count(fg
);
818 void add_waiter(uint64_t tag
, MDSInternalContextBase
*c
) override
;
819 void take_waiting(uint64_t tag
, std::list
<MDSInternalContextBase
*>& ls
) override
;
821 // -- encode/decode helpers --
822 void _encode_base(bufferlist
& bl
, uint64_t features
);
823 void _decode_base(bufferlist::iterator
& p
);
824 void _encode_locks_full(bufferlist
& bl
);
825 void _decode_locks_full(bufferlist::iterator
& p
);
826 void _encode_locks_state_for_replica(bufferlist
& bl
, bool need_recover
);
827 void _encode_locks_state_for_rejoin(bufferlist
& bl
, int rep
);
828 void _decode_locks_state(bufferlist::iterator
& p
, bool is_new
);
829 void _decode_locks_rejoin(bufferlist::iterator
& p
, std::list
<MDSInternalContextBase
*>& waiters
,
830 std::list
<SimpleLock
*>& eval_locks
, bool survivor
);
832 // -- import/export --
833 void encode_export(bufferlist
& bl
);
834 void finish_export(utime_t now
);
835 void abort_export() {
836 put(PIN_TEMPEXPORTING
);
837 assert(state_test(STATE_EXPORTINGCAPS
));
838 state_clear(STATE_EXPORTINGCAPS
);
839 put(PIN_EXPORTINGCAPS
);
841 void decode_import(bufferlist::iterator
& p
, LogSegment
*ls
);
844 // for giving to clients
845 int encode_inodestat(bufferlist
& bl
, Session
*session
, SnapRealm
*realm
,
846 snapid_t snapid
=CEPH_NOSNAP
, unsigned max_bytes
=0,
847 int getattr_wants
=0);
848 void encode_cap_message(MClientCaps
*m
, Capability
*cap
);
853 static LockType versionlock_type
;
854 static LockType authlock_type
;
855 static LockType linklock_type
;
856 static LockType dirfragtreelock_type
;
857 static LockType filelock_type
;
858 static LockType xattrlock_type
;
859 static LockType snaplock_type
;
860 static LockType nestlock_type
;
861 static LockType flocklock_type
;
862 static LockType policylock_type
;
864 LocalLock versionlock
;
867 ScatterLock dirfragtreelock
;
868 ScatterLock filelock
;
869 SimpleLock xattrlock
;
871 ScatterLock nestlock
;
872 SimpleLock flocklock
;
873 SimpleLock policylock
;
875 SimpleLock
* get_lock(int type
) override
{
877 case CEPH_LOCK_IFILE
: return &filelock
;
878 case CEPH_LOCK_IAUTH
: return &authlock
;
879 case CEPH_LOCK_ILINK
: return &linklock
;
880 case CEPH_LOCK_IDFT
: return &dirfragtreelock
;
881 case CEPH_LOCK_IXATTR
: return &xattrlock
;
882 case CEPH_LOCK_ISNAP
: return &snaplock
;
883 case CEPH_LOCK_INEST
: return &nestlock
;
884 case CEPH_LOCK_IFLOCK
: return &flocklock
;
885 case CEPH_LOCK_IPOLICY
: return &policylock
;
890 void set_object_info(MDSCacheObjectInfo
&info
) override
;
891 void encode_lock_state(int type
, bufferlist
& bl
) override
;
892 void decode_lock_state(int type
, bufferlist
& bl
) override
;
894 void _finish_frag_update(CDir
*dir
, MutationRef
& mut
);
896 void clear_dirty_scattered(int type
) override
;
897 bool is_dirty_scattered();
898 void clear_scatter_dirty(); // on rejoin ack
900 void start_scatter(ScatterLock
*lock
);
901 void finish_scatter_update(ScatterLock
*lock
, CDir
*dir
,
902 version_t inode_version
, version_t dir_accounted_version
);
903 void finish_scatter_gather_update(int type
);
904 void finish_scatter_gather_update_accounted(int type
, MutationRef
& mut
, EMetaBlob
*metablob
);
907 void open_snaprealm(bool no_split
=false);
908 void close_snaprealm(bool no_join
=false);
909 SnapRealm
*find_snaprealm() const;
910 void encode_snap(bufferlist
& bl
);
911 void decode_snap(bufferlist::iterator
& p
);
915 client_t loner_cap
, want_loner_cap
;
917 client_t
get_loner() const { return loner_cap
; }
918 client_t
get_wanted_loner() const { return want_loner_cap
; }
920 // this is the loner state our locks should aim for
921 client_t
get_target_loner() const {
922 if (loner_cap
== want_loner_cap
)
928 client_t
calc_ideal_loner();
929 void set_loner_cap(client_t l
);
930 bool choose_ideal_loner();
931 bool try_set_loner();
932 bool try_drop_loner();
934 // choose new lock state during recovery, based on issued caps
935 void choose_lock_state(SimpleLock
*lock
, int allissued
);
936 void choose_lock_states(int dirty_caps
);
938 int count_nonstale_caps() {
940 for (std::map
<client_t
,Capability
*>::iterator it
= client_caps
.begin();
941 it
!= client_caps
.end();
943 if (!it
->second
->is_stale())
947 bool multiple_nonstale_caps() {
949 for (std::map
<client_t
,Capability
*>::iterator it
= client_caps
.begin();
950 it
!= client_caps
.end();
952 if (!it
->second
->is_stale()) {
960 bool is_any_caps() { return !client_caps
.empty(); }
961 bool is_any_nonstale_caps() { return count_nonstale_caps(); }
963 const compact_map
<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted
; }
964 compact_map
<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted
; }
966 const std::map
<client_t
,Capability
*>& get_client_caps() const { return client_caps
; }
967 Capability
*get_client_cap(client_t client
) {
968 auto client_caps_entry
= client_caps
.find(client
);
969 if (client_caps_entry
!= client_caps
.end())
970 return client_caps_entry
->second
;
973 int get_client_cap_pending(client_t client
) const {
974 auto client_caps_entry
= client_caps
.find(client
);
975 if (client_caps_entry
!= client_caps
.end()) {
976 return client_caps_entry
->second
->pending();
982 Capability
*add_client_cap(client_t client
, Session
*session
, SnapRealm
*conrealm
=0);
983 void remove_client_cap(client_t client
);
984 void move_to_realm(SnapRealm
*realm
);
986 Capability
*reconnect_cap(client_t client
, const cap_reconnect_t
& icr
, Session
*session
);
987 void clear_client_caps_after_export();
988 void export_client_caps(std::map
<client_t
,Capability::Export
>& cl
);
991 int get_caps_liked() const;
992 int get_caps_allowed_ever() const;
993 int get_caps_allowed_by_type(int type
) const;
994 int get_caps_careful() const;
995 int get_xlocker_mask(client_t client
) const;
996 int get_caps_allowed_for_client(Session
*s
, inode_t
*file_i
) const;
998 // caps issued, wanted
999 int get_caps_issued(int *ploner
= 0, int *pother
= 0, int *pxlocker
= 0,
1000 int shift
= 0, int mask
= -1);
1001 bool is_any_caps_wanted() const;
1002 int get_caps_wanted(int *ploner
= 0, int *pother
= 0, int shift
= 0, int mask
= -1) const;
1003 bool issued_caps_need_gather(SimpleLock
*lock
);
1004 void replicate_relax_locks();
1007 mds_authority_t
authority() const override
;
1010 void adjust_nested_auth_pins(int a
, void *by
);
1011 bool can_auth_pin() const override
;
1012 void auth_pin(void *by
) override
;
1013 void auth_unpin(void *by
) override
;
1016 bool is_freezing_inode() const { return state_test(STATE_FREEZING
); }
1017 bool is_frozen_inode() const { return state_test(STATE_FROZEN
); }
1018 bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN
); }
1019 bool is_frozen() const override
;
1020 bool is_frozen_dir() const;
1021 bool is_freezing() const override
;
1023 /* Freeze the inode. auth_pin_allowance lets the caller account for any
1024 * auth_pins it is itself holding/responsible for. */
1025 bool freeze_inode(int auth_pin_allowance
=0);
1026 void unfreeze_inode(std::list
<MDSInternalContextBase
*>& finished
);
1027 void unfreeze_inode();
1029 void freeze_auth_pin();
1030 void unfreeze_auth_pin();
1032 // -- reference counting --
// Override of the base-class bad_put() hook.
// NOTE(review): presumably invoked when a put() for pin `by` is found to
// be invalid -- the calling condition is outside this extract.
// Streams a " bad put " line through generic_dout(0) containing *this, `by`,
// pin_name(by), the current `ref` and `ref_map`, and asserts that
// ref_map[by] > 0.
// NOTE(review): the end of the log statement, any preprocessor guards around
// the ref_map uses, and the closing brace are not visible in this extract.
1033 void bad_put(int by
) override
{
1034 generic_dout(0) << " bad put " << *this << " by " << by
<< " " << pin_name(by
) << " was " << ref
1036 << " (" << ref_map
<< ")"
1040 assert(ref_map
[by
] > 0);
// Override of the base-class bad_get() hook.
// NOTE(review): presumably invoked when a get() for pin `by` is found to
// be invalid -- the calling condition is outside this extract.
// Streams a " bad get " line through generic_dout(0) containing *this, `by`,
// pin_name(by), the current `ref` and `ref_map`, and asserts that
// ref_map[by] >= 0.
// NOTE(review): the end of the log statement, any preprocessor guards around
// the ref_map uses, and the closing brace are not visible in this extract.
1044 void bad_get(int by
) override
{
1045 generic_dout(0) << " bad get " << *this << " by " << by
<< " " << pin_name(by
) << " was " << ref
1047 << " (" << ref_map
<< ")"
1051 assert(ref_map
[by
] >= 0);
1054 void first_get() override
;
1055 void last_put() override
;
1056 void _put() override
;
1059 // -- hierarchy stuff --
// Asserts that no primary parent is currently recorded (parent == 0).
// NOTE(review): the rest of the body (presumably storing `p` into `parent`)
// and the closing brace are not visible in this extract -- confirm.
1061 void set_primary_parent(CDentry
*p
) {
1062 assert(parent
== 0);
// Asserts that `dn` is the currently recorded primary parent (dn == parent).
// NOTE(review): the rest of the body (presumably clearing `parent`) and the
// closing brace are not visible in this extract -- confirm.
1065 void remove_primary_parent(CDentry
*dn
) {
1066 assert(dn
== parent
);
1069 void add_remote_parent(CDentry
*p
);
1070 void remove_remote_parent(CDentry
*p
);
1071 int num_remote_parents() {
1072 return remote_parents
.size();
1075 void push_projected_parent(CDentry
*dn
) {
1076 projected_parent
.push_back(dn
);
// Asserts that projected_parent is non-empty, copies its front entry into
// `parent`, then removes that entry from projected_parent.
// NOTE(review): the closing brace is not visible in this extract.
1078 void pop_projected_parent() {
1079 assert(projected_parent
.size());
1080 parent
= projected_parent
.front();
1081 projected_parent
.pop_front();
1085 void maybe_export_pin(bool update
=false);
1086 void set_export_pin(mds_rank_t rank
);
1087 mds_rank_t
get_export_pin(bool inherit
=true) const;
1088 bool is_exportable(mds_rank_t dest
) const;
1090 void print(ostream
& out
) override
;
1091 void dump(Formatter
*f
) const;
1094 * @defgroup Scrubbing and fsck
1099 * Report the results of validation against a particular inode.
1100 * Each member is a pair of bools.
1101 * <member>.first represents if validation was performed against the member.
1102 * <member>.second represents if the member passed validation.
1103 * performed_validation is set to true if the validation was actually
1104 * run. It might not be run if, for instance, the inode is marked as dirty.
1105 * passed_validation is set to true if everything that was checked
1106 * passed its validation.
// Result record filled in by inode validation.
// NOTE(review): several member lines and all closing braces of the nested
// types are elided in this extract, so the exact nesting of the members
// below cannot be confirmed from here.
1108 struct validated_data
{
// Per-member status template; every flag below defaults to false and
// ondisk_read_retval defaults to 0.
1109 template<typename T
>struct member_status
{
1110 bool checked
= false;
1111 bool passed
= false;
1112 bool repaired
= false;
1113 int ondisk_read_retval
= 0;
// String stream for error text.
1116 std::stringstream error_str
;
// Overall flags; both are initialised to false by the constructor below.
1119 bool performed_validation
;
1120 bool passed_validation
;
// Helper aggregate used as the type argument of `raw_stats`.
1122 struct raw_stats_t
{
1123 frag_info_t dirstat
;
// One member_status instance per validated item.
1127 member_status
<inode_backtrace_t
> backtrace
;
1128 member_status
<inode_t
> inode
;
1129 member_status
<raw_stats_t
> raw_stats
;
// Default constructor: marks validation as neither performed nor passed.
1131 validated_data() : performed_validation(false),
1132 passed_validation(false) {}
// Const member functions declared here; bodies are outside this chunk.
1134 void dump(Formatter
*f
) const;
1136 bool all_damage_repaired() const;
1140 * Validate that the on-disk state of an inode matches what
1141 * we expect from our memory state. Currently this checks that:
1142 * 1) The backtrace associated with the file data exists and is correct
1143 * 2) For directories, the actual inode metadata matches our memory state,
1144 * 3) For directories, the rstats match
1146 * @param results A freshly-created validated_data struct, with values set
1147 * as described in the struct documentation.
1148 * @param mdr The request to be responded to upon the completion of the
1149 * validation (or NULL)
1150 * @param fin Context to call back on completion (or NULL)
1152 void validate_disk_state(validated_data
*results
,
1153 MDSInternalContext
*fin
);
// Static helper taking the validation results by const reference.
// NOTE(review): the remaining parameters and the end of this declaration
// are not visible in this extract.
1154 static void dump_validation_results(const validated_data
& results
,
1157 bool _validate_disk_state(class ValidationContinuation
*c
,
1158 int rval
, int stage
);
1159 friend class ValidationContinuation
;
1160 /** @} Scrubbing and fsck */
1163 ostream
& operator<<(ostream
& out
, const CInode::scrub_stamp_info_t
& si
);