1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
21 #include <string_view>
23 #include "common/config.h"
24 #include "include/counter.h"
25 #include "include/elist.h"
26 #include "include/types.h"
27 #include "include/lru.h"
28 #include "include/compact_set.h"
30 #include "MDSCacheObject.h"
31 #include "MDSContext.h"
36 #include "SimpleLock.h"
37 #include "ScatterLock.h"
38 #include "LocalLock.h"
39 #include "Capability.h"
40 #include "SnapRealm.h"
43 #include "messages/MClientCaps.h"
45 #define dout_context g_ceph_context
54 struct ObjectOperation
;
57 struct cinode_lock_info_t
{
63 * Base class for CInode, containing the backing store data and
64 * serialization methods. This exists so that we can read and
65 * handle CInodes from the backing store without hitting all
66 * the business logic in CInode proper.
68 class InodeStoreBase
{
70 typedef inode_t
<mempool::mds_co::pool_allocator
> mempool_inode
;
71 typedef old_inode_t
<mempool::mds_co::pool_allocator
> mempool_old_inode
;
72 typedef mempool::mds_co::compact_map
<snapid_t
, mempool_old_inode
> mempool_old_inode_map
;
73 typedef xattr_map
<mempool::mds_co::pool_allocator
> mempool_xattr_map
; // FIXME bufferptr not in mempool
78 bool is_file() const { return inode
.is_file(); }
79 bool is_symlink() const { return inode
.is_symlink(); }
80 bool is_dir() const { return inode
.is_dir(); }
81 static object_t
get_object_name(inodeno_t ino
, frag_t fg
, std::string_view suffix
);
83 /* Full serialization for use in ".inode" root inode objects */
84 void encode(bufferlist
&bl
, uint64_t features
, const bufferlist
*snap_blob
=NULL
) const;
85 void decode(bufferlist::const_iterator
&bl
, bufferlist
& snap_blob
);
87 /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
88 void encode_bare(bufferlist
&bl
, uint64_t features
, const bufferlist
*snap_blob
=NULL
) const;
89 void decode_bare(bufferlist::const_iterator
&bl
, bufferlist
&snap_blob
, __u8 struct_v
=5);
91 /* For test/debug output */
92 void dump(Formatter
*f
) const;
94 /* For use by offline tools */
95 __u32
hash_dentry_name(std::string_view dn
);
96 frag_t
pick_dirfrag(std::string_view dn
);
98 mempool_inode inode
; // the inode itself
99 mempool::mds_co::string symlink
; // symlink dest, if symlink
100 mempool_xattr_map xattrs
;
101 fragtree_t dirfragtree
; // dir frag tree, if any. always consistent with our dirfrag map.
102 mempool_old_inode_map old_inodes
; // key = last, value.first = first
103 snapid_t oldest_snap
= CEPH_NOSNAP
;
104 damage_flags_t damage_flags
= 0;
// Convenience overload for the mempool-backed xattr map: forwards to the
// decode_noshare template instantiated with mempool::mds_co::pool_allocator,
// passing xattrs and the iterator p through unchanged.
107 inline void decode_noshare(InodeStoreBase::mempool_xattr_map
& xattrs
,
108 ceph::buffer::list::const_iterator
&p
)
110 decode_noshare
<mempool::mds_co::pool_allocator
>(xattrs
, p
);
113 class InodeStore
: public InodeStoreBase
{
// Full encode of this store: forwards to InodeStoreBase::encode and supplies
// the address of this object's own snap_blob as the snap blob argument.
115 void encode(bufferlist
&bl
, uint64_t features
) const {
116 InodeStoreBase::encode(bl
, features
, &snap_blob
);
// Full decode: forwards to InodeStoreBase::decode, with this object's
// snap_blob member passed as the snap blob reference argument.
118 void decode(bufferlist::const_iterator
&bl
) {
119 InodeStoreBase::decode(bl
, snap_blob
);
// Bare encode: forwards to InodeStoreBase::encode_bare and supplies the
// address of this object's own snap_blob as the snap blob argument.
121 void encode_bare(bufferlist
&bl
, uint64_t features
) const {
122 InodeStoreBase::encode_bare(bl
, features
, &snap_blob
);
// Bare decode: forwards to InodeStoreBase::decode_bare with this object's
// snap_blob; struct_v is left at the base-class default.
124 void decode_bare(bufferlist::const_iterator
&bl
) {
125 InodeStoreBase::decode_bare(bl
, snap_blob
);
128 static void generate_test_instances(std::list
<InodeStore
*>& ls
);
130 // FIXME bufferlist not part of mempool
131 bufferlist snap_blob
; // Encoded copy of SnapRealm, because we can't
132 // rehydrate it without full MDCache
134 WRITE_CLASS_ENCODER_FEATURES(InodeStore
)
136 // just for ceph-dencoder
137 class InodeStoreBare
: public InodeStore
{
// encode() for this wrapper type is the *bare* form: it forwards to
// InodeStore::encode_bare.
139 void encode(bufferlist
&bl
, uint64_t features
) const {
140 InodeStore::encode_bare(bl
, features
);
// decode() for this wrapper type is the *bare* form: it forwards to
// InodeStore::decode_bare.
142 void decode(bufferlist::const_iterator
&bl
) {
143 InodeStore::decode_bare(bl
);
145 static void generate_test_instances(std::list
<InodeStoreBare
*>& ls
);
147 WRITE_CLASS_ENCODER_FEATURES(InodeStoreBare
)
149 // cached inode wrapper
150 class CInode
: public MDSCacheObject
, public InodeStoreBase
, public Counter
<CInode
> {
152 MEMPOOL_CLASS_HELPERS();
154 using mempool_cap_map
= mempool::mds_co::map
<client_t
, Capability
>;
156 * @defgroup Scrubbing and fsck
160 * Report the results of validation against a particular inode.
161 * Each member is a pair of bools.
162 * <member>.first represents if validation was performed against the member.
163 * <member>.second represents if the member passed validation.
164 * performed_validation is set to true if the validation was actually
165 * run. It might not be run if, for instance, the inode is marked as dirty.
166 * passed_validation is set to true if everything that was checked
167 * passed its validation.
169 struct validated_data
{
170 template<typename T
>struct member_status
{
171 bool checked
= false;
173 bool repaired
= false;
174 int ondisk_read_retval
= 0;
177 std::stringstream error_str
;
187 void dump(Formatter
*f
) const;
189 bool all_damage_repaired() const;
191 bool performed_validation
= false;
192 bool passed_validation
= false;
194 member_status
<inode_backtrace_t
> backtrace
;
195 member_status
<mempool_inode
> inode
; // XXX should not be in mempool; wait for pmr
196 member_status
<raw_stats_t
> raw_stats
;
202 friend class Migrator
;
203 friend class MDCache
;
204 friend class StrayManager
;
206 friend ostream
& operator<<(ostream
&, const CInode
&);
208 class scrub_stamp_info_t
{
210 scrub_stamp_info_t() {}
212 scrub_start_version
= last_scrub_version
= 0;
213 scrub_start_stamp
= last_scrub_stamp
= utime_t();
215 /// version we started our latest scrub (whether in-progress or finished)
216 version_t scrub_start_version
= 0;
217 /// time we started our latest scrub (whether in-progress or finished)
218 utime_t scrub_start_stamp
;
219 /// version we started our most recent finished scrub
220 version_t last_scrub_version
= 0;
221 /// time we started our most recent finished scrub
222 utime_t last_scrub_stamp
;
225 class scrub_info_t
: public scrub_stamp_info_t
{
229 CDentry
*scrub_parent
= nullptr;
230 MDSContext
*on_finish
= nullptr;
232 bool last_scrub_dirty
= false; /// are our stamps dirty with respect to disk state?
233 bool scrub_in_progress
= false; /// are we currently scrubbing?
234 bool children_scrubbed
= false;
236 /// my own (temporary) stamps and versions for each dirfrag we have
237 std::map
<frag_t
, scrub_stamp_info_t
> dirfrag_stamps
; // XXX not part of mempool
239 ScrubHeaderRef header
;
243 * Projection methods, used to store inode changes until they have been journaled,
244 * at which point they are popped.
246 * project_inode as needed. If you're changing xattrs or sr_t, then pass true
247 * as needed then change the xattrs/snapnode member as needed. (Dirty
248 * exception: project_past_snaprealm_parent allows you to project the
249 * snapnode after doing project_inode (i.e. you don't need to pass
252 * Then, journal. Once journaling is done, pop_and_dirty_projected_inode.
253 * This function will take care of the inode itself, the xattrs, and the snaprealm.
256 class projected_inode
{
258 static sr_t
* const UNDEF_SRNODE
;
260 projected_inode() = delete;
261 explicit projected_inode(const mempool_inode
&in
) : inode(in
) {}
264 std::unique_ptr
<mempool_xattr_map
> xattrs
;
265 sr_t
*snapnode
= UNDEF_SRNODE
;
269 static const int PIN_DIRFRAG
= -1;
270 static const int PIN_CAPS
= 2; // client caps
271 static const int PIN_IMPORTING
= -4; // importing
272 static const int PIN_OPENINGDIR
= 7;
273 static const int PIN_REMOTEPARENT
= 8;
274 static const int PIN_BATCHOPENJOURNAL
= 9;
275 static const int PIN_SCATTERED
= 10;
276 static const int PIN_STICKYDIRS
= 11;
277 //static const int PIN_PURGING = -12;
278 static const int PIN_FREEZING
= 13;
279 static const int PIN_FROZEN
= 14;
280 static const int PIN_IMPORTINGCAPS
= -15;
281 static const int PIN_PASTSNAPPARENT
= -16;
282 static const int PIN_OPENINGSNAPPARENTS
= 17;
283 static const int PIN_TRUNCATING
= 18;
284 static const int PIN_STRAY
= 19; // we pin our stray inode while active
285 static const int PIN_NEEDSNAPFLUSH
= 20;
286 static const int PIN_DIRTYRSTAT
= 21;
287 static const int PIN_EXPORTINGCAPS
= 22;
288 static const int PIN_DIRTYPARENT
= 23;
289 static const int PIN_DIRWAITER
= 24;
290 static const int PIN_SCRUBQUEUE
= 25;
293 static const int DUMP_INODE_STORE_BASE
= (1 << 0);
294 static const int DUMP_MDS_CACHE_OBJECT
= (1 << 1);
295 static const int DUMP_LOCKS
= (1 << 2);
296 static const int DUMP_STATE
= (1 << 3);
297 static const int DUMP_CAPS
= (1 << 4);
298 static const int DUMP_PATH
= (1 << 5);
299 static const int DUMP_DIRFRAGS
= (1 << 6);
300 static const int DUMP_ALL
= (-1);
301 static const int DUMP_DEFAULT
= DUMP_ALL
& (~DUMP_PATH
) & (~DUMP_DIRFRAGS
);
304 static const int STATE_EXPORTING
= (1<<0); // on nonauth bystander.
305 static const int STATE_OPENINGDIR
= (1<<1);
306 static const int STATE_FREEZING
= (1<<2);
307 static const int STATE_FROZEN
= (1<<3);
308 static const int STATE_AMBIGUOUSAUTH
= (1<<4);
309 static const int STATE_EXPORTINGCAPS
= (1<<5);
310 static const int STATE_NEEDSRECOVER
= (1<<6);
311 static const int STATE_RECOVERING
= (1<<7);
312 static const int STATE_PURGING
= (1<<8);
313 static const int STATE_DIRTYPARENT
= (1<<9);
314 static const int STATE_DIRTYRSTAT
= (1<<10);
315 static const int STATE_STRAYPINNED
= (1<<11);
316 static const int STATE_FROZENAUTHPIN
= (1<<12);
317 static const int STATE_DIRTYPOOL
= (1<<13);
318 static const int STATE_REPAIRSTATS
= (1<<14);
319 static const int STATE_MISSINGOBJS
= (1<<15);
320 static const int STATE_EVALSTALECAPS
= (1<<16);
321 static const int STATE_QUEUEDEXPORTPIN
= (1<<17);
322 static const int STATE_TRACKEDBYOFT
= (1<<18); // tracked by open file table
323 static const int STATE_DELAYEDEXPORTPIN
= (1<<19);
324 static const int STATE_DISTEPHEMERALPIN
= (1<<20);
325 static const int STATE_RANDEPHEMERALPIN
= (1<<21);
326 static const int STATE_CLIENTWRITEABLE
= (1<<22);
328 // orphan inode needs notification of releasing reference
329 static const int STATE_ORPHAN
= STATE_NOTIFYREF
;
331 static const int MASK_STATE_EXPORTED
=
332 (STATE_DIRTY
|STATE_NEEDSRECOVER
|STATE_DIRTYPARENT
|STATE_DIRTYPOOL
|
333 STATE_DISTEPHEMERALPIN
|STATE_RANDEPHEMERALPIN
);
334 static const int MASK_STATE_EXPORT_KEPT
=
335 (STATE_FROZEN
|STATE_AMBIGUOUSAUTH
|STATE_EXPORTINGCAPS
|
336 STATE_QUEUEDEXPORTPIN
|STATE_TRACKEDBYOFT
|STATE_DELAYEDEXPORTPIN
|
337 STATE_DISTEPHEMERALPIN
|STATE_RANDEPHEMERALPIN
);
339 /* These are for "permanent" state markers that are passed around between
340 * MDS. Nothing protects/updates it like a typical MDS lock.
342 * Currently, we just use this for REPLICATED inodes. The reason we need to
343 * replicate the random epin state is because the directory inode is still
344 * under the authority of the parent subtree. So it's not exported normally
345 * and we can't pass around the state that way. The importer of the dirfrags
346 * still needs to know that the inode is random pinned though otherwise it
347 * doesn't know that the dirfrags are pinned.
349 static const int MASK_STATE_REPLICATED
= STATE_RANDEPHEMERALPIN
;
352 static const uint64_t WAIT_DIR
= (1<<0);
353 static const uint64_t WAIT_FROZEN
= (1<<1);
354 static const uint64_t WAIT_TRUNC
= (1<<2);
355 static const uint64_t WAIT_FLOCK
= (1<<3);
357 static const uint64_t WAIT_ANY_MASK
= (uint64_t)(-1);
360 static const unsigned EXPORT_NONCE
= 1; // nonce given to replicas created by export
362 // ---------------------------
364 CInode(MDCache
*c
, bool auth
=true, snapid_t f
=2, snapid_t l
=CEPH_NOSNAP
);
369 ceph_assert(num_projected_xattrs
== 0);
370 ceph_assert(num_projected_srnodes
== 0);
371 ceph_assert(num_caps_notable
== 0);
372 ceph_assert(num_subtree_roots
== 0);
373 ceph_assert(num_exporting_dirs
== 0);
374 ceph_assert(batch_ops
.empty());
377 std::map
<int, std::unique_ptr
<BatchOp
>> batch_ops
;
379 std::string_view
pin_name(int p
) const override
;
381 ostream
& print_db_line_prefix(ostream
& out
) override
;
383 const scrub_info_t
*scrub_info() const{
389 ScrubHeaderRef
get_scrub_header() {
390 if (scrub_infop
== nullptr) {
393 return scrub_infop
->header
;
// True only when scrub_infop is non-null and its scrub_in_progress flag is
// set; a null scrub_infop yields false.
397 bool scrub_is_in_progress() const {
398 return (scrub_infop
&& scrub_infop
->scrub_in_progress
);
401 * Start scrubbing on this inode. That could be very short if it's
402 * a file, or take a long time if we're recursively scrubbing a directory.
403 * @pre It is not currently scrubbing
404 * @post it has set up internal scrubbing state
405 * @param scrub_version What version are we scrubbing at (usually, parent
406 * directory's get_projected_version())
408 void scrub_initialize(CDentry
*scrub_parent
,
409 ScrubHeaderRef
& header
,
412 * Get the next dirfrag to scrub. Gives you a frag_t in output param which
413 * you must convert to a CDir (and possibly load off disk).
414 * @param dir A pointer to frag_t, will be filled in with the next dirfrag to
415 * scrub if there is one.
416 * @returns 0 on success, you should scrub the passed-out frag_t right now;
417 * ENOENT: There are no remaining dirfrags to scrub
418 * <0 There was some other error (It will return -ENOTDIR if not a directory)
420 int scrub_dirfrag_next(frag_t
* out_dirfrag
);
422 * Get the currently scrubbing dirfrags. When returned, the
423 * passed-in list will be filled in with all frag_ts which have
424 * been returned from scrub_dirfrag_next but not sent back
425 * via scrub_dirfrag_finished.
427 void scrub_dirfrags_scrubbing(frag_vec_t
*out_dirfrags
);
429 * Report to the CInode that a dirfrag it owns has been scrubbed. Call
430 * this for every frag_t returned from scrub_dirfrag_next().
431 * @param dirfrag The frag_t that was scrubbed
433 void scrub_dirfrag_finished(frag_t dirfrag
);
435 * Call this once the scrub has been completed, whether it's a full
436 * recursive scrub on a directory or simply the data on a file (or
437 * anything in between).
438 * @param c An out param which is filled in with a Context* that must
441 void scrub_finished(MDSContext
**c
);
443 void scrub_aborted(MDSContext
**c
);
446 * Report to the CInode that all dirfrags it owns have been scrubbed.
// Sets scrub_infop->children_scrubbed to true.
// NOTE(review): scrub_infop is dereferenced without a null check here;
// presumably callers only invoke this while scrub state exists -- confirm.
448 void scrub_children_finished() {
449 scrub_infop
->children_scrubbed
= true;
// Stores c as scrub_infop->on_finish. Asserts that no on_finish context is
// already set, so it may be called at most once until on_finish is reset.
// NOTE(review): scrub_infop is dereferenced without a null check -- confirm
// callers guarantee it is allocated.
451 void scrub_set_finisher(MDSContext
*c
) {
452 ceph_assert(!scrub_infop
->on_finish
);
453 scrub_infop
->on_finish
= c
;
// True if any one of the four conditions below holds: a snaprealm is
// attached, the inode is a directory, nlink is greater than 1, or
// old_inodes is non-empty.
456 bool is_multiversion() const {
457 return snaprealm
|| // other snaprealms will link to me
458 inode
.is_dir() || // links to me in other snaps
459 inode
.nlink
> 1 || // there are remote links, possibly snapped, that will need to find me
460 !old_inodes
.empty(); // once multiversion, always multiversion. until old_inodes gets cleaned out.
462 snapid_t
get_oldest_snap();
// Reports the result of state_test(STATE_DIRTYRSTAT).
464 bool is_dirty_rstat() {
465 return state_test(STATE_DIRTYRSTAT
);
467 void mark_dirty_rstat();
468 void clear_dirty_rstat();
470 CInode::projected_inode
&project_inode(bool xattr
= false, bool snap
= false);
471 void pop_and_dirty_projected_inode(LogSegment
*ls
);
473 projected_inode
*get_projected_node() {
474 if (projected_nodes
.empty())
477 return &projected_nodes
.back();
// Version of the newest projection: when projected_nodes is empty this is
// inode.version, otherwise the version of projected_nodes.back().inode.
480 version_t
get_projected_version() const {
481 if (projected_nodes
.empty())
482 return inode
.version
;
484 return projected_nodes
.back().inode
.version
;
// True when projected_nodes holds at least one entry.
486 bool is_projected() const {
487 return !projected_nodes
.empty();
490 const mempool_inode
*get_projected_inode() const {
491 if (projected_nodes
.empty())
494 return &projected_nodes
.back().inode
;
496 mempool_inode
*get_projected_inode() {
497 if (projected_nodes
.empty())
500 return &projected_nodes
.back().inode
;
502 mempool_inode
*get_previous_projected_inode() {
503 ceph_assert(!projected_nodes
.empty());
504 auto it
= projected_nodes
.rbegin();
506 if (it
!= projected_nodes
.rend())
512 mempool_xattr_map
*get_projected_xattrs();
513 mempool_xattr_map
*get_previous_projected_xattrs();
515 sr_t
*prepare_new_srnode(snapid_t snapid
);
516 void project_snaprealm(sr_t
*new_srnode
);
517 sr_t
*project_snaprealm(snapid_t snapid
=0) {
518 sr_t
* new_srnode
= prepare_new_srnode(snapid
);
519 project_snaprealm(new_srnode
);
522 const sr_t
*get_projected_srnode() const;
524 void mark_snaprealm_global(sr_t
*new_srnode
);
525 void clear_snaprealm_global(sr_t
*new_srnode
);
526 bool is_projected_snaprealm_global() const;
528 void record_snaprealm_past_parent(sr_t
*new_snap
, SnapRealm
*newparent
);
529 void record_snaprealm_parent_dentry(sr_t
*new_snap
, SnapRealm
*newparent
,
530 CDentry
*dn
, bool primary_dn
);
531 void project_snaprealm_past_parent(SnapRealm
*newparent
);
532 void early_pop_projected_snaprealm();
534 mempool_old_inode
& cow_old_inode(snapid_t follows
, bool cow_head
);
535 void split_old_inode(snapid_t snap
);
536 mempool_old_inode
*pick_old_inode(snapid_t last
);
537 void pre_cow_old_inode();
538 bool has_snap_data(snapid_t s
);
539 void purge_stale_snap_data(const std::set
<snapid_t
>& snaps
);
541 size_t get_num_dirfrags() const { return dirfrags
.size(); }
542 CDir
* get_dirfrag(frag_t fg
) {
543 auto pi
= dirfrags
.find(fg
);
544 if (pi
!= dirfrags
.end()) {
545 //assert(g_conf()->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME
550 std::pair
<bool, std::vector
<CDir
*>> get_dirfrags_under(frag_t fg
);
551 CDir
* get_approx_dirfrag(frag_t fg
);
// Appends the mapped value (p.second) of every dirfrags entry to ls via
// push_back. When Container is exactly std::vector<CDir*>, capacity for the
// extra elements is reserved first; other container types skip the reserve.
553 template<typename Container
>
554 void get_dirfrags(Container
& ls
) const {
556 if constexpr (std::is_same_v
<Container
, std::vector
<CDir
*>>)
557 ls
.reserve(ls
.size() + dirfrags
.size());
558 for (const auto &p
: dirfrags
)
559 ls
.push_back(p
.second
);
562 auto get_dirfrags() const {
563 std::vector
<CDir
*> result
;
564 get_dirfrags(result
);
568 void get_nested_dirfrags(std::vector
<CDir
*>&) const;
569 std::vector
<CDir
*> get_nested_dirfrags() const {
570 std::vector
<CDir
*> v
;
571 get_nested_dirfrags(v
);
574 void get_subtree_dirfrags(std::vector
<CDir
*>&) const;
575 std::vector
<CDir
*> get_subtree_dirfrags() const {
576 std::vector
<CDir
*> v
;
577 get_subtree_dirfrags(v
);
// Accessor for the num_subtree_roots counter.
580 int get_num_subtree_roots() const {
581 return num_subtree_roots
;
584 CDir
*get_or_open_dirfrag(MDCache
*mdcache
, frag_t fg
);
585 CDir
*add_dirfrag(CDir
*dir
);
586 void close_dirfrag(frag_t fg
);
587 void close_dirfrags();
588 bool has_subtree_root_dirfrag(int auth
=-1);
589 bool has_subtree_or_exporting_dirfrag();
591 void force_dirfrags();
592 void verify_dirfrags();
594 void get_stickydirs();
595 void put_stickydirs();
597 void add_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
);
598 void remove_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
);
599 pair
<bool,bool> split_need_snapflush(CInode
*cowin
, CInode
*in
);
602 bool is_root() const { return ino() == CEPH_INO_ROOT
; }
603 bool is_stray() const { return MDS_INO_IS_STRAY(inode
.ino
); }
// Applies MDS_INO_STRAY_OWNER to this inode's number and casts the result to
// mds_rank_t.
// NOTE(review): no is_stray() check is made here -- presumably the result is
// only meaningful for stray inodes; confirm against callers.
604 mds_rank_t
get_stray_owner() const {
605 return (mds_rank_t
)MDS_INO_STRAY_OWNER(inode
.ino
);
607 bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode
.ino
); }
608 bool is_base() const { return MDS_INO_IS_BASE(inode
.ino
); }
609 bool is_system() const { return inode
.ino
< MDS_INO_SYSTEM_BASE
; }
610 bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
612 bool is_head() const { return last
== CEPH_NOSNAP
; }
614 // note: this overloads MDSCacheObject
// Ambiguous if either this inode has STATE_AMBIGUOUSAUTH set or the
// MDSCacheObject base implementation reports ambiguity.
615 bool is_ambiguous_auth() const {
616 return state_test(STATE_AMBIGUOUSAUTH
) ||
617 MDSCacheObject::is_ambiguous_auth();
// Sets the STATE_AMBIGUOUSAUTH bit via state_set.
619 void set_ambiguous_auth() {
620 state_set(STATE_AMBIGUOUSAUTH
);
622 void clear_ambiguous_auth(MDSContext::vec
& finished
);
623 void clear_ambiguous_auth();
625 inodeno_t
ino() const { return inode
.ino
; }
626 vinodeno_t
vino() const { return vinodeno_t(inode
.ino
, last
); }
627 int d_type() const { return IFTODT(inode
.mode
); }
629 mempool_inode
& get_inode() { return inode
; }
630 const mempool_inode
& get_inode() const { return inode
; }
631 CDentry
* get_parent_dn() { return parent
; }
632 const CDentry
* get_parent_dn() const { return parent
; }
633 CDentry
* get_projected_parent_dn() { return !projected_parent
.empty() ? projected_parent
.back() : parent
; }
634 const CDentry
* get_projected_parent_dn() const { return !projected_parent
.empty() ? projected_parent
.back() : parent
; }
635 const CDentry
* get_oldest_parent_dn() const {
638 return !projected_parent
.empty() ? projected_parent
.front(): NULL
;
640 CDir
*get_parent_dir();
641 const CDir
*get_projected_parent_dir() const;
642 CDir
*get_projected_parent_dir();
643 CInode
*get_parent_inode();
// Ordering predicate: compares by ino() first and, when the inode numbers
// are equal, by `last`.
// r is converted with static_cast (no runtime type check), so it has to
// point at a CInode.
645 bool is_lt(const MDSCacheObject
*r
) const override
{
646 const CInode
*o
= static_cast<const CInode
*>(r
);
647 return ino() < o
->ino() ||
648 (ino() == o
->ino() && last
< o
->last
);
652 bool is_ancestor_of(const CInode
*other
) const;
653 bool is_projected_ancestor_of(const CInode
*other
) const;
655 void make_path_string(std::string
& s
, bool projected
=false, const CDentry
*use_parent
=NULL
) const;
656 void make_path(filepath
& s
, bool projected
=false) const;
657 void name_stray_dentry(std::string
& dname
);
660 version_t
get_version() const { return inode
.version
; }
662 version_t
pre_dirty();
663 void _mark_dirty(LogSegment
*ls
);
664 void mark_dirty(version_t projected_dirv
, LogSegment
*ls
);
667 void store(MDSContext
*fin
);
668 void _stored(int r
, version_t cv
, Context
*fin
);
670 * Flush a CInode to disk. This includes the backtrace, the parent
671 * directory's link, and the Inode object itself (if a base directory).
672 * @pre is_auth() on both the inode and its containing directory
673 * @pre can_auth_pin()
674 * @param fin The Context to call when the flush is completed.
676 void flush(MDSContext
*fin
);
677 void fetch(MDSContext
*fin
);
678 void _fetched(bufferlist
& bl
, bufferlist
& bl2
, Context
*fin
);
680 void build_backtrace(int64_t pool
, inode_backtrace_t
& bt
);
681 void store_backtrace(MDSContext
*fin
, int op_prio
=-1);
682 void _stored_backtrace(int r
, version_t v
, Context
*fin
);
683 void fetch_backtrace(Context
*fin
, bufferlist
*backtrace
);
685 void mark_dirty_parent(LogSegment
*ls
, bool dirty_pool
=false);
686 void clear_dirty_parent();
687 void verify_diri_backtrace(bufferlist
&bl
, int err
);
688 bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT
); }
689 bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL
); }
691 void encode_snap_blob(bufferlist
&bl
);
692 void decode_snap_blob(const bufferlist
&bl
);
693 void encode_store(bufferlist
& bl
, uint64_t features
);
694 void decode_store(bufferlist::const_iterator
& bl
);
696 void add_dir_waiter(frag_t fg
, MDSContext
*c
);
697 void take_dir_waiting(frag_t fg
, MDSContext::vec
& ls
);
// True when waiting_on_dir contains an entry for fg (count() converted to
// bool).
698 bool is_waiting_for_dir(frag_t fg
) {
699 return waiting_on_dir
.count(fg
);
701 void add_waiter(uint64_t tag
, MDSContext
*c
) override
;
702 void take_waiting(uint64_t tag
, MDSContext::vec
& ls
) override
;
704 // -- encode/decode helpers --
705 void _encode_base(bufferlist
& bl
, uint64_t features
);
706 void _decode_base(bufferlist::const_iterator
& p
);
707 void _encode_locks_full(bufferlist
& bl
);
708 void _decode_locks_full(bufferlist::const_iterator
& p
);
709 void _encode_locks_state_for_replica(bufferlist
& bl
, bool need_recover
);
710 void _encode_locks_state_for_rejoin(bufferlist
& bl
, int rep
);
711 void _decode_locks_state_for_replica(bufferlist::const_iterator
& p
, bool is_new
);
712 void _decode_locks_rejoin(bufferlist::const_iterator
& p
, MDSContext::vec
& waiters
,
713 std::list
<SimpleLock
*>& eval_locks
, bool survivor
);
715 // -- import/export --
716 void encode_export(bufferlist
& bl
);
717 void finish_export();
// Rolls back export bookkeeping, in this order: drops the PIN_TEMPEXPORTING
// reference, asserts STATE_EXPORTINGCAPS is currently set, clears that state
// bit, then drops the PIN_EXPORTINGCAPS reference.
// (PIN_TEMPEXPORTING is not among the PIN_* constants visible in this part
// of the file; it is declared elsewhere.)
718 void abort_export() {
719 put(PIN_TEMPEXPORTING
);
720 ceph_assert(state_test(STATE_EXPORTINGCAPS
));
721 state_clear(STATE_EXPORTINGCAPS
);
722 put(PIN_EXPORTINGCAPS
);
724 void decode_import(bufferlist::const_iterator
& p
, LogSegment
*ls
);
726 // for giving to clients
727 int encode_inodestat(bufferlist
& bl
, Session
*session
, SnapRealm
*realm
,
728 snapid_t snapid
=CEPH_NOSNAP
, unsigned max_bytes
=0,
729 int getattr_wants
=0);
730 void encode_cap_message(const ref_t
<MClientCaps
> &m
, Capability
*cap
);
732 SimpleLock
* get_lock(int type
) override
;
734 void set_object_info(MDSCacheObjectInfo
&info
) override
;
736 void encode_lock_state(int type
, bufferlist
& bl
) override
;
737 void decode_lock_state(int type
, const bufferlist
& bl
) override
;
738 void encode_lock_iauth(bufferlist
& bl
);
739 void decode_lock_iauth(bufferlist::const_iterator
& p
);
740 void encode_lock_ilink(bufferlist
& bl
);
741 void decode_lock_ilink(bufferlist::const_iterator
& p
);
742 void encode_lock_idft(bufferlist
& bl
);
743 void decode_lock_idft(bufferlist::const_iterator
& p
);
744 void encode_lock_ifile(bufferlist
& bl
);
745 void decode_lock_ifile(bufferlist::const_iterator
& p
);
746 void encode_lock_inest(bufferlist
& bl
);
747 void decode_lock_inest(bufferlist::const_iterator
& p
);
748 void encode_lock_ixattr(bufferlist
& bl
);
749 void decode_lock_ixattr(bufferlist::const_iterator
& p
);
750 void encode_lock_isnap(bufferlist
& bl
);
751 void decode_lock_isnap(bufferlist::const_iterator
& p
);
752 void encode_lock_iflock(bufferlist
& bl
);
753 void decode_lock_iflock(bufferlist::const_iterator
& p
);
754 void encode_lock_ipolicy(bufferlist
& bl
);
755 void decode_lock_ipolicy(bufferlist::const_iterator
& p
);
757 void _finish_frag_update(CDir
*dir
, MutationRef
& mut
);
759 void clear_dirty_scattered(int type
) override
;
760 bool is_dirty_scattered();
761 void clear_scatter_dirty(); // on rejoin ack
763 void start_scatter(ScatterLock
*lock
);
764 void finish_scatter_update(ScatterLock
*lock
, CDir
*dir
,
765 version_t inode_version
, version_t dir_accounted_version
);
766 void finish_scatter_gather_update(int type
);
767 void finish_scatter_gather_update_accounted(int type
, MutationRef
& mut
, EMetaBlob
*metablob
);
770 void open_snaprealm(bool no_split
=false);
771 void close_snaprealm(bool no_join
=false);
772 SnapRealm
*find_snaprealm() const;
773 void encode_snap(bufferlist
& bl
);
774 void decode_snap(bufferlist::const_iterator
& p
);
776 client_t
get_loner() const { return loner_cap
; }
777 client_t
get_wanted_loner() const { return want_loner_cap
; }
779 // this is the loner state our locks should aim for
780 client_t
get_target_loner() const {
781 if (loner_cap
== want_loner_cap
)
787 client_t
calc_ideal_loner();
788 void set_loner_cap(client_t l
);
789 bool choose_ideal_loner();
790 bool try_set_loner();
791 bool try_drop_loner();
793 // choose new lock state during recovery, based on issued caps
794 void choose_lock_state(SimpleLock
*lock
, int allissued
);
795 void choose_lock_states(int dirty_caps
);
797 int count_nonstale_caps();
798 bool multiple_nonstale_caps();
800 bool is_any_caps() { return !client_caps
.empty(); }
801 bool is_any_nonstale_caps() { return count_nonstale_caps(); }
803 const mempool::mds_co::compact_map
<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted
; }
804 void set_mds_caps_wanted(mempool::mds_co::compact_map
<int32_t,int32_t>& m
);
805 void set_mds_caps_wanted(mds_rank_t mds
, int32_t wanted
);
807 const mempool_cap_map
& get_client_caps() const { return client_caps
; }
808 Capability
*get_client_cap(client_t client
) {
809 auto client_caps_entry
= client_caps
.find(client
);
810 if (client_caps_entry
!= client_caps
.end())
811 return &client_caps_entry
->second
;
814 int get_client_cap_pending(client_t client
) const {
815 auto client_caps_entry
= client_caps
.find(client
);
816 if (client_caps_entry
!= client_caps
.end()) {
817 return client_caps_entry
->second
.pending();
823 int get_num_caps_notable() const { return num_caps_notable
; }
824 void adjust_num_caps_notable(int d
);
826 Capability
*add_client_cap(client_t client
, Session
*session
,
827 SnapRealm
*conrealm
=nullptr, bool new_inode
=false);
828 void remove_client_cap(client_t client
);
829 void move_to_realm(SnapRealm
*realm
);
831 Capability
*reconnect_cap(client_t client
, const cap_reconnect_t
& icr
, Session
*session
);
832 void clear_client_caps_after_export();
833 void export_client_caps(std::map
<client_t
,Capability::Export
>& cl
);
836 int get_caps_liked() const;
837 int get_caps_allowed_ever() const;
838 int get_caps_allowed_by_type(int type
) const;
839 int get_caps_careful() const;
840 int get_xlocker_mask(client_t client
) const;
841 int get_caps_allowed_for_client(Session
*s
, Capability
*cap
, mempool_inode
*file_i
) const;
843 // caps issued, wanted
844 int get_caps_issued(int *ploner
= 0, int *pother
= 0, int *pxlocker
= 0,
845 int shift
= 0, int mask
= -1);
846 bool is_any_caps_wanted() const;
847 int get_caps_wanted(int *ploner
= 0, int *pother
= 0, int shift
= 0, int mask
= -1) const;
848 bool issued_caps_need_gather(SimpleLock
*lock
);
851 bool is_clientwriteable() const { return state
& STATE_CLIENTWRITEABLE
; }
852 void mark_clientwriteable();
853 void clear_clientwriteable();
856 mds_authority_t
authority() const override
;
859 bool can_auth_pin(int *err_ret
=nullptr) const override
;
860 void auth_pin(void *by
) override
;
861 void auth_unpin(void *by
) override
;
864 bool is_freezing_inode() const { return state_test(STATE_FREEZING
); }
865 bool is_frozen_inode() const { return state_test(STATE_FROZEN
); }
866 bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN
); }
867 bool is_frozen() const override
;
868 bool is_frozen_dir() const;
869 bool is_freezing() const override
;
871 /* Freeze the inode. auth_pin_allowance lets the caller account for any
872 * auth_pins it is itself holding/responsible for. */
873 bool freeze_inode(int auth_pin_allowance
=0);
874 void unfreeze_inode(MDSContext::vec
& finished
);
875 void unfreeze_inode();
877 void freeze_auth_pin();
878 void unfreeze_auth_pin();
880 // -- reference counting --
881 void bad_put(int by
) override
{
882 generic_dout(0) << " bad put " << *this << " by " << by
<< " " << pin_name(by
) << " was " << ref
884 << " (" << ref_map
<< ")"
888 ceph_assert(ref_map
[by
] > 0);
890 ceph_assert(ref
> 0);
892 void bad_get(int by
) override
{
893 generic_dout(0) << " bad get " << *this << " by " << by
<< " " << pin_name(by
) << " was " << ref
895 << " (" << ref_map
<< ")"
899 ceph_assert(ref_map
[by
] >= 0);
902 void first_get() override
;
903 void last_put() override
;
904 void _put() override
;
906 // -- hierarchy stuff --
907 void set_primary_parent(CDentry
*p
) {
908 ceph_assert(parent
== 0 ||
909 g_conf().get_val
<bool>("mds_hack_allow_loading_invalid_metadata"));
912 void remove_primary_parent(CDentry
*dn
) {
913 ceph_assert(dn
== parent
);
916 void add_remote_parent(CDentry
*p
);
917 void remove_remote_parent(CDentry
*p
);
// Number of entries in remote_parents; the container's size() result is
// converted to int on return.
918 int num_remote_parents() {
919 return remote_parents
.size();
// Appends dn to the back of projected_parent.
922 void push_projected_parent(CDentry
*dn
) {
923 projected_parent
.push_back(dn
);
// Asserts projected_parent is non-empty, then makes its front (oldest
// pushed) element the current `parent` and removes that element.
925 void pop_projected_parent() {
926 ceph_assert(projected_parent
.size());
927 parent
= projected_parent
.front();
928 projected_parent
.pop_front();
// True when projected_parent holds at least one entry.
930 bool is_parent_projected() const {
931 return !projected_parent
.empty();
934 mds_rank_t
get_export_pin(bool inherit
=true, bool ephemeral
=true) const;
935 void set_export_pin(mds_rank_t rank
);
936 void queue_export_pin(mds_rank_t target
);
937 void maybe_export_pin(bool update
=false);
939 void check_pin_policy();
941 void set_ephemeral_dist(bool yes
);
942 void maybe_ephemeral_dist(bool update
=false);
943 void maybe_ephemeral_dist_children(bool update
=false);
944 void setxattr_ephemeral_dist(bool val
=false);
// Reports the result of state_test(STATE_DISTEPHEMERALPIN).
945 bool is_ephemeral_dist() const {
946 return state_test(STATE_DISTEPHEMERALPIN
);
// Ephemeral random-pin interface (declarations only, except for the state
// query at the end).  Defaults: inherit=true, fresh=false, threshold=-1.0,
// prob=0.0.
// NOTE(review): the meaning of a negative threshold is not visible here --
// confirm in the definition of maybe_ephemeral_rand().
949 double get_ephemeral_rand(bool inherit
=true) const;
950 void set_ephemeral_rand(bool yes
);
951 void maybe_ephemeral_rand(bool fresh
=false, double threshold
=-1.0);
952 void setxattr_ephemeral_rand(double prob
=0.0);
// Reports whether STATE_RANDEPHEMERALPIN is set, as seen by state_test().
953 bool is_ephemeral_rand() const {
954 return state_test(STATE_RANDEPHEMERALPIN
);
// True when the inode returned by get_inode() has
// export_ephemeral_random_pin greater than 0.0, or has a truthy
// export_ephemeral_distributed_pin.
957 bool has_ephemeral_policy() const {
958 return get_inode().export_ephemeral_random_pin
> 0.0 ||
959 get_inode().export_ephemeral_distributed_pin
;
// True when either ephemeral pin state bit (STATE_DISTEPHEMERALPIN or
// STATE_RANDEPHEMERALPIN) is set, as seen by state_test().
961 bool is_ephemerally_pinned() const {
962 return state_test(STATE_DISTEPHEMERALPIN
) ||
963 state_test(STATE_RANDEPHEMERALPIN
);
// Declared here only; takes the candidate destination rank `dest`.
965 bool is_exportable(mds_rank_t dest
) const;
969 maybe_ephemeral_dist();
970 maybe_ephemeral_rand();
// Output helpers, declared here only.  print() is an override that writes
// to the stream `out`; dump() writes to the Formatter `f`, with `flags`
// defaulting to DUMP_DEFAULT.
973 void print(ostream
& out
) override
;
974 void dump(Formatter
*f
, int flags
= DUMP_DEFAULT
) const;
977 * Validate that the on-disk state of an inode matches what
978 * we expect from our memory state. Currently this checks that:
979 * 1) The backtrace associated with the file data exists and is correct
980 * 2) For directories, the actual inode metadata matches our memory state,
981 * 3) For directories, the rstats match
983 * @param results A freshly-created validated_data struct, with values set
984 * as described in the struct documentation.
985 * @param mdr The request to be responded to upon completion of the
986 * validation (or NULL)
987 * @param fin Context to call back on completion (or NULL)
989 void validate_disk_state(validated_data
*results
,
991 static void dump_validation_results(const validated_data
& results
,
994 //bool hack_accessed = false;
995 //utime_t hack_load_stamp;
// Snapshot-related and journaling bookkeeping members.  Both realm pointers
// start out null.
999 SnapRealm
*snaprealm
= nullptr;
1000 SnapRealm
*containing_realm
= nullptr;
1001 snapid_t first
, last
;
1002 mempool::mds_co::compact_set
<snapid_t
> dirty_old_rstats
;
1004 uint64_t last_journaled
= 0; // log offset at which this inode was last journaled
1005 //loff_t last_open_journaled; // log offset for the last journaled EOpen
1006 utime_t last_dirstat_prop
;
1008 // list item node used while this inode has unpropagated rstat data
1009 elist
<CInode
*>::item dirty_rstat_item
;
// Per-client snapshot bookkeeping: a plain set of clients, and a map from
// snapid to the set of clients recorded for that snapid.
1011 mempool::mds_co::set
<client_t
> client_snap_caps
;
1012 mempool::mds_co::compact_map
<snapid_t
, mempool::mds_co::set
<client_t
> > client_need_snapflush
;
1014 // elist nodes for the LogSegment lists this inode may belong to
1015 elist
<CInode
*>::item item_dirty
;
1016 elist
<CInode
*>::item item_caps
;
1017 elist
<CInode
*>::item item_open_file
;
1018 elist
<CInode
*>::item item_dirty_parent
;
1019 elist
<CInode
*>::item item_dirty_dirfrag_dir
;
1020 elist
<CInode
*>::item item_dirty_dirfrag_nest
;
1021 elist
<CInode
*>::item item_dirty_dirfrag_dirfragtree
;
1022 elist
<CInode
*>::item item_scrub
;
// The two recover-queue members are references, not separate nodes:
// item_recover_queue aliases item_dirty_dirfrag_dir and
// item_recover_queue_front aliases item_dirty_dirfrag_nest, so each pair
// shares a single list node.
1024 // also update RecoveryQueue::RecoveryQueue() if you change this
1025 elist
<CInode
*>::item
& item_recover_queue
= item_dirty_dirfrag_dir
;
1026 elist
<CInode
*>::item
& item_recover_queue_front
= item_dirty_dirfrag_nest
;
// Load vector for this inode plus an elist node.
// NOTE(review): item_pop_lru presumably links the inode into a
// popularity LRU list -- confirm at the list's owner.
1028 inode_load_vec_t pop
;
1029 elist
<CInode
*>::item item_pop_lru
;
// Static LockType descriptors shared by all instances, one per lock member
// of this class (the names match the lock members with a `_type` suffix).
1032 static LockType versionlock_type
;
1033 static LockType authlock_type
;
1034 static LockType linklock_type
;
1035 static LockType dirfragtreelock_type
;
1036 static LockType filelock_type
;
1037 static LockType xattrlock_type
;
1038 static LockType snaplock_type
;
1039 static LockType nestlock_type
;
1040 static LockType flocklock_type
;
1041 static LockType policylock_type
;
// Per-inode lock objects.  versionlock is a LocalLock; dirfragtreelock,
// filelock and nestlock are ScatterLocks; the rest are SimpleLocks.
1043 // FIXME not part of mempool
1044 LocalLock versionlock
;
1045 SimpleLock authlock
;
1046 SimpleLock linklock
;
1047 ScatterLock dirfragtreelock
;
1048 ScatterLock filelock
;
1049 SimpleLock xattrlock
;
1050 SimpleLock snaplock
;
1051 ScatterLock nestlock
;
1052 SimpleLock flocklock
;
1053 SimpleLock policylock
;
// Current and wanted loner client ids; both are initialised to -1.
// NOTE(review): -1 presumably means "no client" -- confirm.
1057 client_t loner_cap
= -1, want_loner_cap
= -1;
// Accessor for the fcntl lock state.  Assigns fcntl_locks a newly allocated
// ceph_lock_state_t built from g_ceph_context and CEPH_LOCK_FCNTL.
// NOTE(review): presumably the allocation only happens when fcntl_locks is
// still null -- confirm.
1060 ceph_lock_state_t
*get_fcntl_lock_state() {
1062 fcntl_locks
= new ceph_lock_state_t(g_ceph_context
, CEPH_LOCK_FCNTL
);
1065 void clear_fcntl_lock_state() {
// Accessor for the flock lock state; same shape as the fcntl accessor, but
// built with CEPH_LOCK_FLOCK and stored in flock_locks.
1069 ceph_lock_state_t
*get_flock_lock_state() {
1071 flock_locks
= new ceph_lock_state_t(g_ceph_context
, CEPH_LOCK_FLOCK
);
1074 void clear_flock_lock_state() {
// Clears both kinds of file lock state: fcntl first, then flock.
1078 void clear_file_locks() {
1079 clear_fcntl_lock_state();
1080 clear_flock_lock_state();
// Serialises the file lock state into `bl`.  For fcntl locks and then for
// flock locks it writes a bool presence flag, which is true only when the
// state pointer is non-null and the state is not empty, followed by the
// lock state itself when the flag is true.
1082 void _encode_file_locks(bufferlist
& bl
) const {
1084 bool has_fcntl_locks
= fcntl_locks
&& !fcntl_locks
->empty();
1085 encode(has_fcntl_locks
, bl
);
1086 if (has_fcntl_locks
)
1087 encode(*fcntl_locks
, bl
);
1088 bool has_flock_locks
= flock_locks
&& !flock_locks
->empty();
1089 encode(has_flock_locks
, bl
);
1090 if (has_flock_locks
)
1091 encode(*flock_locks
, bl
);
// Reads the file lock state from the iterator `p`, in the same order
// _encode_file_locks() writes it: a bool presence flag for fcntl locks,
// then one for flock locks.  When a flag is set, the lock state is decoded
// into the object returned by the matching get_*_lock_state() accessor.
1093 void _decode_file_locks(bufferlist::const_iterator
& p
) {
1095 bool has_fcntl_locks
;
1096 decode(has_fcntl_locks
, p
);
1097 if (has_fcntl_locks
)
1098 decode(*get_fcntl_lock_state(), p
);
1100 clear_fcntl_lock_state();
1101 bool has_flock_locks
;
1102 decode(has_flock_locks
, p
);
1103 if (has_flock_locks
)
1104 decode(*get_flock_lock_state(), p
);
1106 clear_flock_lock_state();
1110 * Return the pool ID where we currently write backtraces for
1111 * this inode (in addition to inode.old_pools)
1113 * @returns a pool ID >=0
// Declaration only; the definition is out of line.
1115 int64_t get_backtrace_pool() const;
1117 // parent dentries in cache
1118 CDentry
*parent
= nullptr; // primary link; null until one is set
1119 mempool::mds_co::compact_set
<CDentry
*> remote_parents
; // if hard linked
1121 mempool::mds_co::list
<CDentry
*> projected_parent
; // for in-progress rename, (un)link, etc.
// Authority for this inode; initialised to CDIR_AUTH_DEFAULT.
1123 mds_authority_t inode_auth
= CDIR_AUTH_DEFAULT
;
1125 // -- distributed state --
1126 // file capabilities
1127 mempool_cap_map client_caps
; // client -> caps
1128 mempool::mds_co::compact_map
<int32_t, int32_t> mds_caps_wanted
; // [auth] mds -> caps wanted
1129 int replica_caps_wanted
= 0; // [replica] what this inode has requested from auth
1130 int num_caps_notable
= 0;
// File lock state pointers; both start out null and are assigned by the
// get_*_lock_state() accessors.
1132 ceph_lock_state_t
*fcntl_locks
= nullptr;
1133 ceph_lock_state_t
*flock_locks
= nullptr;
// Vectors of MDSContext waiters, keyed by dirfrag.
1136 mempool::mds_co::compact_map
<frag_t
, MDSContext::vec
> waiting_on_dir
;
1139 // -- freezing inode --
// Counter initialised to 0, an elist node, and a helper declared here only.
// NOTE(review): the allowance is presumably the number of auth pins
// tolerated while freezing -- confirm in the freeze code.
1140 int auth_pin_freeze_allowance
= 0;
1141 elist
<CInode
*>::item item_freezing_inode
;
1142 void maybe_finish_freeze_inode();
// ValidationContinuation is granted access to this class's non-public
// members.
1145 friend class ValidationContinuation
;
1148 * Create a scrub_info_t struct for the scrub_infop pointer.
// Declared const even though it creates the struct.
// NOTE(review): this implies scrub_infop is mutable or cast in the
// definition -- confirm.
1150 void scrub_info_create() const;
1152 * Delete the scrub_info_t struct if it does not hold any useful data
// Helpers declared here only; their definitions are out of line.
1154 void scrub_maybe_delete_info();
1156 void pop_projected_snaprealm(sr_t
*next_snaprealm
, bool early
);
// Takes the continuation `c` together with a result code `rval` and a
// `stage` number.
// NOTE(review): presumably this is the per-stage step driven by
// ValidationContinuation -- confirm.
1158 bool _validate_disk_state(class ValidationContinuation
*c
,
1159 int rval
, int stage
);
1161 mempool::mds_co::list
<projected_inode
> projected_nodes
; // projected values (only defined while dirty)
// Counters initialised to 0.
// NOTE(review): presumably they count the projected_nodes entries that
// carry xattrs / srnodes -- confirm where they are updated.
1162 size_t num_projected_xattrs
= 0;
1163 size_t num_projected_srnodes
= 0;
1165 // -- cache infrastructure --
1166 mempool::mds_co::compact_map
<frag_t
,CDir
*> dirfrags
; // cached dir fragments under this inode, keyed by frag
1168 // counters for quickly determining whether there's a subtree root or exporting dir
1169 int num_subtree_roots
= 0;
1170 int num_exporting_dirs
= 0;
1172 int stickydir_ref
= 0;
// Null until a scrub_info_t is created for this inode.
1173 scrub_info_t
*scrub_infop
= nullptr;
1174 /** @} Scrubbing and fsck */
// Stream insertion operators for CInode and CInode::scrub_stamp_info_t.
// Declarations only; the definitions are out of line.
1177 ostream
& operator<<(ostream
& out
, const CInode
& in
);
1178 ostream
& operator<<(ostream
& out
, const CInode::scrub_stamp_info_t
& si
);
// Externally defined array of cinode_lock_info_t entries and an int count.
// NOTE(review): num_cinode_locks is presumably the length of
// cinode_lock_info -- confirm at the definition.
1180 extern cinode_lock_info_t cinode_lock_info
[];
1181 extern int num_cinode_locks
;