1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
21 #include <string_view>
23 #include "common/config.h"
24 #include "common/RefCountedObj.h"
25 #include "include/compat.h"
26 #include "include/counter.h"
27 #include "include/elist.h"
28 #include "include/types.h"
29 #include "include/lru.h"
30 #include "include/compact_set.h"
32 #include "MDSCacheObject.h"
33 #include "MDSContext.h"
38 #include "SimpleLock.h"
39 #include "ScatterLock.h"
40 #include "LocalLockC.h"
41 #include "Capability.h"
42 #include "SnapRealm.h"
45 #include "messages/MClientCaps.h"
47 #define dout_context g_ceph_context
56 struct ObjectOperation
;
59 struct cinode_lock_info_t
{
64 struct CInodeCommitOperation
{
66 CInodeCommitOperation(int prio
, int64_t po
)
67 : pool(po
), priority(prio
) {
69 CInodeCommitOperation(int prio
, int64_t po
, file_layout_t l
, uint64_t f
, std::string_view s
)
70 : pool(po
), priority(prio
), _layout(l
), _features(f
), _symlink(s
) {
71 update_layout_symlink
= true;
74 void update(ObjectOperation
&op
, inode_backtrace_t
&bt
);
75 int64_t get_pool() { return pool
; }
78 int64_t pool
; ///< pool id
80 bool update_layout_symlink
= false;
81 file_layout_t _layout
;
83 std::string_view _symlink
;
86 struct CInodeCommitOperations
{
87 std::vector
<CInodeCommitOperation
> ops_vec
;
94 * Base class for CInode, containing the backing store data and
95 * serialization methods. This exists so that we can read and
96 * handle CInodes from the backing store without hitting all
97 * the business logic in CInode proper.
99 class InodeStoreBase
{
101 using mempool_inode
= inode_t
<mempool::mds_co::pool_allocator
>;
102 using inode_ptr
= std::shared_ptr
<mempool_inode
>;
103 using inode_const_ptr
= std::shared_ptr
<const mempool_inode
>;
105 template <typename
...Args
>
106 static inode_ptr
allocate_inode(Args
&& ...args
) {
107 static mempool::mds_co::pool_allocator
<mempool_inode
> allocator
;
108 return std::allocate_shared
<mempool_inode
>(allocator
, std::forward
<Args
>(args
)...);
111 using mempool_xattr_map
= xattr_map
<mempool::mds_co::pool_allocator
>; // FIXME bufferptr not in mempool
112 using xattr_map_ptr
= std::shared_ptr
<mempool_xattr_map
>;
113 using xattr_map_const_ptr
= std::shared_ptr
<const mempool_xattr_map
>;
115 template <typename
...Args
>
116 static xattr_map_ptr
allocate_xattr_map(Args
&& ...args
) {
117 static mempool::mds_co::pool_allocator
<mempool_xattr_map
> allocator
;
118 return std::allocate_shared
<mempool_xattr_map
>(allocator
, std::forward
<Args
>(args
)...);
121 using mempool_old_inode
= old_inode_t
<mempool::mds_co::pool_allocator
>;
122 using mempool_old_inode_map
= mempool::mds_co::map
<snapid_t
, mempool_old_inode
>;
123 using old_inode_map_ptr
= std::shared_ptr
<mempool_old_inode_map
>;
124 using old_inode_map_const_ptr
= std::shared_ptr
<const mempool_old_inode_map
>;
126 template <typename
...Args
>
127 static old_inode_map_ptr
allocate_old_inode_map(Args
&& ...args
) {
128 static mempool::mds_co::pool_allocator
<mempool_old_inode_map
> allocator
;
129 return std::allocate_shared
<mempool_old_inode_map
>(allocator
, std::forward
<Args
>(args
)...);
132 void reset_inode(inode_const_ptr
&& ptr
) {
133 inode
= std::move(ptr
);
136 void reset_xattrs(xattr_map_const_ptr
&& ptr
) {
137 xattrs
= std::move(ptr
);
140 void reset_old_inodes(old_inode_map_const_ptr
&& ptr
) {
141 old_inodes
= std::move(ptr
);
144 void encode_xattrs(bufferlist
&bl
) const;
145 void decode_xattrs(bufferlist::const_iterator
&p
);
146 void encode_old_inodes(bufferlist
&bl
, uint64_t features
) const;
147 void decode_old_inodes(bufferlist::const_iterator
&p
);
150 static object_t
get_object_name(inodeno_t ino
, frag_t fg
, std::string_view suffix
);
152 /* Full serialization for use in ".inode" root inode objects */
153 void encode(ceph::buffer::list
&bl
, uint64_t features
, const ceph::buffer::list
*snap_blob
=NULL
) const;
154 void decode(ceph::buffer::list::const_iterator
&bl
, ceph::buffer::list
& snap_blob
);
156 /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
157 void encode_bare(ceph::buffer::list
&bl
, uint64_t features
, const ceph::buffer::list
*snap_blob
=NULL
) const;
158 void decode_bare(ceph::buffer::list::const_iterator
&bl
, ceph::buffer::list
&snap_blob
, __u8 struct_v
=5);
160 /* For test/debug output */
161 void dump(ceph::Formatter
*f
) const;
163 void decode_json(JSONObj
*obj
);
164 static void xattrs_cb(InodeStoreBase::mempool_xattr_map
& c
, JSONObj
*obj
);
165 static void old_indoes_cb(InodeStoreBase::mempool_old_inode_map
& c
, JSONObj
*obj
);
167 /* For use by offline tools */
168 __u32
hash_dentry_name(std::string_view dn
);
169 frag_t
pick_dirfrag(std::string_view dn
);
171 mempool::mds_co::string symlink
; // symlink dest, if symlink
172 fragtree_t dirfragtree
; // dir frag tree, if any. always consistent with our dirfrag map.
173 snapid_t oldest_snap
= CEPH_NOSNAP
;
174 damage_flags_t damage_flags
= 0;
177 static inode_const_ptr empty_inode
;
179 // Following members are pointers to constant data, the constant data can
180 // be shared by CInode and log events. To update these members in CInode,
181 // read-copy-update should be used.
182 inode_const_ptr inode
= empty_inode
;
183 xattr_map_const_ptr xattrs
;
184 old_inode_map_const_ptr old_inodes
; // key = last, value.first = first
187 inline void decode_noshare(InodeStoreBase::mempool_xattr_map
& xattrs
,
188 ceph::buffer::list::const_iterator
&p
)
190 decode_noshare
<mempool::mds_co::pool_allocator
>(xattrs
, p
);
193 class InodeStore
: public InodeStoreBase
{
195 mempool_inode
* get_inode() {
196 if (inode
== empty_inode
)
197 reset_inode(allocate_inode());
198 return const_cast<mempool_inode
*>(inode
.get());
200 mempool_xattr_map
* get_xattrs() { return const_cast<mempool_xattr_map
*>(xattrs
.get()); }
202 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
203 InodeStoreBase::encode(bl
, features
, &snap_blob
);
205 void decode(ceph::buffer::list::const_iterator
&bl
) {
206 InodeStoreBase::decode(bl
, snap_blob
);
208 void encode_bare(ceph::buffer::list
&bl
, uint64_t features
) const {
209 InodeStoreBase::encode_bare(bl
, features
, &snap_blob
);
211 void decode_bare(ceph::buffer::list::const_iterator
&bl
) {
212 InodeStoreBase::decode_bare(bl
, snap_blob
);
215 static void generate_test_instances(std::list
<InodeStore
*>& ls
);
217 using InodeStoreBase::inode
;
218 using InodeStoreBase::xattrs
;
219 using InodeStoreBase::old_inodes
;
221 // FIXME bufferlist not part of mempool
222 ceph::buffer::list snap_blob
; // Encoded copy of SnapRealm, because we can't
223 // rehydrate it without full MDCache
225 WRITE_CLASS_ENCODER_FEATURES(InodeStore
)
227 // just for ceph-dencoder
228 class InodeStoreBare
: public InodeStore
{
230 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
231 InodeStore::encode_bare(bl
, features
);
233 void decode(ceph::buffer::list::const_iterator
&bl
) {
234 InodeStore::decode_bare(bl
);
236 static void generate_test_instances(std::list
<InodeStoreBare
*>& ls
);
238 WRITE_CLASS_ENCODER_FEATURES(InodeStoreBare
)
240 // cached inode wrapper
241 class CInode
: public MDSCacheObject
, public InodeStoreBase
, public Counter
<CInode
> {
243 MEMPOOL_CLASS_HELPERS();
245 using mempool_cap_map
= mempool::mds_co::map
<client_t
, Capability
>;
247 * @defgroup Scrubbing and fsck
251 * Report the results of validation against a particular inode.
252 * Each member is a pair of bools.
253 * <member>.first represents if validation was performed against the member.
254 * <member>.second represents if the member passed validation.
255 * performed_validation is set to true if the validation was actually
256 * run. It might not be run if, for instance, the inode is marked as dirty.
257 * passed_validation is set to true if everything that was checked
258 * passed its validation.
260 struct validated_data
{
261 template<typename T
>struct member_status
{
262 bool checked
= false;
264 bool repaired
= false;
265 int ondisk_read_retval
= 0;
268 std::stringstream error_str
;
278 void dump(ceph::Formatter
*f
) const;
280 bool all_damage_repaired() const;
282 bool performed_validation
= false;
283 bool passed_validation
= false;
285 member_status
<inode_backtrace_t
> backtrace
;
286 member_status
<mempool_inode
> inode
; // XXX should not be in mempool; wait for pmr
287 member_status
<raw_stats_t
> raw_stats
;
293 friend class Migrator
;
294 friend class MDCache
;
295 friend class StrayManager
;
297 friend std::ostream
& operator<<(std::ostream
&, const CInode
&);
303 version_t last_scrub_version
= 0;
304 utime_t last_scrub_stamp
;
306 bool last_scrub_dirty
= false; /// are our stamps dirty with respect to disk state?
307 bool scrub_in_progress
= false; /// are we currently scrubbing?
309 fragset_t queued_frags
;
311 ScrubHeaderRef header
;
315 static const int PIN_DIRFRAG
= -1;
316 static const int PIN_CAPS
= 2; // client caps
317 static const int PIN_IMPORTING
= -4; // importing
318 static const int PIN_OPENINGDIR
= 7;
319 static const int PIN_REMOTEPARENT
= 8;
320 static const int PIN_BATCHOPENJOURNAL
= 9;
321 static const int PIN_SCATTERED
= 10;
322 static const int PIN_STICKYDIRS
= 11;
323 //static const int PIN_PURGING = -12;
324 static const int PIN_FREEZING
= 13;
325 static const int PIN_FROZEN
= 14;
326 static const int PIN_IMPORTINGCAPS
= -15;
327 static const int PIN_PASTSNAPPARENT
= -16;
328 static const int PIN_OPENINGSNAPPARENTS
= 17;
329 static const int PIN_TRUNCATING
= 18;
330 static const int PIN_STRAY
= 19; // we pin our stray inode while active
331 static const int PIN_NEEDSNAPFLUSH
= 20;
332 static const int PIN_DIRTYRSTAT
= 21;
333 static const int PIN_EXPORTINGCAPS
= 22;
334 static const int PIN_DIRTYPARENT
= 23;
335 static const int PIN_DIRWAITER
= 24;
338 static const int DUMP_INODE_STORE_BASE
= (1 << 0);
339 static const int DUMP_MDS_CACHE_OBJECT
= (1 << 1);
340 static const int DUMP_LOCKS
= (1 << 2);
341 static const int DUMP_STATE
= (1 << 3);
342 static const int DUMP_CAPS
= (1 << 4);
343 static const int DUMP_PATH
= (1 << 5);
344 static const int DUMP_DIRFRAGS
= (1 << 6);
345 static const int DUMP_ALL
= (-1);
346 static const int DUMP_DEFAULT
= DUMP_ALL
& (~DUMP_PATH
) & (~DUMP_DIRFRAGS
);
349 static const int STATE_EXPORTING
= (1<<0); // on nonauth bystander.
350 static const int STATE_OPENINGDIR
= (1<<1);
351 static const int STATE_FREEZING
= (1<<2);
352 static const int STATE_FROZEN
= (1<<3);
353 static const int STATE_AMBIGUOUSAUTH
= (1<<4);
354 static const int STATE_EXPORTINGCAPS
= (1<<5);
355 static const int STATE_NEEDSRECOVER
= (1<<6);
356 static const int STATE_RECOVERING
= (1<<7);
357 static const int STATE_PURGING
= (1<<8);
358 static const int STATE_DIRTYPARENT
= (1<<9);
359 static const int STATE_DIRTYRSTAT
= (1<<10);
360 static const int STATE_STRAYPINNED
= (1<<11);
361 static const int STATE_FROZENAUTHPIN
= (1<<12);
362 static const int STATE_DIRTYPOOL
= (1<<13);
363 static const int STATE_REPAIRSTATS
= (1<<14);
364 static const int STATE_MISSINGOBJS
= (1<<15);
365 static const int STATE_EVALSTALECAPS
= (1<<16);
366 static const int STATE_QUEUEDEXPORTPIN
= (1<<17);
367 static const int STATE_TRACKEDBYOFT
= (1<<18); // tracked by open file table
368 static const int STATE_DELAYEDEXPORTPIN
= (1<<19);
369 static const int STATE_DISTEPHEMERALPIN
= (1<<20);
370 static const int STATE_RANDEPHEMERALPIN
= (1<<21);
371 static const int STATE_CLIENTWRITEABLE
= (1<<22);
373 // orphan inode needs notification of releasing reference
374 static const int STATE_ORPHAN
= STATE_NOTIFYREF
;
376 static const int MASK_STATE_EXPORTED
=
377 (STATE_DIRTY
|STATE_NEEDSRECOVER
|STATE_DIRTYPARENT
|STATE_DIRTYPOOL
|
378 STATE_DISTEPHEMERALPIN
|STATE_RANDEPHEMERALPIN
);
379 static const int MASK_STATE_EXPORT_KEPT
=
380 (STATE_FROZEN
|STATE_AMBIGUOUSAUTH
|STATE_EXPORTINGCAPS
|
381 STATE_QUEUEDEXPORTPIN
|STATE_TRACKEDBYOFT
|STATE_DELAYEDEXPORTPIN
|
382 STATE_DISTEPHEMERALPIN
|STATE_RANDEPHEMERALPIN
);
384 /* These are for "permanent" state markers that are passed around between
385 * MDS. Nothing protects/updates it like a typical MDS lock.
387 * Currently, we just use this for REPLICATED inodes. The reason we need to
388 * replicate the random epin state is because the directory inode is still
389 * under the authority of the parent subtree. So it's not exported normally
390 * and we can't pass around the state that way. The importer of the dirfrags
391 * still needs to know that the inode is random pinned though otherwise it
392 * doesn't know that the dirfrags are pinned.
394 static const int MASK_STATE_REPLICATED
= STATE_RANDEPHEMERALPIN
;
397 static const uint64_t WAIT_DIR
= (1<<0);
398 static const uint64_t WAIT_FROZEN
= (1<<1);
399 static const uint64_t WAIT_TRUNC
= (1<<2);
400 static const uint64_t WAIT_FLOCK
= (1<<3);
402 static const uint64_t WAIT_ANY_MASK
= (uint64_t)(-1);
405 static const unsigned EXPORT_NONCE
= 1; // nonce given to replicas created by export
407 // ---------------------------
409 CInode(MDCache
*c
, bool auth
=true, snapid_t f
=2, snapid_t l
=CEPH_NOSNAP
);
414 ceph_assert(num_projected_srnodes
== 0);
415 ceph_assert(num_caps_notable
== 0);
416 ceph_assert(num_subtree_roots
== 0);
417 ceph_assert(num_exporting_dirs
== 0);
418 ceph_assert(batch_ops
.empty());
421 std::map
<int, std::unique_ptr
<BatchOp
>> batch_ops
;
423 std::string_view
pin_name(int p
) const override
;
425 std::ostream
& print_db_line_prefix(std::ostream
& out
) const override
;
427 const scrub_info_t
*scrub_info() const {
430 return scrub_infop
.get();
433 const ScrubHeaderRef
& get_scrub_header() {
434 static const ScrubHeaderRef nullref
;
435 return scrub_infop
? scrub_infop
->header
: nullref
;
438 bool scrub_is_in_progress() const {
439 return (scrub_infop
&& scrub_infop
->scrub_in_progress
);
442 * Start scrubbing on this inode. That could be very short if it's
443 * a file, or take a long time if we're recursively scrubbing a directory.
444 * @pre It is not currently scrubbing
445 * @post it has set up internal scrubbing state
446 * @param scrub_version What version are we scrubbing at (usually, parent
447 * directory's get_projected_version())
449 void scrub_initialize(ScrubHeaderRef
& header
);
451 * Call this once the scrub has been completed, whether it's a full
452 * recursive scrub on a directory or simply the data on a file (or
453 * anything in between).
454 * @param c An out param which is filled in with a Context* that must
457 void scrub_finished();
459 void scrub_aborted();
461 fragset_t
& scrub_queued_frags() {
462 ceph_assert(scrub_infop
);
463 return scrub_infop
->queued_frags
;
466 bool is_multiversion() const {
467 return snaprealm
|| // other snaprealms will link to me
468 get_inode()->is_dir() || // links to me in other snaps
469 get_inode()->nlink
> 1 || // there are remote links, possibly snapped, that will need to find me
470 is_any_old_inodes(); // once multiversion, always multiversion. until old_inodes gets cleaned out.
472 snapid_t
get_oldest_snap();
474 bool is_dirty_rstat() {
475 return state_test(STATE_DIRTYRSTAT
);
477 void mark_dirty_rstat();
478 void clear_dirty_rstat();
480 //bool hack_accessed = false;
481 //utime_t hack_load_stamp;
484 * Projection methods, used to store inode changes until they have been journaled,
485 * at which point they are popped.
487 * project_inode as needed. If you're changing xattrs or sr_t, then pass true
488 * as needed then change the xattrs/snapnode member as needed. (Dirty
489 * exception: project_past_snaprealm_parent allows you to project the
490 * snapnode after doing project_inode (i.e. you don't need to pass
493 * Then, journal. Once journaling is done, pop_and_dirty_projected_inode.
494 * This function will take care of the inode itself, the xattrs, and the snaprealm.
497 struct projected_inode
{
498 static sr_t
* const UNDEF_SRNODE
;
500 inode_ptr
const inode
;
501 xattr_map_ptr
const xattrs
;
502 sr_t
* const snapnode
;
504 projected_inode() = delete;
505 explicit projected_inode(inode_ptr
&& i
, xattr_map_ptr
&& x
, sr_t
*s
=nullptr) :
506 inode(std::move(i
)), xattrs(std::move(x
)), snapnode(s
) {}
508 projected_inode
project_inode(const MutationRef
& mut
,
509 bool xattr
= false, bool snap
= false);
511 void pop_and_dirty_projected_inode(LogSegment
*ls
, const MutationRef
& mut
);
513 version_t
get_projected_version() const {
514 if (projected_nodes
.empty())
515 return get_inode()->version
;
517 return projected_nodes
.back().inode
->version
;
519 bool is_projected() const {
520 return !projected_nodes
.empty();
523 const inode_const_ptr
& get_projected_inode() const {
524 if (projected_nodes
.empty())
527 return projected_nodes
.back().inode
;
529 // inode should have already been projected in caller's context
530 mempool_inode
* _get_projected_inode() {
531 ceph_assert(!projected_nodes
.empty());
532 return const_cast<mempool_inode
*>(projected_nodes
.back().inode
.get());
534 const inode_const_ptr
& get_previous_projected_inode() const {
535 ceph_assert(!projected_nodes
.empty());
536 auto it
= projected_nodes
.rbegin();
538 if (it
!= projected_nodes
.rend())
544 const xattr_map_const_ptr
& get_projected_xattrs() {
545 if (projected_nodes
.empty())
548 return projected_nodes
.back().xattrs
;
550 const xattr_map_const_ptr
& get_previous_projected_xattrs() {
551 ceph_assert(!projected_nodes
.empty());
552 auto it
= projected_nodes
.rbegin();
554 if (it
!= projected_nodes
.rend())
560 sr_t
*prepare_new_srnode(snapid_t snapid
);
561 void project_snaprealm(sr_t
*new_srnode
);
562 sr_t
*project_snaprealm(snapid_t snapid
=0) {
563 sr_t
* new_srnode
= prepare_new_srnode(snapid
);
564 project_snaprealm(new_srnode
);
567 const sr_t
*get_projected_srnode() const;
569 void mark_snaprealm_global(sr_t
*new_srnode
);
570 void clear_snaprealm_global(sr_t
*new_srnode
);
571 bool is_projected_snaprealm_global() const;
573 void record_snaprealm_past_parent(sr_t
*new_snap
, SnapRealm
*newparent
);
574 void record_snaprealm_parent_dentry(sr_t
*new_snap
, SnapRealm
*newparent
,
575 CDentry
*dn
, bool primary_dn
);
576 void project_snaprealm_past_parent(SnapRealm
*newparent
);
577 void early_pop_projected_snaprealm();
579 const mempool_old_inode
& cow_old_inode(snapid_t follows
, bool cow_head
);
580 void split_old_inode(snapid_t snap
);
581 snapid_t
pick_old_inode(snapid_t last
) const;
582 void pre_cow_old_inode();
583 bool has_snap_data(snapid_t s
);
584 void purge_stale_snap_data(const std::set
<snapid_t
>& snaps
);
586 size_t get_num_dirfrags() const { return dirfrags
.size(); }
587 CDir
* get_dirfrag(frag_t fg
) {
588 auto pi
= dirfrags
.find(fg
);
589 if (pi
!= dirfrags
.end()) {
590 //assert(g_conf()->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME
595 std::pair
<bool, std::vector
<CDir
*>> get_dirfrags_under(frag_t fg
);
596 CDir
* get_approx_dirfrag(frag_t fg
);
598 template<typename Container
>
599 void get_dirfrags(Container
& ls
) const {
601 if constexpr (std::is_same_v
<Container
, std::vector
<CDir
*>>)
602 ls
.reserve(ls
.size() + dirfrags
.size());
603 for (const auto &p
: dirfrags
)
604 ls
.push_back(p
.second
);
607 auto get_dirfrags() const {
608 std::vector
<CDir
*> result
;
609 get_dirfrags(result
);
613 void get_nested_dirfrags(std::vector
<CDir
*>&) const;
614 std::vector
<CDir
*> get_nested_dirfrags() const {
615 std::vector
<CDir
*> v
;
616 get_nested_dirfrags(v
);
619 void get_subtree_dirfrags(std::vector
<CDir
*>&) const;
620 std::vector
<CDir
*> get_subtree_dirfrags() const {
621 std::vector
<CDir
*> v
;
622 get_subtree_dirfrags(v
);
625 int get_num_subtree_roots() const {
626 return num_subtree_roots
;
629 CDir
*get_or_open_dirfrag(MDCache
*mdcache
, frag_t fg
);
630 CDir
*add_dirfrag(CDir
*dir
);
631 void close_dirfrag(frag_t fg
);
632 void close_dirfrags();
633 bool has_subtree_root_dirfrag(int auth
=-1);
634 bool has_subtree_or_exporting_dirfrag();
636 void force_dirfrags();
637 void verify_dirfrags();
639 void get_stickydirs();
640 void put_stickydirs();
642 void add_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
);
643 void remove_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
);
644 std::pair
<bool,bool> split_need_snapflush(CInode
*cowin
, CInode
*in
);
648 inodeno_t
ino() const { return get_inode()->ino
; }
649 vinodeno_t
vino() const { return vinodeno_t(ino(), last
); }
650 int d_type() const { return IFTODT(get_inode()->mode
); }
651 bool is_root() const { return ino() == CEPH_INO_ROOT
; }
652 bool is_stray() const { return MDS_INO_IS_STRAY(ino()); }
653 mds_rank_t
get_stray_owner() const {
654 return (mds_rank_t
)MDS_INO_STRAY_OWNER(ino());
656 bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(ino()); }
657 bool is_base() const { return MDS_INO_IS_BASE(ino()); }
658 bool is_system() const { return ino() < MDS_INO_SYSTEM_BASE
; }
659 bool is_lost_and_found() const { return ino() == CEPH_INO_LOST_AND_FOUND
; }
660 bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
661 bool is_file() const { return get_inode()->is_file(); }
662 bool is_symlink() const { return get_inode()->is_symlink(); }
663 bool is_dir() const { return get_inode()->is_dir(); }
665 bool is_head() const { return last
== CEPH_NOSNAP
; }
667 // note: this overloads MDSCacheObject
668 bool is_ambiguous_auth() const {
669 return state_test(STATE_AMBIGUOUSAUTH
) ||
670 MDSCacheObject::is_ambiguous_auth();
672 void set_ambiguous_auth() {
673 state_set(STATE_AMBIGUOUSAUTH
);
675 void clear_ambiguous_auth(MDSContext::vec
& finished
);
676 void clear_ambiguous_auth();
678 const inode_const_ptr
& get_inode() const {
682 // only used for updating newly allocated CInode
683 mempool_inode
* _get_inode() {
684 if (inode
== empty_inode
)
685 reset_inode(allocate_inode());
686 return const_cast<mempool_inode
*>(inode
.get());
689 const xattr_map_const_ptr
& get_xattrs() const { return xattrs
; }
691 bool is_any_old_inodes() const { return old_inodes
&& !old_inodes
->empty(); }
692 const old_inode_map_const_ptr
& get_old_inodes() const { return old_inodes
; }
694 CDentry
* get_parent_dn() { return parent
; }
695 const CDentry
* get_parent_dn() const { return parent
; }
696 CDentry
* get_projected_parent_dn() { return !projected_parent
.empty() ? projected_parent
.back() : parent
; }
697 const CDentry
* get_projected_parent_dn() const { return !projected_parent
.empty() ? projected_parent
.back() : parent
; }
698 const CDentry
* get_oldest_parent_dn() const {
701 return !projected_parent
.empty() ? projected_parent
.front(): NULL
;
703 CDir
*get_parent_dir();
704 const CDir
*get_projected_parent_dir() const;
705 CDir
*get_projected_parent_dir();
706 CInode
*get_parent_inode();
708 bool is_lt(const MDSCacheObject
*r
) const override
{
709 const CInode
*o
= static_cast<const CInode
*>(r
);
710 return ino() < o
->ino() ||
711 (ino() == o
->ino() && last
< o
->last
);
715 bool is_ancestor_of(const CInode
*other
) const;
716 bool is_projected_ancestor_of(const CInode
*other
) const;
718 void make_path_string(std::string
& s
, bool projected
=false, const CDentry
*use_parent
=NULL
) const;
719 void make_path(filepath
& s
, bool projected
=false) const;
720 void name_stray_dentry(std::string
& dname
);
723 version_t
get_version() const { return get_inode()->version
; }
725 version_t
pre_dirty();
726 void _mark_dirty(LogSegment
*ls
);
727 void mark_dirty(LogSegment
*ls
);
730 void store(MDSContext
*fin
);
731 void _stored(int r
, version_t cv
, Context
*fin
);
733 * Flush a CInode to disk. This includes the backtrace, the parent
734 * directory's link, and the Inode object itself (if a base directory).
735 * @pre is_auth() on both the inode and its containing directory
736 * @pre can_auth_pin()
737 * @param fin The Context to call when the flush is completed.
739 void flush(MDSContext
*fin
);
740 void fetch(MDSContext
*fin
);
741 void _fetched(ceph::buffer::list
& bl
, ceph::buffer::list
& bl2
, Context
*fin
);
743 void _commit_ops(int r
, C_GatherBuilder
&gather_bld
,
744 std::vector
<CInodeCommitOperation
> &ops_vec
,
745 inode_backtrace_t
&bt
);
746 void build_backtrace(int64_t pool
, inode_backtrace_t
& bt
);
747 void _store_backtrace(std::vector
<CInodeCommitOperation
> &ops_vec
,
748 inode_backtrace_t
&bt
, int op_prio
);
749 void store_backtrace(CInodeCommitOperations
&op
, int op_prio
);
750 void store_backtrace(MDSContext
*fin
, int op_prio
=-1);
751 void _stored_backtrace(int r
, version_t v
, Context
*fin
);
752 void fetch_backtrace(Context
*fin
, ceph::buffer::list
*backtrace
);
754 void mark_dirty_parent(LogSegment
*ls
, bool dirty_pool
=false);
755 void clear_dirty_parent();
756 void verify_diri_backtrace(ceph::buffer::list
&bl
, int err
);
757 bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT
); }
758 bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL
); }
760 void encode_snap_blob(ceph::buffer::list
&bl
);
761 void decode_snap_blob(const ceph::buffer::list
&bl
);
762 void encode_store(ceph::buffer::list
& bl
, uint64_t features
);
763 void decode_store(ceph::buffer::list::const_iterator
& bl
);
765 void add_dir_waiter(frag_t fg
, MDSContext
*c
);
766 void take_dir_waiting(frag_t fg
, MDSContext::vec
& ls
);
767 bool is_waiting_for_dir(frag_t fg
) {
768 return waiting_on_dir
.count(fg
);
770 void add_waiter(uint64_t tag
, MDSContext
*c
) override
;
771 void take_waiting(uint64_t tag
, MDSContext::vec
& ls
) override
;
773 // -- encode/decode helpers --
774 void _encode_base(ceph::buffer::list
& bl
, uint64_t features
);
775 void _decode_base(ceph::buffer::list::const_iterator
& p
);
776 void _encode_locks_full(ceph::buffer::list
& bl
);
777 void _decode_locks_full(ceph::buffer::list::const_iterator
& p
);
778 void _encode_locks_state_for_replica(ceph::buffer::list
& bl
, bool need_recover
);
779 void _encode_locks_state_for_rejoin(ceph::buffer::list
& bl
, int rep
);
780 void _decode_locks_state_for_replica(ceph::buffer::list::const_iterator
& p
, bool is_new
);
781 void _decode_locks_rejoin(ceph::buffer::list::const_iterator
& p
, MDSContext::vec
& waiters
,
782 std::list
<SimpleLock
*>& eval_locks
, bool survivor
);
784 // -- import/export --
785 void encode_export(ceph::buffer::list
& bl
);
786 void finish_export();
787 void abort_export() {
788 put(PIN_TEMPEXPORTING
);
789 ceph_assert(state_test(STATE_EXPORTINGCAPS
));
790 state_clear(STATE_EXPORTINGCAPS
);
791 put(PIN_EXPORTINGCAPS
);
793 void decode_import(ceph::buffer::list::const_iterator
& p
, LogSegment
*ls
);
795 // for giving to clients
796 int encode_inodestat(ceph::buffer::list
& bl
, Session
*session
, SnapRealm
*realm
,
797 snapid_t snapid
=CEPH_NOSNAP
, unsigned max_bytes
=0,
798 int getattr_wants
=0);
799 void encode_cap_message(const ceph::ref_t
<MClientCaps
> &m
, Capability
*cap
);
801 SimpleLock
* get_lock(int type
) override
;
803 void set_object_info(MDSCacheObjectInfo
&info
) override
;
805 void encode_lock_state(int type
, ceph::buffer::list
& bl
) override
;
806 void decode_lock_state(int type
, const ceph::buffer::list
& bl
) override
;
807 void encode_lock_iauth(ceph::buffer::list
& bl
);
808 void decode_lock_iauth(ceph::buffer::list::const_iterator
& p
);
809 void encode_lock_ilink(ceph::buffer::list
& bl
);
810 void decode_lock_ilink(ceph::buffer::list::const_iterator
& p
);
811 void encode_lock_idft(ceph::buffer::list
& bl
);
812 void decode_lock_idft(ceph::buffer::list::const_iterator
& p
);
813 void encode_lock_ifile(ceph::buffer::list
& bl
);
814 void decode_lock_ifile(ceph::buffer::list::const_iterator
& p
);
815 void encode_lock_inest(ceph::buffer::list
& bl
);
816 void decode_lock_inest(ceph::buffer::list::const_iterator
& p
);
817 void encode_lock_ixattr(ceph::buffer::list
& bl
);
818 void decode_lock_ixattr(ceph::buffer::list::const_iterator
& p
);
819 void encode_lock_isnap(ceph::buffer::list
& bl
);
820 void decode_lock_isnap(ceph::buffer::list::const_iterator
& p
);
821 void encode_lock_iflock(ceph::buffer::list
& bl
);
822 void decode_lock_iflock(ceph::buffer::list::const_iterator
& p
);
823 void encode_lock_ipolicy(ceph::buffer::list
& bl
);
824 void decode_lock_ipolicy(ceph::buffer::list::const_iterator
& p
);
826 void _finish_frag_update(CDir
*dir
, MutationRef
& mut
);
828 void clear_dirty_scattered(int type
) override
;
829 bool is_dirty_scattered();
830 void clear_scatter_dirty(); // on rejoin ack
832 void start_scatter(ScatterLock
*lock
);
833 void finish_scatter_update(ScatterLock
*lock
, CDir
*dir
,
834 version_t inode_version
, version_t dir_accounted_version
);
835 void finish_scatter_gather_update(int type
, MutationRef
& mut
);
836 void finish_scatter_gather_update_accounted(int type
, EMetaBlob
*metablob
);
839 void open_snaprealm(bool no_split
=false);
840 void close_snaprealm(bool no_join
=false);
841 SnapRealm
*find_snaprealm() const;
842 void encode_snap(ceph::buffer::list
& bl
);
843 void decode_snap(ceph::buffer::list::const_iterator
& p
);
845 client_t
get_loner() const { return loner_cap
; }
846 client_t
get_wanted_loner() const { return want_loner_cap
; }
848 // this is the loner state our locks should aim for
849 client_t
get_target_loner() const {
850 if (loner_cap
== want_loner_cap
)
856 client_t
calc_ideal_loner();
857 void set_loner_cap(client_t l
);
858 bool choose_ideal_loner();
859 bool try_set_loner();
860 bool try_drop_loner();
862 // choose new lock state during recovery, based on issued caps
863 void choose_lock_state(SimpleLock
*lock
, int allissued
);
864 void choose_lock_states(int dirty_caps
);
866 int count_nonstale_caps();
867 bool multiple_nonstale_caps();
869 bool is_any_caps() { return !client_caps
.empty(); }
870 bool is_any_nonstale_caps() { return count_nonstale_caps(); }
872 const mempool::mds_co::compact_map
<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted
; }
873 void set_mds_caps_wanted(mempool::mds_co::compact_map
<int32_t,int32_t>& m
);
874 void set_mds_caps_wanted(mds_rank_t mds
, int32_t wanted
);
876 const mempool_cap_map
& get_client_caps() const { return client_caps
; }
877 Capability
*get_client_cap(client_t client
) {
878 auto client_caps_entry
= client_caps
.find(client
);
879 if (client_caps_entry
!= client_caps
.end())
880 return &client_caps_entry
->second
;
883 int get_client_cap_pending(client_t client
) const {
884 auto client_caps_entry
= client_caps
.find(client
);
885 if (client_caps_entry
!= client_caps
.end()) {
886 return client_caps_entry
->second
.pending();
892 int get_num_caps_notable() const { return num_caps_notable
; }
893 void adjust_num_caps_notable(int d
);
895 Capability
*add_client_cap(client_t client
, Session
*session
,
896 SnapRealm
*conrealm
=nullptr, bool new_inode
=false);
897 void remove_client_cap(client_t client
);
898 void move_to_realm(SnapRealm
*realm
);
900 Capability
*reconnect_cap(client_t client
, const cap_reconnect_t
& icr
, Session
*session
);
901 void clear_client_caps_after_export();
902 void export_client_caps(std::map
<client_t
,Capability::Export
>& cl
);
905 int get_caps_liked() const;
906 int get_caps_allowed_ever() const;
907 int get_caps_allowed_by_type(int type
) const;
908 int get_caps_careful() const;
909 int get_xlocker_mask(client_t client
) const;
910 int get_caps_allowed_for_client(Session
*s
, Capability
*cap
,
911 const mempool_inode
*file_i
) const;
913 // caps issued, wanted
914 int get_caps_issued(int *ploner
= 0, int *pother
= 0, int *pxlocker
= 0,
915 int shift
= 0, int mask
= -1);
916 bool is_any_caps_wanted() const;
917 int get_caps_wanted(int *ploner
= 0, int *pother
= 0, int shift
= 0, int mask
= -1) const;
918 bool issued_caps_need_gather(SimpleLock
*lock
);
921 bool is_clientwriteable() const { return state
& STATE_CLIENTWRITEABLE
; }
922 void mark_clientwriteable();
923 void clear_clientwriteable();
// Overrides of the base-class authority / auth-pin interface; all defined out
// of line.
926 mds_authority_t
authority() const override
;
// `err_ret` is optional (defaults to nullptr).
929 bool can_auth_pin(int *err_ret
=nullptr) const override
;
// `by` is an opaque pointer supplied by the caller; presumably it identifies
// who holds the pin -- confirm against the definition.
930 void auth_pin(void *by
) override
;
931 void auth_unpin(void *by
) override
;
// Freeze-state predicates. The three inline ones each test a single state
// flag through state_test(); the remaining three are defined out of line.
934 bool is_freezing_inode() const { return state_test(STATE_FREEZING
); }
935 bool is_frozen_inode() const { return state_test(STATE_FROZEN
); }
936 bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN
); }
937 bool is_frozen() const override
;
938 bool is_frozen_dir() const;
939 bool is_freezing() const override
;
941 /* Freeze the inode. auth_pin_allowance lets the caller account for any
942 * auth_pins it is itself holding/responsible for. */
// auth_pin_allowance defaults to 0. The meaning of the bool result is set by
// the out-of-line definition -- TODO confirm.
943 bool freeze_inode(int auth_pin_allowance
=0);
// Two overloads: one receives a caller-supplied MDSContext::vec by reference,
// the other takes no arguments. Both are defined out of line.
944 void unfreeze_inode(MDSContext::vec
& finished
);
945 void unfreeze_inode();
// Defined out of line.
947 void freeze_auth_pin();
948 void unfreeze_auth_pin();
950 // -- reference counting --
// Override of bad_put(): writes a " bad put " message through generic_dout(0)
// containing this inode, the pin id `by`, pin_name(by), the current `ref`
// value and ref_map, then asserts that ref_map[by] and ref are both > 0.
// NOTE(review): parts of this body (statement terminator, closing brace) are
// not visible in this excerpt.
951 void bad_put(int by
) override
{
952 generic_dout(0) << " bad put " << *this << " by " << by
<< " " << pin_name(by
) << " was " << ref
954 << " (" << ref_map
<< ")"
958 ceph_assert(ref_map
[by
] > 0);
960 ceph_assert(ref
> 0);
// Override of bad_get(): writes a " bad get " message through generic_dout(0)
// containing this inode, the pin id `by`, pin_name(by), the current `ref`
// value and ref_map, then asserts that ref_map[by] is >= 0.
// NOTE(review): parts of this body (statement terminator, closing brace) are
// not visible in this excerpt.
962 void bad_get(int by
) override
{
963 generic_dout(0) << " bad get " << *this << " by " << by
<< " " << pin_name(by
) << " was " << ref
965 << " (" << ref_map
<< ")"
969 ceph_assert(ref_map
[by
] >= 0);
// Reference-count hooks overridden from the base class; defined out of line.
972 void first_get() override
;
973 void last_put() override
;
974 void _put() override
;
976 // -- hierarchy stuff --
// Asserts that no primary parent is currently set (parent == 0) unless the
// "mds_hack_allow_loading_invalid_metadata" config option is true.
// NOTE(review): the rest of the body is not visible in this excerpt.
977 void set_primary_parent(CDentry
*p
) {
978 ceph_assert(parent
== 0 ||
979 g_conf().get_val
<bool>("mds_hack_allow_loading_invalid_metadata"));
// Asserts that `dn` is the current primary parent.
// NOTE(review): the rest of the body is not visible in this excerpt.
982 void remove_primary_parent(CDentry
*dn
) {
983 ceph_assert(dn
== parent
);
// Defined out of line; presumably maintain the remote_parents set -- confirm.
986 void add_remote_parent(CDentry
*p
);
987 void remove_remote_parent(CDentry
*p
);
// Number of entries in remote_parents; size() is narrowed to int on return.
988 int num_remote_parents() {
989 return remote_parents
.size();
// Appends `dn` to the back of the projected_parent list.
992 void push_projected_parent(CDentry
*dn
) {
993 projected_parent
.push_back(dn
);
// Asserts the list is non-empty, makes its front entry the current `parent`,
// then removes that entry from the list.
995 void pop_projected_parent() {
996 ceph_assert(projected_parent
.size());
997 parent
= projected_parent
.front();
998 projected_parent
.pop_front();
// True while projected_parent holds at least one entry.
1000 bool is_parent_projected() const {
1001 return !projected_parent
.empty();
// Export-pin API; every function here is defined out of line.
// `inherit` defaults to true.
1004 mds_rank_t
get_export_pin(bool inherit
=true) const;
1005 void check_pin_policy(mds_rank_t target
);
1006 void set_export_pin(mds_rank_t rank
);
1007 void queue_export_pin(mds_rank_t target
);
// `update` defaults to false.
1008 void maybe_export_pin(bool update
=false);
// Ephemeral-pin API. Functions without a body here are defined out of line.
// The two bool arguments presumably select the distributed and/or random
// flavour -- confirm against the definitions.
1010 void set_ephemeral_pin(bool dist
, bool rand
);
1011 void clear_ephemeral_pin(bool dist
, bool rand
);
// `val` defaults to false.
1013 void setxattr_ephemeral_dist(bool val
=false);
// Tests the STATE_DISTEPHEMERALPIN state flag.
1014 bool is_ephemeral_dist() const {
1015 return state_test(STATE_DISTEPHEMERALPIN
);
1018 double get_ephemeral_rand() const;
// `threshold` defaults to -1.0; `prob` (below) defaults to 0.0.
1019 void maybe_ephemeral_rand(double threshold
=-1.0);
1020 void setxattr_ephemeral_rand(double prob
=0.0);
// Tests the STATE_RANDEPHEMERALPIN state flag.
1021 bool is_ephemeral_rand() const {
1022 return state_test(STATE_RANDEPHEMERALPIN
);
// True when the inode returned by get_inode() has a random-pin value above
// 0.0 or has its distributed-pin field set.
1025 bool has_ephemeral_policy() const {
1026 return get_inode()->export_ephemeral_random_pin
> 0.0 ||
1027 get_inode()->export_ephemeral_distributed_pin
;
// True when either of the two ephemeral-pin state flags is set.
1029 bool is_ephemerally_pinned() const {
1030 return state_test(STATE_DISTEPHEMERALPIN
) ||
1031 state_test(STATE_RANDEPHEMERALPIN
);
// Output helpers, defined out of line. print() overrides a base-class
// virtual and takes a std::ostream.
1034 void print(std::ostream
& out
) const override
;
// dump() takes a ceph::Formatter; `flags` defaults to DUMP_DEFAULT.
1035 void dump(ceph::Formatter
*f
, int flags
= DUMP_DEFAULT
) const;
1038 * Validate that the on-disk state of an inode matches what
1039 * we expect from our memory state. Currently this checks that:
1040 * 1) The backtrace associated with the file data exists and is correct
1041 * 2) For directories, the actual inode metadata matches our memory state,
1042 * 3) For directories, the rstats match
1044 * @param results A freshly-created validated_data struct, with values set
1045 * as described in the struct documentation.
1046 * @param mdr The request to be responded to upon the completion of the
1047 * validation (or NULL)
1048 * @param fin Context to call back on completion (or NULL)
// Takes a validated_data pointer as its first argument.
// NOTE(review): the remaining parameters of this declaration are not visible
// in this excerpt.
1050 void validate_disk_state(validated_data
*results
,
// Static helper taking the validation results by const reference and a
// ceph::Formatter; presumably writes the results to `f` -- confirm against
// the out-of-line definition.
1052 static void dump_validation_results(const validated_data
& results
,
1053 ceph::Formatter
*f
);
1055 //bool hack_accessed = false;
1056 //utime_t hack_load_stamp;
// Snapshot-related members. Both realm pointers start out as nullptr.
1060 SnapRealm
*snaprealm
= nullptr;
1061 SnapRealm
*containing_realm
= nullptr;
// Pair of snapshot ids; presumably the range this inode covers -- confirm.
1062 snapid_t first
, last
;
1063 mempool::mds_co::compact_set
<snapid_t
> dirty_old_rstats
;
1065 uint64_t last_journaled
= 0; // log offset for the last time i was journaled
1066 //loff_t last_open_journaled; // log offset for the last journaled EOpen
1067 utime_t last_dirstat_prop
;
1069 // list item node for when we have unpropagated rstat data
1070 elist
<CInode
*>::item dirty_rstat_item
;
// Set of client ids.
1072 mempool::mds_co::set
<client_t
> client_snap_caps
;
// Map from snapshot id to a set of client ids.
1073 mempool::mds_co::compact_map
<snapid_t
, mempool::mds_co::set
<client_t
> > client_need_snapflush
;
1075 // LogSegment lists i (may) belong to
1076 elist
<CInode
*>::item item_dirty
;
1077 elist
<CInode
*>::item item_caps
;
1078 elist
<CInode
*>::item item_open_file
;
1079 elist
<CInode
*>::item item_dirty_parent
;
1080 elist
<CInode
*>::item item_dirty_dirfrag_dir
;
1081 elist
<CInode
*>::item item_dirty_dirfrag_nest
;
1082 elist
<CInode
*>::item item_dirty_dirfrag_dirfragtree
;
// The two members below are references, not new list nodes: they alias
// item_dirty_dirfrag_dir and item_dirty_dirfrag_nest respectively, so each
// pair shares a single elist item.
1084 // also update RecoveryQueue::RecoveryQueue() if you change this
1085 elist
<CInode
*>::item
& item_recover_queue
= item_dirty_dirfrag_dir
;
1086 elist
<CInode
*>::item
& item_recover_queue_front
= item_dirty_dirfrag_nest
;
// Load vector for this inode plus the list node that goes with it
// (presumably popularity tracking -- confirm).
1088 inode_load_vec_t pop
;
1089 elist
<CInode
*>::item item_pop_lru
;
// One static LockType per lock; the names pair up with the lock members
// declared further down (versionlock_type <-> versionlock, and so on).
1092 static LockType versionlock_type
;
1093 static LockType authlock_type
;
1094 static LockType linklock_type
;
1095 static LockType dirfragtreelock_type
;
1096 static LockType filelock_type
;
1097 static LockType xattrlock_type
;
1098 static LockType snaplock_type
;
1099 static LockType nestlock_type
;
1100 static LockType flocklock_type
;
1101 static LockType policylock_type
;
// Per-inode lock objects. Three concrete lock classes are used: LocalLockC
// (versionlock), ScatterLock (dirfragtreelock, filelock, nestlock) and
// SimpleLock (all the others).
1103 // FIXME not part of mempool
1104 LocalLockC versionlock
;
1105 SimpleLock authlock
;
1106 SimpleLock linklock
;
1107 ScatterLock dirfragtreelock
;
1108 ScatterLock filelock
;
1109 SimpleLock xattrlock
;
1110 SimpleLock snaplock
;
1111 ScatterLock nestlock
;
1112 SimpleLock flocklock
;
1113 SimpleLock policylock
;
// Both client ids are initialised to -1; presumably meaning "no client" --
// confirm against the code that assigns them.
1117 client_t loner_cap
= -1, want_loner_cap
= -1;
// Accessor for the fcntl lock state. The visible statement allocates
// fcntl_locks with new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL).
// NOTE(review): any guard around the allocation and the return statement are
// not visible in this excerpt.
1120 ceph_lock_state_t
*get_fcntl_lock_state() {
1122 fcntl_locks
= new ceph_lock_state_t(g_ceph_context
, CEPH_LOCK_FCNTL
);
// NOTE(review): body not visible in this excerpt.
1125 void clear_fcntl_lock_state() {
// Same shape as the fcntl accessor, but for flock_locks / CEPH_LOCK_FLOCK.
// NOTE(review): any guard and the return statement are not visible here.
1129 ceph_lock_state_t
*get_flock_lock_state() {
1131 flock_locks
= new ceph_lock_state_t(g_ceph_context
, CEPH_LOCK_FLOCK
);
// NOTE(review): body not visible in this excerpt.
1134 void clear_flock_lock_state() {
// Clears both kinds of lock state by calling the two clear_* helpers.
1138 void clear_file_locks() {
1139 clear_fcntl_lock_state();
1140 clear_flock_lock_state();
// Encodes the file-lock state into `bl`. For fcntl_locks and then for
// flock_locks it writes a bool saying whether that lock state exists and is
// non-empty, followed by the lock state itself only when the bool is true.
1142 void _encode_file_locks(ceph::buffer::list
& bl
) const {
// Null pointer and empty lock state are both encoded as "absent".
1144 bool has_fcntl_locks
= fcntl_locks
&& !fcntl_locks
->empty();
1145 encode(has_fcntl_locks
, bl
);
1146 if (has_fcntl_locks
)
1147 encode(*fcntl_locks
, bl
);
1148 bool has_flock_locks
= flock_locks
&& !flock_locks
->empty();
1149 encode(has_flock_locks
, bl
);
1150 if (has_flock_locks
)
1151 encode(*flock_locks
, bl
);
// Reads back the layout written by _encode_file_locks from iterator `p`: a
// presence bool for the fcntl locks and, when set, the lock state decoded
// into *get_fcntl_lock_state(); then the same pair for the flock locks.
// NOTE(review): the clear_*_lock_state() calls presumably belong to `else`
// branches whose `else` lines are not visible in this excerpt -- confirm.
1153 void _decode_file_locks(ceph::buffer::list::const_iterator
& p
) {
1155 bool has_fcntl_locks
;
1156 decode(has_fcntl_locks
, p
);
1157 if (has_fcntl_locks
)
1158 decode(*get_fcntl_lock_state(), p
);
1160 clear_fcntl_lock_state();
1161 bool has_flock_locks
;
1162 decode(has_flock_locks
, p
);
1163 if (has_flock_locks
)
1164 decode(*get_flock_lock_state(), p
);
1166 clear_flock_lock_state();
1170 * Return the pool ID where we currently write backtraces for
1171 * this inode (in addition to inode.old_pools)
1173 * @returns a pool ID >=0
// Const query returning a 64-bit pool id; defined out of line.
1175 int64_t get_backtrace_pool() const;
1177 // parent dentries in cache
1178 CDentry
*parent
= nullptr; // primary link
1179 mempool::mds_co::compact_set
<CDentry
*> remote_parents
; // if hard linked
1181 mempool::mds_co::list
<CDentry
*> projected_parent
; // for in-progress rename, (un)link, etc.
// Starts out as CDIR_AUTH_DEFAULT.
1183 mds_authority_t inode_auth
= CDIR_AUTH_DEFAULT
;
1185 // -- distributed state --
1186 // file capabilities
1187 mempool_cap_map client_caps
; // client -> caps
1188 mempool::mds_co::compact_map
<int32_t, int32_t> mds_caps_wanted
; // [auth] mds -> caps wanted
1189 int replica_caps_wanted
= 0; // [replica] what i've requested from auth
1190 int num_caps_notable
= 0;
// Raw pointers, nullptr until allocated with `new` by get_fcntl_lock_state()
// / get_flock_lock_state().
1192 ceph_lock_state_t
*fcntl_locks
= nullptr;
1193 ceph_lock_state_t
*flock_locks
= nullptr;
// Map from dirfrag to a vector of contexts; presumably waiters to run once
// that fragment is available -- confirm.
1196 mempool::mds_co::compact_map
<frag_t
, MDSContext::vec
> waiting_on_dir
;
1199 // -- freezing inode --
1200 int auth_pin_freeze_allowance
= 0;
1201 elist
<CInode
*>::item item_freezing_inode
;
// Defined out of line.
1202 void maybe_finish_freeze_inode();
// Grants ValidationContinuation access to this class's non-public members.
1205 friend class ValidationContinuation
;
1208 * Create a scrub_info_t struct for the scrub_infop pointer.
// Declared const; the definition is out of line.
1210 void scrub_info_create() const;
1212 * Delete the scrub_info_t struct if it does not hold any useful data
// Defined out of line.
1214 void scrub_maybe_delete_info();
// Takes the next snaprealm node and an `early` flag; defined out of line.
1216 void pop_projected_snaprealm(sr_t
*next_snaprealm
, bool early
);
// Internal counterpart of validate_disk_state(); receives the continuation
// object plus an int result code and an int stage. Defined out of line.
1218 bool _validate_disk_state(class ValidationContinuation
*c
,
1219 int rval
, int stage
);
// Element type of the projected_nodes list: holds const pointers to a
// projected inode and a projected xattr map. Default construction is deleted
// and move construction is defaulted.
// NOTE(review): the constructor also initialises a `snapnode` member whose
// declaration, like the end of this struct, is not visible in this excerpt.
1221 struct projected_const_node
{
1222 inode_const_ptr inode
;
1223 xattr_map_const_ptr xattrs
;
1226 projected_const_node() = delete;
1227 projected_const_node(projected_const_node
&&) = default;
// Copies the two smart pointers and stores the raw sr_t pointer as given.
1228 explicit projected_const_node(const inode_const_ptr
& i
, const xattr_map_const_ptr
& x
, sr_t
*s
) :
1229 inode(i
), xattrs(x
), snapnode(s
) {}
1232 mempool::mds_co::list
<projected_const_node
> projected_nodes
; // projected values (only defined while dirty)
// Starts at 0; presumably counts the projected_nodes entries that carry a
// snaprealm node -- confirm against the code that updates it.
1233 size_t num_projected_srnodes
= 0;
1235 // -- cache infrastructure --
1236 mempool::mds_co::compact_map
<frag_t
,CDir
*> dirfrags
; // cached dir fragments under this Inode
1238 //for the purpose of quickly determining whether there's a subtree root or exporting dir
1239 int num_subtree_roots
= 0;
1240 int num_exporting_dirs
= 0;
1242 int stickydir_ref
= 0;
// Owned through std::unique_ptr, so the scrub_info_t is released
// automatically when the pointer is reset or destroyed.
1243 std::unique_ptr
<scrub_info_t
> scrub_infop
;
1244 /** @} Scrubbing and fsck */
// Stream-insertion operator for CInode; declared here, defined elsewhere.
1247 std::ostream
& operator<<(std::ostream
& out
, const CInode
& in
);
// Table of cinode_lock_info_t entries and its element count; both are
// defined in another translation unit.
1249 extern cinode_lock_info_t cinode_lock_info
[];
1250 extern int num_cinode_locks
;