1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
21 #include <string_view>
23 #include "common/config.h"
24 #include "common/RefCountedObj.h"
25 #include "include/compat.h"
26 #include "include/counter.h"
27 #include "include/elist.h"
28 #include "include/types.h"
29 #include "include/lru.h"
30 #include "include/compact_set.h"
32 #include "MDSCacheObject.h"
33 #include "MDSContext.h"
38 #include "SimpleLock.h"
39 #include "ScatterLock.h"
40 #include "LocalLockC.h"
41 #include "Capability.h"
42 #include "SnapRealm.h"
45 #include "messages/MClientCaps.h"
47 #define dout_context g_ceph_context
56 struct ObjectOperation
;
59 struct cinode_lock_info_t
{
64 struct CInodeCommitOperation
{
66 CInodeCommitOperation(int prio
, int64_t po
)
67 : pool(po
), priority(prio
) {
69 CInodeCommitOperation(int prio
, int64_t po
, file_layout_t l
, uint64_t f
)
70 : pool(po
), priority(prio
), _layout(l
), _features(f
) {
74 void update(ObjectOperation
&op
, inode_backtrace_t
&bt
);
  /// @return the pool id this commit operation targets (the `pool` member).
  int64_t get_pool() { return pool; }
78 int64_t pool
; ///< pool id
80 bool update_layout
= false;
81 file_layout_t _layout
;
85 struct CInodeCommitOperations
{
86 std::vector
<CInodeCommitOperation
> ops_vec
;
93 * Base class for CInode, containing the backing store data and
94 * serialization methods. This exists so that we can read and
95 * handle CInodes from the backing store without hitting all
96 * the business logic in CInode proper.
98 class InodeStoreBase
{
100 using mempool_inode
= inode_t
<mempool::mds_co::pool_allocator
>;
101 using inode_ptr
= std::shared_ptr
<mempool_inode
>;
102 using inode_const_ptr
= std::shared_ptr
<const mempool_inode
>;
104 template <typename
...Args
>
105 static inode_ptr
allocate_inode(Args
&& ...args
) {
106 static mempool::mds_co::pool_allocator
<mempool_inode
> allocator
;
107 return std::allocate_shared
<mempool_inode
>(allocator
, std::forward
<Args
>(args
)...);
110 using mempool_xattr_map
= xattr_map
<mempool::mds_co::pool_allocator
>; // FIXME bufferptr not in mempool
111 using xattr_map_ptr
= std::shared_ptr
<mempool_xattr_map
>;
112 using xattr_map_const_ptr
= std::shared_ptr
<const mempool_xattr_map
>;
114 template <typename
...Args
>
115 static xattr_map_ptr
allocate_xattr_map(Args
&& ...args
) {
116 static mempool::mds_co::pool_allocator
<mempool_xattr_map
> allocator
;
117 return std::allocate_shared
<mempool_xattr_map
>(allocator
, std::forward
<Args
>(args
)...);
120 using mempool_old_inode
= old_inode_t
<mempool::mds_co::pool_allocator
>;
121 using mempool_old_inode_map
= mempool::mds_co::map
<snapid_t
, mempool_old_inode
>;
122 using old_inode_map_ptr
= std::shared_ptr
<mempool_old_inode_map
>;
123 using old_inode_map_const_ptr
= std::shared_ptr
<const mempool_old_inode_map
>;
125 template <typename
...Args
>
126 static old_inode_map_ptr
allocate_old_inode_map(Args
&& ...args
) {
127 static mempool::mds_co::pool_allocator
<mempool_old_inode_map
> allocator
;
128 return std::allocate_shared
<mempool_old_inode_map
>(allocator
, std::forward
<Args
>(args
)...);
131 void reset_inode(inode_const_ptr
&& ptr
) {
132 inode
= std::move(ptr
);
135 void reset_xattrs(xattr_map_const_ptr
&& ptr
) {
136 xattrs
= std::move(ptr
);
139 void reset_old_inodes(old_inode_map_const_ptr
&& ptr
) {
140 old_inodes
= std::move(ptr
);
143 void encode_xattrs(bufferlist
&bl
) const;
144 void decode_xattrs(bufferlist::const_iterator
&p
);
145 void encode_old_inodes(bufferlist
&bl
, uint64_t features
) const;
146 void decode_old_inodes(bufferlist::const_iterator
&p
);
149 static object_t
get_object_name(inodeno_t ino
, frag_t fg
, std::string_view suffix
);
151 /* Full serialization for use in ".inode" root inode objects */
152 void encode(ceph::buffer::list
&bl
, uint64_t features
, const ceph::buffer::list
*snap_blob
=NULL
) const;
153 void decode(ceph::buffer::list::const_iterator
&bl
, ceph::buffer::list
& snap_blob
);
155 /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
156 void encode_bare(ceph::buffer::list
&bl
, uint64_t features
, const ceph::buffer::list
*snap_blob
=NULL
) const;
157 void decode_bare(ceph::buffer::list::const_iterator
&bl
, ceph::buffer::list
&snap_blob
, __u8 struct_v
=5);
159 /* For test/debug output */
160 void dump(ceph::Formatter
*f
) const;
162 void decode_json(JSONObj
*obj
);
163 static void xattrs_cb(InodeStoreBase::mempool_xattr_map
& c
, JSONObj
*obj
);
164 static void old_indoes_cb(InodeStoreBase::mempool_old_inode_map
& c
, JSONObj
*obj
);
166 /* For use by offline tools */
167 __u32
hash_dentry_name(std::string_view dn
);
168 frag_t
pick_dirfrag(std::string_view dn
);
170 mempool::mds_co::string symlink
; // symlink dest, if symlink
171 fragtree_t dirfragtree
; // dir frag tree, if any. always consistent with our dirfrag map.
172 snapid_t oldest_snap
= CEPH_NOSNAP
;
173 damage_flags_t damage_flags
= 0;
176 static inode_const_ptr empty_inode
;
178 // Following members are pointers to constant data, the constant data can
179 // be shared by CInode and log events. To update these members in CInode,
180 // read-copy-update should be used.
181 inode_const_ptr inode
= empty_inode
;
182 xattr_map_const_ptr xattrs
;
183 old_inode_map_const_ptr old_inodes
; // key = last, value.first = first
186 inline void decode_noshare(InodeStoreBase::mempool_xattr_map
& xattrs
,
187 ceph::buffer::list::const_iterator
&p
)
189 decode_noshare
<mempool::mds_co::pool_allocator
>(xattrs
, p
);
192 class InodeStore
: public InodeStoreBase
{
194 mempool_inode
* get_inode() {
195 if (inode
== empty_inode
)
196 reset_inode(allocate_inode());
197 return const_cast<mempool_inode
*>(inode
.get());
  /// Mutable access to the xattr map.
  /// NOTE(review): casts away const on data that the InodeStoreBase comment
  /// says may be shared with log events (read-copy-update) -- presumably the
  /// caller must hold the only reference before mutating; confirm.
  mempool_xattr_map *get_xattrs() { return const_cast<mempool_xattr_map*>(xattrs.get()); }
201 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
202 InodeStoreBase::encode(bl
, features
, &snap_blob
);
204 void decode(ceph::buffer::list::const_iterator
&bl
) {
205 InodeStoreBase::decode(bl
, snap_blob
);
207 void encode_bare(ceph::buffer::list
&bl
, uint64_t features
) const {
208 InodeStoreBase::encode_bare(bl
, features
, &snap_blob
);
210 void decode_bare(ceph::buffer::list::const_iterator
&bl
) {
211 InodeStoreBase::decode_bare(bl
, snap_blob
);
214 static void generate_test_instances(std::list
<InodeStore
*>& ls
);
216 using InodeStoreBase::inode
;
217 using InodeStoreBase::xattrs
;
218 using InodeStoreBase::old_inodes
;
220 // FIXME bufferlist not part of mempool
221 ceph::buffer::list snap_blob
; // Encoded copy of SnapRealm, because we can't
222 // rehydrate it without full MDCache
224 WRITE_CLASS_ENCODER_FEATURES(InodeStore
)
226 // just for ceph-dencoder
227 class InodeStoreBare
: public InodeStore
{
229 void encode(ceph::buffer::list
&bl
, uint64_t features
) const {
230 InodeStore::encode_bare(bl
, features
);
232 void decode(ceph::buffer::list::const_iterator
&bl
) {
233 InodeStore::decode_bare(bl
);
235 static void generate_test_instances(std::list
<InodeStoreBare
*>& ls
);
237 WRITE_CLASS_ENCODER_FEATURES(InodeStoreBare
)
239 // cached inode wrapper
240 class CInode
: public MDSCacheObject
, public InodeStoreBase
, public Counter
<CInode
> {
242 MEMPOOL_CLASS_HELPERS();
244 using mempool_cap_map
= mempool::mds_co::map
<client_t
, Capability
>;
246 * @defgroup Scrubbing and fsck
250 * Report the results of validation against a particular inode.
251 * Each member is a pair of bools.
252 * <member>.first represents if validation was performed against the member.
253 * <member.second represents if the member passed validation.
254 * performed_validation is set to true if the validation was actually
255 * run. It might not be run if, for instance, the inode is marked as dirty.
256 * passed_validation is set to true if everything that was checked
257 * passed its validation.
259 struct validated_data
{
260 template<typename T
>struct member_status
{
261 bool checked
= false;
263 bool repaired
= false;
264 int ondisk_read_retval
= 0;
267 std::stringstream error_str
;
277 void dump(ceph::Formatter
*f
) const;
279 bool all_damage_repaired() const;
281 bool performed_validation
= false;
282 bool passed_validation
= false;
284 member_status
<inode_backtrace_t
> backtrace
;
285 member_status
<mempool_inode
> inode
; // XXX should not be in mempool; wait for pmr
286 member_status
<raw_stats_t
> raw_stats
;
292 friend class Migrator
;
293 friend class MDCache
;
294 friend class StrayManager
;
296 friend std::ostream
& operator<<(std::ostream
&, const CInode
&);
302 version_t last_scrub_version
= 0;
303 utime_t last_scrub_stamp
;
305 bool last_scrub_dirty
= false; /// are our stamps dirty with respect to disk state?
306 bool scrub_in_progress
= false; /// are we currently scrubbing?
308 fragset_t queued_frags
;
310 ScrubHeaderRef header
;
314 static const int PIN_DIRFRAG
= -1;
315 static const int PIN_CAPS
= 2; // client caps
316 static const int PIN_IMPORTING
= -4; // importing
317 static const int PIN_OPENINGDIR
= 7;
318 static const int PIN_REMOTEPARENT
= 8;
319 static const int PIN_BATCHOPENJOURNAL
= 9;
320 static const int PIN_SCATTERED
= 10;
321 static const int PIN_STICKYDIRS
= 11;
322 //static const int PIN_PURGING = -12;
323 static const int PIN_FREEZING
= 13;
324 static const int PIN_FROZEN
= 14;
325 static const int PIN_IMPORTINGCAPS
= -15;
326 static const int PIN_PASTSNAPPARENT
= -16;
327 static const int PIN_OPENINGSNAPPARENTS
= 17;
328 static const int PIN_TRUNCATING
= 18;
329 static const int PIN_STRAY
= 19; // we pin our stray inode while active
330 static const int PIN_NEEDSNAPFLUSH
= 20;
331 static const int PIN_DIRTYRSTAT
= 21;
332 static const int PIN_EXPORTINGCAPS
= 22;
333 static const int PIN_DIRTYPARENT
= 23;
334 static const int PIN_DIRWAITER
= 24;
337 static const int DUMP_INODE_STORE_BASE
= (1 << 0);
338 static const int DUMP_MDS_CACHE_OBJECT
= (1 << 1);
339 static const int DUMP_LOCKS
= (1 << 2);
340 static const int DUMP_STATE
= (1 << 3);
341 static const int DUMP_CAPS
= (1 << 4);
342 static const int DUMP_PATH
= (1 << 5);
343 static const int DUMP_DIRFRAGS
= (1 << 6);
344 static const int DUMP_ALL
= (-1);
345 static const int DUMP_DEFAULT
= DUMP_ALL
& (~DUMP_PATH
) & (~DUMP_DIRFRAGS
);
348 static const int STATE_EXPORTING
= (1<<0); // on nonauth bystander.
349 static const int STATE_OPENINGDIR
= (1<<1);
350 static const int STATE_FREEZING
= (1<<2);
351 static const int STATE_FROZEN
= (1<<3);
352 static const int STATE_AMBIGUOUSAUTH
= (1<<4);
353 static const int STATE_EXPORTINGCAPS
= (1<<5);
354 static const int STATE_NEEDSRECOVER
= (1<<6);
355 static const int STATE_RECOVERING
= (1<<7);
356 static const int STATE_PURGING
= (1<<8);
357 static const int STATE_DIRTYPARENT
= (1<<9);
358 static const int STATE_DIRTYRSTAT
= (1<<10);
359 static const int STATE_STRAYPINNED
= (1<<11);
360 static const int STATE_FROZENAUTHPIN
= (1<<12);
361 static const int STATE_DIRTYPOOL
= (1<<13);
362 static const int STATE_REPAIRSTATS
= (1<<14);
363 static const int STATE_MISSINGOBJS
= (1<<15);
364 static const int STATE_EVALSTALECAPS
= (1<<16);
365 static const int STATE_QUEUEDEXPORTPIN
= (1<<17);
366 static const int STATE_TRACKEDBYOFT
= (1<<18); // tracked by open file table
367 static const int STATE_DELAYEDEXPORTPIN
= (1<<19);
368 static const int STATE_DISTEPHEMERALPIN
= (1<<20);
369 static const int STATE_RANDEPHEMERALPIN
= (1<<21);
370 static const int STATE_CLIENTWRITEABLE
= (1<<22);
372 // orphan inode needs notification of releasing reference
373 static const int STATE_ORPHAN
= STATE_NOTIFYREF
;
375 static const int MASK_STATE_EXPORTED
=
376 (STATE_DIRTY
|STATE_NEEDSRECOVER
|STATE_DIRTYPARENT
|STATE_DIRTYPOOL
|
377 STATE_DISTEPHEMERALPIN
|STATE_RANDEPHEMERALPIN
);
378 static const int MASK_STATE_EXPORT_KEPT
=
379 (STATE_FROZEN
|STATE_AMBIGUOUSAUTH
|STATE_EXPORTINGCAPS
|
380 STATE_QUEUEDEXPORTPIN
|STATE_TRACKEDBYOFT
|STATE_DELAYEDEXPORTPIN
|
381 STATE_DISTEPHEMERALPIN
|STATE_RANDEPHEMERALPIN
);
383 /* These are for "permanent" state markers that are passed around between
384 * MDS. Nothing protects/updates it like a typical MDS lock.
386 * Currently, we just use this for REPLICATED inodes. The reason we need to
387 * replicate the random epin state is because the directory inode is still
388 * under the authority of the parent subtree. So it's not exported normally
389 * and we can't pass around the state that way. The importer of the dirfrags
390 * still needs to know that the inode is random pinned though otherwise it
391 * doesn't know that the dirfrags are pinned.
393 static const int MASK_STATE_REPLICATED
= STATE_RANDEPHEMERALPIN
;
396 static const uint64_t WAIT_DIR
= (1<<0);
397 static const uint64_t WAIT_FROZEN
= (1<<1);
398 static const uint64_t WAIT_TRUNC
= (1<<2);
399 static const uint64_t WAIT_FLOCK
= (1<<3);
401 static const uint64_t WAIT_ANY_MASK
= (uint64_t)(-1);
404 static const unsigned EXPORT_NONCE
= 1; // nonce given to replicas created by export
406 // ---------------------------
408 CInode(MDCache
*c
, bool auth
=true, snapid_t f
=2, snapid_t l
=CEPH_NOSNAP
);
413 ceph_assert(num_projected_srnodes
== 0);
414 ceph_assert(num_caps_notable
== 0);
415 ceph_assert(num_subtree_roots
== 0);
416 ceph_assert(num_exporting_dirs
== 0);
417 ceph_assert(batch_ops
.empty());
420 std::map
<int, std::unique_ptr
<BatchOp
>> batch_ops
;
422 std::string_view
pin_name(int p
) const override
;
424 std::ostream
& print_db_line_prefix(std::ostream
& out
) override
;
426 const scrub_info_t
*scrub_info() const {
429 return scrub_infop
.get();
432 const ScrubHeaderRef
& get_scrub_header() {
433 static const ScrubHeaderRef nullref
;
434 return scrub_infop
? scrub_infop
->header
: nullref
;
437 bool scrub_is_in_progress() const {
438 return (scrub_infop
&& scrub_infop
->scrub_in_progress
);
441 * Start scrubbing on this inode. That could be very short if it's
442 * a file, or take a long time if we're recursively scrubbing a directory.
443 * @pre It is not currently scrubbing
444 * @post it has set up internal scrubbing state
445 * @param scrub_version What version are we scrubbing at (usually, parent
446 * directory's get_projected_version())
448 void scrub_initialize(ScrubHeaderRef
& header
);
450 * Call this once the scrub has been completed, whether it's a full
451 * recursive scrub on a directory or simply the data on a file (or
452 * anything in between).
453 * @param c An out param which is filled in with a Context* that must
456 void scrub_finished();
458 void scrub_aborted();
460 fragset_t
& scrub_queued_frags() {
461 ceph_assert(scrub_infop
);
462 return scrub_infop
->queued_frags
;
465 bool is_multiversion() const {
466 return snaprealm
|| // other snaprealms will link to me
467 get_inode()->is_dir() || // links to me in other snaps
468 get_inode()->nlink
> 1 || // there are remote links, possibly snapped, that will need to find me
469 is_any_old_inodes(); // once multiversion, always multiversion. until old_inodes gets cleaned out.
471 snapid_t
get_oldest_snap();
473 bool is_dirty_rstat() {
474 return state_test(STATE_DIRTYRSTAT
);
476 void mark_dirty_rstat();
477 void clear_dirty_rstat();
479 //bool hack_accessed = false;
480 //utime_t hack_load_stamp;
483 * Projection methods, used to store inode changes until they have been journaled,
484 * at which point they are popped.
486 * project_inode as needed. If you're changing xattrs or sr_t, then pass true
487 * as needed then change the xattrs/snapnode member as needed. (Dirty
488 * exception: project_past_snaprealm_parent allows you to project the
489 * snapnode after doing project_inode (i.e. you don't need to pass
492 * Then, journal. Once journaling is done, pop_and_dirty_projected_inode.
493 * This function will take care of the inode itself, the xattrs, and the snaprealm.
496 struct projected_inode
{
497 static sr_t
* const UNDEF_SRNODE
;
499 inode_ptr
const inode
;
500 xattr_map_ptr
const xattrs
;
501 sr_t
* const snapnode
;
503 projected_inode() = delete;
504 explicit projected_inode(inode_ptr
&& i
, xattr_map_ptr
&& x
, sr_t
*s
=nullptr) :
505 inode(std::move(i
)), xattrs(std::move(x
)), snapnode(s
) {}
507 projected_inode
project_inode(const MutationRef
& mut
,
508 bool xattr
= false, bool snap
= false);
510 void pop_and_dirty_projected_inode(LogSegment
*ls
, const MutationRef
& mut
);
512 version_t
get_projected_version() const {
513 if (projected_nodes
.empty())
514 return get_inode()->version
;
516 return projected_nodes
.back().inode
->version
;
518 bool is_projected() const {
519 return !projected_nodes
.empty();
522 const inode_const_ptr
& get_projected_inode() const {
523 if (projected_nodes
.empty())
526 return projected_nodes
.back().inode
;
528 // inode should have already been projected in caller's context
529 mempool_inode
* _get_projected_inode() {
530 ceph_assert(!projected_nodes
.empty());
531 return const_cast<mempool_inode
*>(projected_nodes
.back().inode
.get());
533 const inode_const_ptr
& get_previous_projected_inode() const {
534 ceph_assert(!projected_nodes
.empty());
535 auto it
= projected_nodes
.rbegin();
537 if (it
!= projected_nodes
.rend())
543 const xattr_map_const_ptr
& get_projected_xattrs() {
544 if (projected_nodes
.empty())
547 return projected_nodes
.back().xattrs
;
549 const xattr_map_const_ptr
& get_previous_projected_xattrs() {
550 ceph_assert(!projected_nodes
.empty());
551 auto it
= projected_nodes
.rbegin();
553 if (it
!= projected_nodes
.rend())
559 sr_t
*prepare_new_srnode(snapid_t snapid
);
560 void project_snaprealm(sr_t
*new_srnode
);
561 sr_t
*project_snaprealm(snapid_t snapid
=0) {
562 sr_t
* new_srnode
= prepare_new_srnode(snapid
);
563 project_snaprealm(new_srnode
);
566 const sr_t
*get_projected_srnode() const;
568 void mark_snaprealm_global(sr_t
*new_srnode
);
569 void clear_snaprealm_global(sr_t
*new_srnode
);
570 bool is_projected_snaprealm_global() const;
572 void record_snaprealm_past_parent(sr_t
*new_snap
, SnapRealm
*newparent
);
573 void record_snaprealm_parent_dentry(sr_t
*new_snap
, SnapRealm
*newparent
,
574 CDentry
*dn
, bool primary_dn
);
575 void project_snaprealm_past_parent(SnapRealm
*newparent
);
576 void early_pop_projected_snaprealm();
578 const mempool_old_inode
& cow_old_inode(snapid_t follows
, bool cow_head
);
579 void split_old_inode(snapid_t snap
);
580 snapid_t
pick_old_inode(snapid_t last
) const;
581 void pre_cow_old_inode();
582 bool has_snap_data(snapid_t s
);
583 void purge_stale_snap_data(const std::set
<snapid_t
>& snaps
);
  /// Number of dirfrags (CDir objects) currently attached to this inode.
  size_t get_num_dirfrags() const { return dirfrags.size(); }
586 CDir
* get_dirfrag(frag_t fg
) {
587 auto pi
= dirfrags
.find(fg
);
588 if (pi
!= dirfrags
.end()) {
589 //assert(g_conf()->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME
594 std::pair
<bool, std::vector
<CDir
*>> get_dirfrags_under(frag_t fg
);
595 CDir
* get_approx_dirfrag(frag_t fg
);
597 template<typename Container
>
598 void get_dirfrags(Container
& ls
) const {
600 if constexpr (std::is_same_v
<Container
, std::vector
<CDir
*>>)
601 ls
.reserve(ls
.size() + dirfrags
.size());
602 for (const auto &p
: dirfrags
)
603 ls
.push_back(p
.second
);
606 auto get_dirfrags() const {
607 std::vector
<CDir
*> result
;
608 get_dirfrags(result
);
612 void get_nested_dirfrags(std::vector
<CDir
*>&) const;
613 std::vector
<CDir
*> get_nested_dirfrags() const {
614 std::vector
<CDir
*> v
;
615 get_nested_dirfrags(v
);
618 void get_subtree_dirfrags(std::vector
<CDir
*>&) const;
619 std::vector
<CDir
*> get_subtree_dirfrags() const {
620 std::vector
<CDir
*> v
;
621 get_subtree_dirfrags(v
);
624 int get_num_subtree_roots() const {
625 return num_subtree_roots
;
628 CDir
*get_or_open_dirfrag(MDCache
*mdcache
, frag_t fg
);
629 CDir
*add_dirfrag(CDir
*dir
);
630 void close_dirfrag(frag_t fg
);
631 void close_dirfrags();
632 bool has_subtree_root_dirfrag(int auth
=-1);
633 bool has_subtree_or_exporting_dirfrag();
635 void force_dirfrags();
636 void verify_dirfrags();
638 void get_stickydirs();
639 void put_stickydirs();
641 void add_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
);
642 void remove_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
);
643 std::pair
<bool,bool> split_need_snapflush(CInode
*cowin
, CInode
*in
);
  // -- identity accessors --
  inodeno_t ino() const { return get_inode()->ino; }
  /// Snap-qualified inode id: (ino, last snapid covered by this CInode).
  vinodeno_t vino() const { return vinodeno_t(ino(), last); }
  /// dirent d_type derived from the inode's mode bits.
  int d_type() const { return IFTODT(get_inode()->mode); }
  bool is_root() const { return ino() == CEPH_INO_ROOT; }
  /// True when ino falls in the MDS stray-directory range.
  bool is_stray() const { return MDS_INO_IS_STRAY(ino()); }
652 mds_rank_t
get_stray_owner() const {
653 return (mds_rank_t
)MDS_INO_STRAY_OWNER(ino());
655 bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(ino()); }
656 bool is_base() const { return MDS_INO_IS_BASE(ino()); }
657 bool is_system() const { return ino() < MDS_INO_SYSTEM_BASE
; }
658 bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
  // -- type / snapshot predicates (delegating to the backing inode_t) --
  bool is_file() const { return get_inode()->is_file(); }
  bool is_symlink() const { return get_inode()->is_symlink(); }
  bool is_dir() const { return get_inode()->is_dir(); }
  /// Head (live) version, as opposed to a snapshotted past version.
  bool is_head() const { return last == CEPH_NOSNAP; }
665 // note: this overloads MDSCacheObject
666 bool is_ambiguous_auth() const {
667 return state_test(STATE_AMBIGUOUSAUTH
) ||
668 MDSCacheObject::is_ambiguous_auth();
670 void set_ambiguous_auth() {
671 state_set(STATE_AMBIGUOUSAUTH
);
673 void clear_ambiguous_auth(MDSContext::vec
& finished
);
674 void clear_ambiguous_auth();
676 const inode_const_ptr
& get_inode() const {
680 // only used for updating newly allocated CInode
681 mempool_inode
* _get_inode() {
682 if (inode
== empty_inode
)
683 reset_inode(allocate_inode());
684 return const_cast<mempool_inode
*>(inode
.get());
  /// Read-only access to the (possibly shared) xattr map.
  const xattr_map_const_ptr &get_xattrs() const { return xattrs; }

  /// True if any snapshotted past versions (old_inodes) are recorded.
  bool is_any_old_inodes() const { return old_inodes && !old_inodes->empty(); }
  const old_inode_map_const_ptr &get_old_inodes() const { return old_inodes; }

  // -- parent dentry accessors --
  CDentry *get_parent_dn() { return parent; }
  const CDentry *get_parent_dn() const { return parent; }
  /// Most recently projected (not yet popped) parent dentry, falling back
  /// to the current parent when nothing is projected.
  CDentry *get_projected_parent_dn() { return !projected_parent.empty() ? projected_parent.back() : parent; }
  const CDentry *get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; }
696 const CDentry
* get_oldest_parent_dn() const {
699 return !projected_parent
.empty() ? projected_parent
.front(): NULL
;
701 CDir
*get_parent_dir();
702 const CDir
*get_projected_parent_dir() const;
703 CDir
*get_projected_parent_dir();
704 CInode
*get_parent_inode();
706 bool is_lt(const MDSCacheObject
*r
) const override
{
707 const CInode
*o
= static_cast<const CInode
*>(r
);
708 return ino() < o
->ino() ||
709 (ino() == o
->ino() && last
< o
->last
);
713 bool is_ancestor_of(const CInode
*other
) const;
714 bool is_projected_ancestor_of(const CInode
*other
) const;
716 void make_path_string(std::string
& s
, bool projected
=false, const CDentry
*use_parent
=NULL
) const;
717 void make_path(filepath
& s
, bool projected
=false) const;
718 void name_stray_dentry(std::string
& dname
);
  /// Current version of the backing inode.
  version_t get_version() const { return get_inode()->version; }
723 version_t
pre_dirty();
724 void _mark_dirty(LogSegment
*ls
);
725 void mark_dirty(LogSegment
*ls
);
728 void store(MDSContext
*fin
);
729 void _stored(int r
, version_t cv
, Context
*fin
);
731 * Flush a CInode to disk. This includes the backtrace, the parent
732 * directory's link, and the Inode object itself (if a base directory).
733 * @pre is_auth() on both the inode and its containing directory
734 * @pre can_auth_pin()
735 * @param fin The Context to call when the flush is completed.
737 void flush(MDSContext
*fin
);
738 void fetch(MDSContext
*fin
);
739 void _fetched(ceph::buffer::list
& bl
, ceph::buffer::list
& bl2
, Context
*fin
);
741 void _commit_ops(int r
, C_GatherBuilder
&gather_bld
,
742 std::vector
<CInodeCommitOperation
> &ops_vec
,
743 inode_backtrace_t
&bt
);
744 void build_backtrace(int64_t pool
, inode_backtrace_t
& bt
);
745 void _store_backtrace(std::vector
<CInodeCommitOperation
> &ops_vec
,
746 inode_backtrace_t
&bt
, int op_prio
);
747 void store_backtrace(CInodeCommitOperations
&op
, int op_prio
);
748 void store_backtrace(MDSContext
*fin
, int op_prio
=-1);
749 void _stored_backtrace(int r
, version_t v
, Context
*fin
);
750 void fetch_backtrace(Context
*fin
, ceph::buffer::list
*backtrace
);
752 void mark_dirty_parent(LogSegment
*ls
, bool dirty_pool
=false);
753 void clear_dirty_parent();
754 void verify_diri_backtrace(ceph::buffer::list
&bl
, int err
);
  // predicates for the dirty-parent / dirty-pool state bits
  // (set/cleared via mark_dirty_parent() / clear_dirty_parent() above)
  bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); }
  bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); }
758 void encode_snap_blob(ceph::buffer::list
&bl
);
759 void decode_snap_blob(const ceph::buffer::list
&bl
);
760 void encode_store(ceph::buffer::list
& bl
, uint64_t features
);
761 void decode_store(ceph::buffer::list::const_iterator
& bl
);
763 void add_dir_waiter(frag_t fg
, MDSContext
*c
);
764 void take_dir_waiting(frag_t fg
, MDSContext::vec
& ls
);
765 bool is_waiting_for_dir(frag_t fg
) {
766 return waiting_on_dir
.count(fg
);
768 void add_waiter(uint64_t tag
, MDSContext
*c
) override
;
769 void take_waiting(uint64_t tag
, MDSContext::vec
& ls
) override
;
771 // -- encode/decode helpers --
772 void _encode_base(ceph::buffer::list
& bl
, uint64_t features
);
773 void _decode_base(ceph::buffer::list::const_iterator
& p
);
774 void _encode_locks_full(ceph::buffer::list
& bl
);
775 void _decode_locks_full(ceph::buffer::list::const_iterator
& p
);
776 void _encode_locks_state_for_replica(ceph::buffer::list
& bl
, bool need_recover
);
777 void _encode_locks_state_for_rejoin(ceph::buffer::list
& bl
, int rep
);
778 void _decode_locks_state_for_replica(ceph::buffer::list::const_iterator
& p
, bool is_new
);
779 void _decode_locks_rejoin(ceph::buffer::list::const_iterator
& p
, MDSContext::vec
& waiters
,
780 std::list
<SimpleLock
*>& eval_locks
, bool survivor
);
782 // -- import/export --
783 void encode_export(ceph::buffer::list
& bl
);
784 void finish_export();
785 void abort_export() {
786 put(PIN_TEMPEXPORTING
);
787 ceph_assert(state_test(STATE_EXPORTINGCAPS
));
788 state_clear(STATE_EXPORTINGCAPS
);
789 put(PIN_EXPORTINGCAPS
);
791 void decode_import(ceph::buffer::list::const_iterator
& p
, LogSegment
*ls
);
793 // for giving to clients
794 int encode_inodestat(ceph::buffer::list
& bl
, Session
*session
, SnapRealm
*realm
,
795 snapid_t snapid
=CEPH_NOSNAP
, unsigned max_bytes
=0,
796 int getattr_wants
=0);
797 void encode_cap_message(const ceph::ref_t
<MClientCaps
> &m
, Capability
*cap
);
799 SimpleLock
* get_lock(int type
) override
;
801 void set_object_info(MDSCacheObjectInfo
&info
) override
;
803 void encode_lock_state(int type
, ceph::buffer::list
& bl
) override
;
804 void decode_lock_state(int type
, const ceph::buffer::list
& bl
) override
;
805 void encode_lock_iauth(ceph::buffer::list
& bl
);
806 void decode_lock_iauth(ceph::buffer::list::const_iterator
& p
);
807 void encode_lock_ilink(ceph::buffer::list
& bl
);
808 void decode_lock_ilink(ceph::buffer::list::const_iterator
& p
);
809 void encode_lock_idft(ceph::buffer::list
& bl
);
810 void decode_lock_idft(ceph::buffer::list::const_iterator
& p
);
811 void encode_lock_ifile(ceph::buffer::list
& bl
);
812 void decode_lock_ifile(ceph::buffer::list::const_iterator
& p
);
813 void encode_lock_inest(ceph::buffer::list
& bl
);
814 void decode_lock_inest(ceph::buffer::list::const_iterator
& p
);
815 void encode_lock_ixattr(ceph::buffer::list
& bl
);
816 void decode_lock_ixattr(ceph::buffer::list::const_iterator
& p
);
817 void encode_lock_isnap(ceph::buffer::list
& bl
);
818 void decode_lock_isnap(ceph::buffer::list::const_iterator
& p
);
819 void encode_lock_iflock(ceph::buffer::list
& bl
);
820 void decode_lock_iflock(ceph::buffer::list::const_iterator
& p
);
821 void encode_lock_ipolicy(ceph::buffer::list
& bl
);
822 void decode_lock_ipolicy(ceph::buffer::list::const_iterator
& p
);
824 void _finish_frag_update(CDir
*dir
, MutationRef
& mut
);
826 void clear_dirty_scattered(int type
) override
;
827 bool is_dirty_scattered();
828 void clear_scatter_dirty(); // on rejoin ack
830 void start_scatter(ScatterLock
*lock
);
831 void finish_scatter_update(ScatterLock
*lock
, CDir
*dir
,
832 version_t inode_version
, version_t dir_accounted_version
);
833 void finish_scatter_gather_update(int type
, MutationRef
& mut
);
834 void finish_scatter_gather_update_accounted(int type
, EMetaBlob
*metablob
);
837 void open_snaprealm(bool no_split
=false);
838 void close_snaprealm(bool no_join
=false);
839 SnapRealm
*find_snaprealm() const;
840 void encode_snap(ceph::buffer::list
& bl
);
841 void decode_snap(ceph::buffer::list::const_iterator
& p
);
  /// Currently chosen loner client (holder of the loner cap).
  client_t get_loner() const { return loner_cap; }
  /// Client we would like to make the loner (see calc_ideal_loner()).
  client_t get_wanted_loner() const { return want_loner_cap; }
846 // this is the loner state our locks should aim for
847 client_t
get_target_loner() const {
848 if (loner_cap
== want_loner_cap
)
854 client_t
calc_ideal_loner();
855 void set_loner_cap(client_t l
);
856 bool choose_ideal_loner();
857 bool try_set_loner();
858 bool try_drop_loner();
860 // choose new lock state during recovery, based on issued caps
861 void choose_lock_state(SimpleLock
*lock
, int allissued
);
862 void choose_lock_states(int dirty_caps
);
864 int count_nonstale_caps();
865 bool multiple_nonstale_caps();
  /// Does any client hold a capability on this inode?
  bool is_any_caps() { return !client_caps.empty(); }
  /// Any caps from non-stale sessions? (nonzero count converts to true)
  bool is_any_nonstale_caps() { return count_nonstale_caps(); }

  /// Per-MDS wanted-caps map (rank -> wanted bits).
  const mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
871 void set_mds_caps_wanted(mempool::mds_co::compact_map
<int32_t,int32_t>& m
);
872 void set_mds_caps_wanted(mds_rank_t mds
, int32_t wanted
);
  /// Read-only view of the per-client capability map.
  const mempool_cap_map &get_client_caps() const { return client_caps; }
875 Capability
*get_client_cap(client_t client
) {
876 auto client_caps_entry
= client_caps
.find(client
);
877 if (client_caps_entry
!= client_caps
.end())
878 return &client_caps_entry
->second
;
881 int get_client_cap_pending(client_t client
) const {
882 auto client_caps_entry
= client_caps
.find(client
);
883 if (client_caps_entry
!= client_caps
.end()) {
884 return client_caps_entry
->second
.pending();
  /// Count of "notable" caps; maintained via adjust_num_caps_notable().
  int get_num_caps_notable() const { return num_caps_notable; }
891 void adjust_num_caps_notable(int d
);
893 Capability
*add_client_cap(client_t client
, Session
*session
,
894 SnapRealm
*conrealm
=nullptr, bool new_inode
=false);
895 void remove_client_cap(client_t client
);
896 void move_to_realm(SnapRealm
*realm
);
898 Capability
*reconnect_cap(client_t client
, const cap_reconnect_t
& icr
, Session
*session
);
899 void clear_client_caps_after_export();
900 void export_client_caps(std::map
<client_t
,Capability::Export
>& cl
);
903 int get_caps_liked() const;
904 int get_caps_allowed_ever() const;
905 int get_caps_allowed_by_type(int type
) const;
906 int get_caps_careful() const;
907 int get_xlocker_mask(client_t client
) const;
908 int get_caps_allowed_for_client(Session
*s
, Capability
*cap
,
909 const mempool_inode
*file_i
) const;
911 // caps issued, wanted
912 int get_caps_issued(int *ploner
= 0, int *pother
= 0, int *pxlocker
= 0,
913 int shift
= 0, int mask
= -1);
914 bool is_any_caps_wanted() const;
915 int get_caps_wanted(int *ploner
= 0, int *pother
= 0, int shift
= 0, int mask
= -1) const;
916 bool issued_caps_need_gather(SimpleLock
*lock
);
919 bool is_clientwriteable() const { return state
& STATE_CLIENTWRITEABLE
; }
920 void mark_clientwriteable();
921 void clear_clientwriteable();
// -- authority / auth pins --
mds_authority_t authority() const override;

bool can_auth_pin(int *err_ret=nullptr) const override;
void auth_pin(void *by) override;
void auth_unpin(void *by) override;

// -- freezing --
bool is_freezing_inode() const { return state_test(STATE_FREEZING); }
bool is_frozen_inode() const { return state_test(STATE_FROZEN); }
bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN); }
bool is_frozen() const override;
bool is_frozen_dir() const;
bool is_freezing() const override;

/* Freeze the inode. auth_pin_allowance lets the caller account for any
 * auth_pins it is itself holding/responsible for. */
bool freeze_inode(int auth_pin_allowance=0);
// Thaw; contexts that were blocked on the freeze are appended to
// 'finished' for the caller to run.
void unfreeze_inode(MDSContext::vec& finished);
void unfreeze_inode();

void freeze_auth_pin();
void unfreeze_auth_pin();
// -- reference counting --
// Diagnostic hook for an unbalanced unpin: log the pin state, then assert.
void bad_put(int by) override {
  generic_dout(0) << " bad put " << *this << " by " << by
                  << " " << pin_name(by) << " was " << ref
                  << " (" << ref_map << ")"
  // NOTE(review): stream terminator (dendl) elided in this excerpt.
  ceph_assert(ref_map[by] > 0);
  ceph_assert(ref > 0);
  // NOTE(review): closing brace elided in this excerpt.
// Diagnostic hook for a suspect re-pin: log the pin state, then assert.
void bad_get(int by) override {
  generic_dout(0) << " bad get " << *this << " by " << by
                  << " " << pin_name(by) << " was " << ref
                  << " (" << ref_map << ")"
  // NOTE(review): stream terminator (dendl) elided in this excerpt.
  ceph_assert(ref_map[by] >= 0);
  // NOTE(review): closing brace elided in this excerpt.
void first_get() override;
void last_put() override;
void _put() override;
// -- hierarchy stuff --
// Install the primary (non-remote) parent dentry; asserts no parent is
// already set unless the invalid-metadata loading escape hatch is on.
void set_primary_parent(CDentry *p) {
  ceph_assert(parent == 0 ||
              g_conf().get_val<bool>("mds_hack_allow_loading_invalid_metadata"));
  // NOTE(review): the assignment of parent and the closing brace are
  // elided in this excerpt.
void remove_primary_parent(CDentry *dn) {
  ceph_assert(dn == parent);
  // NOTE(review): remainder of body elided in this excerpt.
// Remote (hard-link) parent dentries.
void add_remote_parent(CDentry *p);
void remove_remote_parent(CDentry *p);
int num_remote_parents() {
  return remote_parents.size();
// NOTE(review): closing brace elided in this excerpt.

// Projected (in-flight, not yet journaled) parent links used during
// rename/(un)link; front of the list becomes the real parent on pop.
void push_projected_parent(CDentry *dn) {
  projected_parent.push_back(dn);
// NOTE(review): closing brace elided in this excerpt.
void pop_projected_parent() {
  ceph_assert(projected_parent.size());
  parent = projected_parent.front();
  projected_parent.pop_front();
// NOTE(review): closing brace elided in this excerpt.
bool is_parent_projected() const {
  return !projected_parent.empty();
// NOTE(review): closing brace elided in this excerpt.
// -- export pinning (explicit and ephemeral) --
mds_rank_t get_export_pin(bool inherit=true) const;
void check_pin_policy(mds_rank_t target);
void set_export_pin(mds_rank_t rank);
void queue_export_pin(mds_rank_t target);
void maybe_export_pin(bool update=false);

void set_ephemeral_pin(bool dist, bool rand);
void clear_ephemeral_pin(bool dist, bool rand);

void setxattr_ephemeral_dist(bool val=false);
bool is_ephemeral_dist() const {
  return state_test(STATE_DISTEPHEMERALPIN);
// NOTE(review): closing brace elided in this excerpt.

double get_ephemeral_rand() const;
void maybe_ephemeral_rand(double threshold=-1.0);
void setxattr_ephemeral_rand(double prob=0.0);
bool is_ephemeral_rand() const {
  return state_test(STATE_RANDEPHEMERALPIN);
// NOTE(review): closing brace elided in this excerpt.

// True when the inode's policy enables either ephemeral pin type.
bool has_ephemeral_policy() const {
  return get_inode()->export_ephemeral_random_pin > 0.0 ||
    get_inode()->export_ephemeral_distributed_pin;
// NOTE(review): closing brace elided in this excerpt.
// True when either ephemeral pin is currently in effect (state flags).
bool is_ephemerally_pinned() const {
  return state_test(STATE_DISTEPHEMERALPIN) ||
    state_test(STATE_RANDEPHEMERALPIN);
// NOTE(review): closing brace elided in this excerpt.

void print(std::ostream& out) override;
void dump(ceph::Formatter *f, int flags = DUMP_DEFAULT) const;
/**
 * Validate that the on-disk state of an inode matches what
 * we expect from our memory state. Currently this checks that:
 * 1) The backtrace associated with the file data exists and is correct
 * 2) For directories, the actual inode metadata matches our memory state,
 * 3) For directories, the rstats match
 *
 * @param results A freshly-created validated_data struct, with values set
 * as described in the struct documentation.
 * @param mdr The request to be responded upon the completion of the
 * validation (or NULL)
 * @param fin Context to call back on completion (or NULL)
 */
void validate_disk_state(validated_data *results,
// NOTE(review): the remaining parameters (mdr/fin, per the doc comment
// above) are elided in this excerpt.
static void dump_validation_results(const validated_data& results,
                                    ceph::Formatter *f);
//bool hack_accessed = false;
//utime_t hack_load_stamp;

// -- snapshot realm state --
SnapRealm *snaprealm = nullptr;         // realm rooted at this inode, if any
SnapRealm *containing_realm = nullptr;  // nearest enclosing realm
snapid_t first, last;                   // snapid validity range of this in-memory inode
mempool::mds_co::compact_set<snapid_t> dirty_old_rstats;

uint64_t last_journaled = 0;       // log offset for the last time i was journaled
//loff_t last_open_journaled;  // log offset for the last journaled EOpen
utime_t last_dirstat_prop;

// list item node for when we have unpropagated rstat data
elist<CInode*>::item dirty_rstat_item;

// clients that still have to flush snapped cap data
mempool::mds_co::set<client_t> client_snap_caps;
mempool::mds_co::compact_map<snapid_t, mempool::mds_co::set<client_t> > client_need_snapflush;

// LogSegment lists i (may) belong to
elist<CInode*>::item item_dirty;
elist<CInode*>::item item_caps;
elist<CInode*>::item item_open_file;
elist<CInode*>::item item_dirty_parent;
elist<CInode*>::item item_dirty_dirfrag_dir;
elist<CInode*>::item item_dirty_dirfrag_nest;
elist<CInode*>::item item_dirty_dirfrag_dirfragtree;

// also update RecoveryQueue::RecoveryQueue() if you change this
// (the recovery-queue items alias the dirty-dirfrag list items above)
elist<CInode*>::item& item_recover_queue = item_dirty_dirfrag_dir;
elist<CInode*>::item& item_recover_queue_front = item_dirty_dirfrag_nest;

inode_load_vec_t pop;              // popularity/load statistics
elist<CInode*>::item item_pop_lru;

// -- lock type descriptors (shared by all CInodes) --
static LockType versionlock_type;
static LockType authlock_type;
static LockType linklock_type;
static LockType dirfragtreelock_type;
static LockType filelock_type;
static LockType xattrlock_type;
static LockType snaplock_type;
static LockType nestlock_type;
static LockType flocklock_type;
static LockType policylock_type;

// FIXME not part of mempool
LocalLockC versionlock;
SimpleLock authlock;
SimpleLock linklock;
ScatterLock dirfragtreelock;
ScatterLock filelock;
SimpleLock xattrlock;
SimpleLock snaplock;
ScatterLock nestlock;
SimpleLock flocklock;
SimpleLock policylock;

// loner: the client holding / wanting exclusive caps; -1 means unset
client_t loner_cap = -1, want_loner_cap = -1;
// -- advisory file-lock (fcntl / flock) state, allocated lazily --

// Lazily construct the fcntl (POSIX record lock) state table.
ceph_lock_state_t *get_fcntl_lock_state() {
  // NOTE(review): the guard line is elided in this excerpt; presumably
  // allocation only happens when fcntl_locks is still null -- confirm.
  fcntl_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL);
  // NOTE(review): return statement and closing brace elided.
void clear_fcntl_lock_state() {
  // NOTE(review): body elided in this excerpt.
// Lazily construct the BSD flock state table.
ceph_lock_state_t *get_flock_lock_state() {
  flock_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FLOCK);
  // NOTE(review): return statement and closing brace elided.
void clear_flock_lock_state() {
  // NOTE(review): body elided in this excerpt.
// Drop both kinds of lock state.
void clear_file_locks() {
  clear_fcntl_lock_state();
  clear_flock_lock_state();
// NOTE(review): closing brace elided in this excerpt.

// Encode both lock tables; each is prefixed by a has-locks bool so an
// empty/unallocated table costs only one flag on the wire.
void _encode_file_locks(ceph::buffer::list& bl) const {
  bool has_fcntl_locks = fcntl_locks && !fcntl_locks->empty();
  encode(has_fcntl_locks, bl);
  if (has_fcntl_locks)
    encode(*fcntl_locks, bl);
  bool has_flock_locks = flock_locks && !flock_locks->empty();
  encode(has_flock_locks, bl);
  if (has_flock_locks)
    encode(*flock_locks, bl);
// NOTE(review): closing brace elided in this excerpt.

// Inverse of _encode_file_locks. NOTE(review): the 'else' lines that
// should pair each clear_*_lock_state() with its flag check appear to be
// elided in this excerpt -- confirm against the full source.
void _decode_file_locks(ceph::buffer::list::const_iterator& p) {
  bool has_fcntl_locks;
  decode(has_fcntl_locks, p);
  if (has_fcntl_locks)
    decode(*get_fcntl_lock_state(), p);
  clear_fcntl_lock_state();
  bool has_flock_locks;
  decode(has_flock_locks, p);
  if (has_flock_locks)
    decode(*get_flock_lock_state(), p);
  clear_flock_lock_state();
// NOTE(review): closing brace elided in this excerpt.
/**
 * Return the pool ID where we currently write backtraces for
 * this inode (in addition to inode.old_pools)
 *
 * @returns a pool ID >=0
 */
int64_t get_backtrace_pool() const;

// parent dentries in cache
CDentry *parent = nullptr;  // primary link
mempool::mds_co::compact_set<CDentry*> remote_parents;  // if hard linked
mempool::mds_co::list<CDentry*> projected_parent;  // for in-progress rename, (un)link, etc.

mds_authority_t inode_auth = CDIR_AUTH_DEFAULT;

// -- distributed state --
// file capabilities
mempool_cap_map client_caps;  // client -> caps
mempool::mds_co::compact_map<int32_t, int32_t> mds_caps_wanted;  // [auth] mds -> caps wanted
int replica_caps_wanted = 0;  // [replica] what i've requested from auth
int num_caps_notable = 0;     // cached count, see adjust_num_caps_notable()

// advisory lock tables, allocated on first use by get_*_lock_state()
ceph_lock_state_t *fcntl_locks = nullptr;
ceph_lock_state_t *flock_locks = nullptr;

// contexts waiting on a particular dirfrag of this inode
mempool::mds_co::compact_map<frag_t, MDSContext::vec> waiting_on_dir;

// -- freezing inode --
int auth_pin_freeze_allowance = 0;  // auth pins the freezer accounted for
elist<CInode*>::item item_freezing_inode;
void maybe_finish_freeze_inode();
friend class ValidationContinuation;

/**
 * Create a scrub_info_t struct for the scrub_infop pointer.
 */
void scrub_info_create() const;
/**
 * Delete the scrub_info_t struct if it's not got any useful data
 */
void scrub_maybe_delete_info();

void pop_projected_snaprealm(sr_t *next_snaprealm, bool early);

bool _validate_disk_state(class ValidationContinuation *c,
                          int rval, int stage);

// One entry in the projected-values list: immutable snapshots of the
// inode/xattr (and snap-realm) state for an in-flight update.
struct projected_const_node {
  inode_const_ptr inode;
  xattr_map_const_ptr xattrs;
  // NOTE(review): the snapnode member declaration is elided in this
  // excerpt (the constructor below initializes snapnode(s)).
  projected_const_node() = delete;
  projected_const_node(projected_const_node&&) = default;
  explicit projected_const_node(const inode_const_ptr& i, const xattr_map_const_ptr& x, sr_t *s) :
    inode(i), xattrs(x), snapnode(s) {}
// NOTE(review): closing brace of the struct elided in this excerpt.

mempool::mds_co::list<projected_const_node> projected_nodes;  // projected values (only defined while dirty)
size_t num_projected_srnodes = 0;

// -- cache infrastructure --
mempool::mds_co::compact_map<frag_t,CDir*> dirfrags;  // cached dir fragments under this Inode

//for the purpose of quickly determining whether there's a subtree root or exporting dir
int num_subtree_roots = 0;
int num_exporting_dirs = 0;

int stickydir_ref = 0;
std::unique_ptr<scrub_info_t> scrub_infop;
/** @} Scrubbing and fsck */
// Debug/log pretty-printer for CInode (declaration only; defined elsewhere).
std::ostream& operator<<(std::ostream& out, const CInode& in);

// Global table of CInode lock descriptors and its entry count
// (extern: defined in a single translation unit elsewhere).
extern cinode_lock_info_t cinode_lock_info[];
extern int num_cinode_locks;