1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/int_types.h"
16 #include "common/errno.h"
30 #include "events/EUpdate.h"
32 #include "osdc/Objecter.h"
36 #include "LogSegment.h"
38 #include "common/Clock.h"
40 #include "common/config.h"
41 #include "global/global_context.h"
42 #include "include/ceph_assert.h"
44 #include "mds/MDSContinuation.h"
45 #include "mds/InoTable.h"
46 #include "cephfs_features.h"
47 #include "osdc/Objecter.h"
49 #define dout_context g_ceph_context
50 #define dout_subsys ceph_subsys_mds
52 #define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "
// Populate 'op' with the xattr writes that persist this inode's metadata:
// the encoded backtrace goes into the "parent" xattr and the file layout
// into the "layout" xattr.
// NOTE(review): interior lines of this function (e.g. the bufferlist
// declarations and any early-return guard before the layout update) are
// elided in this extract — confirm against the full source.
54 void CInodeCommitOperation::update(ObjectOperation
&op
, inode_backtrace_t
&bt
) {
// carry the caller-chosen priority onto the object operation
57 op
.priority
= priority
;
// encode the backtrace and write it as the "parent" xattr
61 encode(bt
, parent_bl
);
62 op
.setxattr("parent", parent_bl
);
64 // for the old pool there is no need to update the layout
// encode the layout (feature-dependent encoding) into the "layout" xattr
69 encode(_layout
, layout_bl
, _features
);
70 op
.setxattr("layout", layout_bl
);
// I/O completion context bound to a CInode: routes MDSIOContextBase
// callbacks back through the inode's cache to its MDSRank.
// NOTE(review): access specifiers, member declarations and closing braces
// are elided in this extract.
73 class CInodeIOContext
: public MDSIOContextBase
// resolve the owning MDSRank via the inode's mdcache pointer
77 MDSRank
*get_mds() override
{return in
->mdcache
->mds
;}
// a context is only meaningful for a non-null inode — assert it up front
79 explicit CInodeIOContext(CInode
*in_
) : in(in_
) {
80 ceph_assert(in
!= NULL
);
// Sentinel meaning "no srnode projected for this entry": an intentionally
// invalid all-ones pointer, distinguishable from both nullptr (projected
// deletion) and any real sr_t allocation.
84 sr_t
* const CInode::projected_inode::UNDEF_SRNODE
= (sr_t
*)(unsigned long)-1;
// Static per-class lock-type descriptors: one LockType singleton per
// CEPH_LOCK_I* id, shared by the corresponding SimpleLock member of every
// CInode instance (see the CInode constructor, which passes these by
// address).
86 LockType
CInode::versionlock_type(CEPH_LOCK_IVERSION
);
87 LockType
CInode::authlock_type(CEPH_LOCK_IAUTH
);
88 LockType
CInode::linklock_type(CEPH_LOCK_ILINK
);
89 LockType
CInode::dirfragtreelock_type(CEPH_LOCK_IDFT
);
90 LockType
CInode::filelock_type(CEPH_LOCK_IFILE
);
91 LockType
CInode::xattrlock_type(CEPH_LOCK_IXATTR
);
92 LockType
CInode::snaplock_type(CEPH_LOCK_ISNAP
);
93 LockType
CInode::nestlock_type(CEPH_LOCK_INEST
);
94 LockType
CInode::flocklock_type(CEPH_LOCK_IFLOCK
);
95 LockType
CInode::policylock_type(CEPH_LOCK_IPOLICY
);
97 std::string_view
CInode::pin_name(int p
) const
100 case PIN_DIRFRAG
: return "dirfrag";
101 case PIN_CAPS
: return "caps";
102 case PIN_IMPORTING
: return "importing";
103 case PIN_OPENINGDIR
: return "openingdir";
104 case PIN_REMOTEPARENT
: return "remoteparent";
105 case PIN_BATCHOPENJOURNAL
: return "batchopenjournal";
106 case PIN_SCATTERED
: return "scattered";
107 case PIN_STICKYDIRS
: return "stickydirs";
108 //case PIN_PURGING: return "purging";
109 case PIN_FREEZING
: return "freezing";
110 case PIN_FROZEN
: return "frozen";
111 case PIN_IMPORTINGCAPS
: return "importingcaps";
112 case PIN_EXPORTINGCAPS
: return "exportingcaps";
113 case PIN_PASTSNAPPARENT
: return "pastsnapparent";
114 case PIN_OPENINGSNAPPARENTS
: return "openingsnapparents";
115 case PIN_TRUNCATING
: return "truncating";
116 case PIN_STRAY
: return "stray";
117 case PIN_NEEDSNAPFLUSH
: return "needsnapflush";
118 case PIN_DIRTYRSTAT
: return "dirtyrstat";
119 case PIN_DIRTYPARENT
: return "dirtyparent";
120 case PIN_DIRWAITER
: return "dirwaiter";
121 default: return generic_pin_name(p
);
125 //int cinode_pins[CINODE_NUM_PINS]; // counts
// Debug-log line prefix for this inode: wall-clock time, the owning MDS
// rank, and the inode number.
// NOTE(review): the function's surrounding braces are elided in this
// extract.
126 ostream
& CInode::print_db_line_prefix(ostream
& out
)
128 return out
<< ceph_clock_now() << " mds." << mdcache
->mds
->get_nodeid() << ".cache.ino(" << ino() << ") ";
132 * write caps and lock ids
// Table pairing each inode lock id with the cap bits whose writes that
// lock gates.
// NOTE(review): the block-comment delimiters and the table's closing
// "};" are elided in this extract.
134 struct cinode_lock_info_t cinode_lock_info
[] = {
135 { CEPH_LOCK_IFILE
, CEPH_CAP_ANY_FILE_WR
},
136 { CEPH_LOCK_IAUTH
, CEPH_CAP_AUTH_EXCL
},
137 { CEPH_LOCK_ILINK
, CEPH_CAP_LINK_EXCL
},
138 { CEPH_LOCK_IXATTR
, CEPH_CAP_XATTR_EXCL
},
// element count of the table above
140 int num_cinode_locks
= sizeof(cinode_lock_info
) / sizeof(cinode_lock_info
[0]);
142 ostream
& operator<<(ostream
& out
, const CInode
& in
)
145 in
.make_path_string(path
, true);
147 out
<< "[inode " << in
.ino();
149 << (in
.is_multiversion() ? "...":"")
150 << in
.first
<< "," << in
.last
<< "]";
151 out
<< " " << path
<< (in
.is_dir() ? "/":"");
155 if (in
.is_replicated())
156 out
<< in
.get_replicas();
158 mds_authority_t a
= in
.authority();
159 out
<< " rep@" << a
.first
;
160 if (a
.second
!= CDIR_AUTH_UNKNOWN
)
161 out
<< "," << a
.second
;
162 out
<< "." << in
.get_replica_nonce();
166 out
<< " symlink='" << in
.symlink
<< "'";
167 if (in
.is_dir() && !in
.dirfragtree
.empty())
168 out
<< " " << in
.dirfragtree
;
170 out
<< " v" << in
.get_version();
171 if (in
.get_projected_version() > in
.get_version())
172 out
<< " pv" << in
.get_projected_version();
174 if (in
.get_num_auth_pins()) {
175 out
<< " ap=" << in
.get_num_auth_pins();
176 #ifdef MDS_AUTHPIN_SET
177 in
.print_authpin_set(out
);
182 out
<< " snaprealm=" << in
.snaprealm
;
184 if (in
.state_test(CInode::STATE_AMBIGUOUSAUTH
)) out
<< " AMBIGAUTH";
185 if (in
.state_test(CInode::STATE_NEEDSRECOVER
)) out
<< " NEEDSRECOVER";
186 if (in
.state_test(CInode::STATE_RECOVERING
)) out
<< " RECOVERING";
187 if (in
.state_test(CInode::STATE_DIRTYPARENT
)) out
<< " DIRTYPARENT";
188 if (in
.state_test(CInode::STATE_MISSINGOBJS
)) out
<< " MISSINGOBJS";
189 if (in
.is_ephemeral_dist()) out
<< " DISTEPHEMERALPIN";
190 if (in
.is_ephemeral_rand()) out
<< " RANDEPHEMERALPIN";
191 if (in
.is_freezing_inode()) out
<< " FREEZING=" << in
.auth_pin_freeze_allowance
;
192 if (in
.is_frozen_inode()) out
<< " FROZEN";
193 if (in
.is_frozen_auth_pin()) out
<< " FROZEN_AUTHPIN";
195 const auto& pi
= in
.get_projected_inode();
196 if (pi
->is_truncating())
197 out
<< " truncating(" << pi
->truncate_from
<< " to " << pi
->truncate_size
<< ")";
200 out
<< " " << in
.get_inode()->dirstat
;
201 if (g_conf()->mds_debug_scatterstat
&& in
.is_projected()) {
202 out
<< "->" << pi
->dirstat
;
205 out
<< " s=" << in
.get_inode()->size
;
206 if (in
.get_inode()->nlink
!= 1)
207 out
<< " nl=" << in
.get_inode()->nlink
;
211 out
<< " " << in
.get_inode()->rstat
;
212 if (!(in
.get_inode()->rstat
== in
.get_inode()->accounted_rstat
))
213 out
<< "/" << in
.get_inode()->accounted_rstat
;
214 if (g_conf()->mds_debug_scatterstat
&& in
.is_projected()) {
215 out
<< "->" << pi
->rstat
;
216 if (!(pi
->rstat
== pi
->accounted_rstat
))
217 out
<< "/" << pi
->accounted_rstat
;
220 if (in
.is_any_old_inodes()) {
221 out
<< " old_inodes=" << in
.get_old_inodes()->size();
224 if (!in
.client_need_snapflush
.empty())
225 out
<< " need_snapflush=" << in
.client_need_snapflush
;
228 if (!in
.authlock
.is_sync_and_unlocked())
229 out
<< " " << in
.authlock
;
230 if (!in
.linklock
.is_sync_and_unlocked())
231 out
<< " " << in
.linklock
;
232 if (in
.get_inode()->is_dir()) {
233 if (!in
.dirfragtreelock
.is_sync_and_unlocked())
234 out
<< " " << in
.dirfragtreelock
;
235 if (!in
.snaplock
.is_sync_and_unlocked())
236 out
<< " " << in
.snaplock
;
237 if (!in
.nestlock
.is_sync_and_unlocked())
238 out
<< " " << in
.nestlock
;
239 if (!in
.policylock
.is_sync_and_unlocked())
240 out
<< " " << in
.policylock
;
242 if (!in
.flocklock
.is_sync_and_unlocked())
243 out
<< " " << in
.flocklock
;
245 if (!in
.filelock
.is_sync_and_unlocked())
246 out
<< " " << in
.filelock
;
247 if (!in
.xattrlock
.is_sync_and_unlocked())
248 out
<< " " << in
.xattrlock
;
249 if (!in
.versionlock
.is_sync_and_unlocked())
250 out
<< " " << in
.versionlock
;
252 // hack: spit out crap on which clients have caps
253 if (in
.get_inode()->client_ranges
.size())
254 out
<< " cr=" << in
.get_inode()->client_ranges
;
256 if (!in
.get_client_caps().empty()) {
259 for (const auto &p
: in
.get_client_caps()) {
260 if (!first
) out
<< ",";
261 out
<< p
.first
<< "="
262 << ccap_string(p
.second
.pending());
263 if (p
.second
.issued() != p
.second
.pending())
264 out
<< "/" << ccap_string(p
.second
.issued());
265 out
<< "/" << ccap_string(p
.second
.wanted())
266 << "@" << p
.second
.get_last_seq();
270 if (in
.get_loner() >= 0 || in
.get_wanted_loner() >= 0) {
271 out
<< ",l=" << in
.get_loner();
272 if (in
.get_loner() != in
.get_wanted_loner())
273 out
<< "(" << in
.get_wanted_loner() << ")";
276 if (!in
.get_mds_caps_wanted().empty()) {
279 for (const auto &p
: in
.get_mds_caps_wanted()) {
282 out
<< p
.first
<< '=' << ccap_string(p
.second
);
288 if (in
.get_num_ref()) {
290 in
.print_pin_set(out
);
293 if (in
.get_inode()->export_pin
!= MDS_RANK_NONE
) {
294 out
<< " export_pin=" << in
.get_inode()->export_pin
;
296 if (in
.state_test(CInode::STATE_DISTEPHEMERALPIN
)) {
299 if (in
.state_test(CInode::STATE_RANDEPHEMERALPIN
)) {
308 CInode::CInode(MDCache
*c
, bool auth
, snapid_t f
, snapid_t l
) :
309 mdcache(c
), first(f
), last(l
),
312 item_open_file(this),
313 item_dirty_parent(this),
314 item_dirty_dirfrag_dir(this),
315 item_dirty_dirfrag_nest(this),
316 item_dirty_dirfrag_dirfragtree(this),
318 versionlock(this, &versionlock_type
),
319 authlock(this, &authlock_type
),
320 linklock(this, &linklock_type
),
321 dirfragtreelock(this, &dirfragtreelock_type
),
322 filelock(this, &filelock_type
),
323 xattrlock(this, &xattrlock_type
),
324 snaplock(this, &snaplock_type
),
325 nestlock(this, &nestlock_type
),
326 flocklock(this, &flocklock_type
),
327 policylock(this, &policylock_type
)
330 state_set(STATE_AUTH
);
333 void CInode::print(ostream
& out
)
// Record that 'client' still owes a snap flush for snapshot 'snapid' of
// 'snapin' (a snapped/old inode belonging to this head inode). Pins both
// the head inode and the snapped inode until the flush arrives.
// NOTE(review): interior lines (braces, the first-insert check on
// 'clients') are elided in this extract.
338 void CInode::add_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
)
340 dout(10) << __func__
<< " client." << client
<< " snapid " << snapid
<< " on " << snapin
<< dendl
;
// first pending snapflush on this head inode: take the pin ref
342 if (client_need_snapflush
.empty()) {
343 get(CInode::PIN_NEEDSNAPFLUSH
);
345 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
346 // long periods waiting for clients to flush their snaps.
347 auth_pin(this); // pin head get_inode()->..
// per-snapid set of clients that still need to flush
350 auto &clients
= client_need_snapflush
[snapid
];
352 snapin
->auth_pin(this); // ...and pin snapped/old inode!
354 clients
.insert(client
);
// Inverse of add_need_snapflush: drop 'client' from the pending-flush set
// for 'snapid', releasing the per-snap and per-head pins once the
// corresponding sets drain empty. Quietly returns when the snapid or
// client was never registered.
// NOTE(review): interior lines (braces, early returns, the n==0 check)
// are elided in this extract.
357 void CInode::remove_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
)
359 dout(10) << __func__
<< " client." << client
<< " snapid " << snapid
<< " on " << snapin
<< dendl
;
// find the pending-client set for this snapid
360 auto it
= client_need_snapflush
.find(snapid
);
361 if (it
== client_need_snapflush
.end()) {
362 dout(10) << " snapid not found" << dendl
;
// erase the client; n records whether it was actually present
365 size_t n
= it
->second
.erase(client
);
367 dout(10) << " client not found" << dendl
;
// last client for this snapid: drop the entry and the snapped inode's pin
370 if (it
->second
.empty()) {
371 client_need_snapflush
.erase(it
);
372 snapin
->auth_unpin(this);
// last snapid overall: drop the head inode's pin ref
374 if (client_need_snapflush
.empty()) {
375 put(CInode::PIN_NEEDSNAPFLUSH
);
381 pair
<bool,bool> CInode::split_need_snapflush(CInode
*cowin
, CInode
*in
)
383 dout(10) << __func__
<< " [" << cowin
->first
<< "," << cowin
->last
<< "] for " << *cowin
<< dendl
;
384 bool cowin_need_flush
= false;
385 bool orig_need_flush
= false;
386 auto it
= client_need_snapflush
.lower_bound(cowin
->first
);
387 while (it
!= client_need_snapflush
.end() && it
->first
< in
->first
) {
388 ceph_assert(!it
->second
.empty());
389 if (cowin
->last
>= it
->first
) {
390 cowin
->auth_pin(this);
391 cowin_need_flush
= true;
394 it
= client_need_snapflush
.erase(it
);
396 in
->auth_unpin(this);
399 if (it
!= client_need_snapflush
.end() && it
->first
<= in
->last
)
400 orig_need_flush
= true;
402 return make_pair(cowin_need_flush
, orig_need_flush
);
// Flag this inode as having dirty rstat (recursive stats) and, when our
// projected parent dentry is auth, queue ourselves on the parent dir's
// dirty-rstat list and nudge its nestlock scatterlock; otherwise we must
// be mid cross-MDS rename (asserted via STATE_AMBIGUOUSAUTH).
// NOTE(review): braces and the else branch's structure are elided in this
// extract.
405 void CInode::mark_dirty_rstat()
// only do the work on the clean->dirty transition
407 if (!state_test(STATE_DIRTYRSTAT
)) {
408 dout(10) << __func__
<< dendl
;
409 state_set(STATE_DIRTYRSTAT
);
411 CDentry
*pdn
= get_projected_parent_dn();
412 if (pdn
->is_auth()) {
413 CDir
*pdir
= pdn
->dir
;
// register on the parent dir's dirty-rstat inode list
414 pdir
->dirty_rstat_inodes
.push_back(&dirty_rstat_item
);
// tell the locker the parent's nestlock scatter data changed
415 mdcache
->mds
->locker
->mark_updated_scatterlock(&pdir
->inode
->nestlock
);
417 // under cross-MDS rename.
418 // DIRTYRSTAT flag will get cleared when rename finishes
419 ceph_assert(state_test(STATE_AMBIGUOUSAUTH
));
// Clear the dirty-rstat flag and unlink ourselves from the parent dir's
// dirty-rstat list (remove_myself is a no-op if not linked).
// NOTE(review): braces are elided in this extract.
423 void CInode::clear_dirty_rstat()
425 if (state_test(STATE_DIRTYRSTAT
)) {
426 dout(10) << __func__
<< dendl
;
427 state_clear(STATE_DIRTYRSTAT
);
429 dirty_rstat_item
.remove_myself();
433 CInode::projected_inode
CInode::project_inode(const MutationRef
& mut
,
434 bool xattr
, bool snap
)
436 if (mut
&& mut
->is_projected(this)) {
437 ceph_assert(!xattr
&& !snap
);
438 auto _inode
= std::const_pointer_cast
<mempool_inode
>(projected_nodes
.back().inode
);
439 return projected_inode(std::move(_inode
), xattr_map_ptr());
442 auto pi
= allocate_inode(*get_projected_inode());
444 if (scrub_infop
&& scrub_infop
->last_scrub_dirty
) {
445 pi
->last_scrub_stamp
= scrub_infop
->last_scrub_stamp
;
446 pi
->last_scrub_version
= scrub_infop
->last_scrub_version
;
447 scrub_infop
->last_scrub_dirty
= false;
448 scrub_maybe_delete_info();
451 const auto& ox
= get_projected_xattrs();
454 px
= allocate_xattr_map();
459 sr_t
* ps
= projected_inode::UNDEF_SRNODE
;
461 ps
= prepare_new_srnode(0);
462 ++num_projected_srnodes
;
465 projected_nodes
.emplace_back(pi
, xattr
? px
: ox
, ps
);
467 mut
->add_projected_node(this);
468 dout(15) << __func__
<< " " << pi
->ino
<< dendl
;
469 return projected_inode(std::move(pi
), std::move(px
), ps
);
472 void CInode::pop_and_dirty_projected_inode(LogSegment
*ls
, const MutationRef
& mut
)
474 ceph_assert(!projected_nodes
.empty());
475 auto front
= std::move(projected_nodes
.front());
476 dout(15) << __func__
<< " v" << front
.inode
->version
<< dendl
;
478 projected_nodes
.pop_front();
480 mut
->remove_projected_node(this);
482 bool pool_updated
= get_inode()->layout
.pool_id
!= front
.inode
->layout
.pool_id
;
483 bool pin_updated
= (get_inode()->export_pin
!= front
.inode
->export_pin
) ||
484 (get_inode()->export_ephemeral_distributed_pin
!=
485 front
.inode
->export_ephemeral_distributed_pin
);
487 reset_inode(std::move(front
.inode
));
488 if (front
.xattrs
!= get_xattrs())
489 reset_xattrs(std::move(front
.xattrs
));
491 if (front
.snapnode
!= projected_inode::UNDEF_SRNODE
) {
492 --num_projected_srnodes
;
493 pop_projected_snaprealm(front
.snapnode
, false);
497 if (get_inode()->is_backtrace_updated())
498 mark_dirty_parent(ls
, pool_updated
);
501 maybe_export_pin(true);
504 sr_t
*CInode::prepare_new_srnode(snapid_t snapid
)
506 const sr_t
*cur_srnode
= get_projected_srnode();
510 new_srnode
= new sr_t(*cur_srnode
);
513 snapid
= mdcache
->get_global_snaprealm()->get_newest_seq();
514 new_srnode
= new sr_t();
515 new_srnode
->seq
= snapid
;
516 new_srnode
->created
= snapid
;
517 new_srnode
->current_parent_since
= get_oldest_snap();
522 const sr_t
*CInode::get_projected_srnode() const {
523 if (num_projected_srnodes
> 0) {
524 for (auto it
= projected_nodes
.rbegin(); it
!= projected_nodes
.rend(); ++it
)
525 if (it
->snapnode
!= projected_inode::UNDEF_SRNODE
)
529 return &snaprealm
->srnode
;
534 void CInode::project_snaprealm(sr_t
*new_srnode
)
536 dout(10) << __func__
<< " " << new_srnode
<< dendl
;
537 ceph_assert(projected_nodes
.back().snapnode
== projected_inode::UNDEF_SRNODE
);
538 projected_nodes
.back().snapnode
= new_srnode
;
539 ++num_projected_srnodes
;
542 void CInode::mark_snaprealm_global(sr_t
*new_srnode
)
544 ceph_assert(!is_dir());
545 // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
546 new_srnode
->last_destroyed
= new_srnode
->current_parent_since
;
547 new_srnode
->current_parent_since
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
548 new_srnode
->mark_parent_global();
551 void CInode::clear_snaprealm_global(sr_t
*new_srnode
)
553 // restore 'current_parent_since'
554 new_srnode
->current_parent_since
= new_srnode
->last_destroyed
;
555 new_srnode
->last_destroyed
= 0;
556 new_srnode
->seq
= mdcache
->get_global_snaprealm()->get_newest_seq();
557 new_srnode
->clear_parent_global();
// Whether the projected srnode (if any) carries the parent-global flag.
// NOTE(review): braces and the return statements are elided in this
// extract.
560 bool CInode::is_projected_snaprealm_global() const
562 const sr_t
*srnode
= get_projected_srnode();
563 if (srnode
&& srnode
->is_parent_global())
// Project a new srnode and record 'newparent' (plus any snaps from the
// old parent realm) into its past-parent data — see
// record_snaprealm_past_parent below.
// NOTE(review): braces are elided in this extract.
568 void CInode::project_snaprealm_past_parent(SnapRealm
*newparent
)
570 sr_t
*new_snap
= project_snaprealm();
571 record_snaprealm_past_parent(new_snap
, newparent
);
575 /* if newparent != parent, add parent to past_parents
576 if parent DNE, we need to find what the parent actually is and fill that in */
577 void CInode::record_snaprealm_past_parent(sr_t
*new_snap
, SnapRealm
*newparent
)
579 ceph_assert(!new_snap
->is_parent_global());
580 SnapRealm
*oldparent
;
582 oldparent
= find_snaprealm();
584 oldparent
= snaprealm
->parent
;
587 if (newparent
!= oldparent
) {
588 snapid_t oldparentseq
= oldparent
->get_newest_seq();
589 if (oldparentseq
+ 1 > new_snap
->current_parent_since
) {
590 // copy old parent's snaps
591 const set
<snapid_t
>& snaps
= oldparent
->get_snaps();
592 auto p
= snaps
.lower_bound(new_snap
->current_parent_since
);
593 if (p
!= snaps
.end())
594 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
595 if (oldparentseq
> new_snap
->seq
)
596 new_snap
->seq
= oldparentseq
;
598 new_snap
->current_parent_since
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
602 void CInode::record_snaprealm_parent_dentry(sr_t
*new_snap
, SnapRealm
*oldparent
,
603 CDentry
*dn
, bool primary_dn
)
605 ceph_assert(new_snap
->is_parent_global());
608 oldparent
= dn
->get_dir()->inode
->find_snaprealm();
609 auto& snaps
= oldparent
->get_snaps();
612 auto p
= snaps
.lower_bound(dn
->first
);
613 if (p
!= snaps
.end())
614 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
616 // 'last_destroyed' is used as 'current_parent_since'
617 auto p
= snaps
.lower_bound(new_snap
->last_destroyed
);
618 if (p
!= snaps
.end())
619 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
620 new_snap
->last_destroyed
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
// Apply the front projected srnode ahead of the normal pop (the 'early'
// path), resetting the slot back to UNDEF_SRNODE so
// pop_and_dirty_projected_inode won't apply it a second time.
// NOTE(review): braces are elided in this extract.
624 void CInode::early_pop_projected_snaprealm()
626 ceph_assert(!projected_nodes
.empty());
// only act when the front entry actually projected an srnode
627 if (projected_nodes
.front().snapnode
!= projected_inode::UNDEF_SRNODE
) {
628 pop_projected_snaprealm(projected_nodes
.front().snapnode
, true);
// mark the slot consumed so the later pop skips it
629 projected_nodes
.front().snapnode
= projected_inode::UNDEF_SRNODE
;
630 --num_projected_srnodes
;
634 void CInode::pop_projected_snaprealm(sr_t
*next_snaprealm
, bool early
)
636 if (next_snaprealm
) {
637 dout(10) << __func__
<< (early
? " (early) " : " ")
638 << next_snaprealm
<< " seq " << next_snaprealm
->seq
<< dendl
;
642 auto old_flags
= snaprealm
->srnode
.flags
;
643 snaprealm
->srnode
= *next_snaprealm
;
644 delete next_snaprealm
;
646 if ((snaprealm
->srnode
.flags
^ old_flags
) & sr_t::PARENT_GLOBAL
) {
647 snaprealm
->adjust_parent();
650 if (snaprealm
->parent
)
651 dout(10) << " realm " << *snaprealm
<< " parent " << *snaprealm
->parent
<< dendl
;
653 dout(10) << __func__
<< (early
? " (early) null" : " null") << dendl
;
654 ceph_assert(snaprealm
);
655 snaprealm
->merge_to(NULL
);
660 // ====== CInode =======
// Shared default-constructed inode used wherever an InodeStoreBase needs
// a valid (empty) inode pointer without allocating per instance.
664 InodeStoreBase::inode_const_ptr
InodeStoreBase::empty_inode
= InodeStoreBase::allocate_inode();
// Hash a dentry name using the directory's configured hash function
// (inode->dir_layout.dl_dir_hash), falling back to CEPH_STR_HASH_LINUX.
// NOTE(review): braces and the fallback's guarding condition are elided
// in this extract.
666 __u32
InodeStoreBase::hash_dentry_name(std::string_view dn
)
668 int which
= inode
->dir_layout
.dl_dir_hash
;
// fallback hash when none is configured (guard elided here)
670 which
= CEPH_STR_HASH_LINUX
;
671 ceph_assert(ceph_str_hash_valid(which
));
672 return ceph_str_hash(which
, dn
.data(), dn
.length());
// Map a dentry name to the dirfrag that holds it: hash the name and look
// the hash up in the fragment tree. An empty tree means a single root
// fragment, so skip the hash entirely.
// NOTE(review): braces are elided in this extract.
675 frag_t
InodeStoreBase::pick_dirfrag(std::string_view dn
)
677 if (dirfragtree
.empty())
678 return frag_t(); // avoid the string hash if we can.
680 __u32 h
= hash_dentry_name(dn
);
681 return dirfragtree
[h
];
684 std::pair
<bool, std::vector
<CDir
*>> CInode::get_dirfrags_under(frag_t fg
)
686 std::pair
<bool, std::vector
<CDir
*>> result
;
687 auto& all
= result
.first
;
688 auto& dirs
= result
.second
;
691 if (auto it
= dirfrags
.find(fg
); it
!= dirfrags
.end()){
693 dirs
.push_back(it
->second
);
698 for(auto &[_fg
, _dir
] : dirfrags
){
699 // frag_t.bits() can indicate the depth of the partition in the directory tree
701 // 01* : bit = 2, on the second floor
704 // 00* 01* 10* 11* -- > level 2, bit = 2
705 // so fragA.bits > fragB.bits means fragA is deeper than fragB
707 if (fg
.bits() >= _fg
.bits()) {
708 if (_fg
.contains(fg
)) {
713 if (fg
.contains(_fg
)) {
714 dirs
.push_back(_dir
);
715 // we can calculate how many sub slices a slice can be divided into
716 // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*)
717 // or 2^2 frags belonging to the second layer(00* 01* 10* 11*)
718 // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level
719 total
+= 1 << (24 - _fg
.bits());
724 // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache
725 all
= ((1<<(24-fg
.bits())) == total
);
729 void CInode::verify_dirfrags()
732 for (const auto &p
: dirfrags
) {
733 if (!dirfragtree
.is_leaf(p
.first
)) {
734 dout(0) << "have open dirfrag " << p
.first
<< " but not leaf in " << dirfragtree
735 << ": " << *p
.second
<< dendl
;
742 void CInode::force_dirfrags()
745 for (auto &p
: dirfrags
) {
746 if (!dirfragtree
.is_leaf(p
.first
)) {
747 dout(0) << "have open dirfrag " << p
.first
<< " but not leaf in " << dirfragtree
748 << ": " << *p
.second
<< dendl
;
755 dirfragtree
.get_leaves(leaves
);
756 for (const auto& leaf
: leaves
) {
757 mdcache
->get_force_dirfrag(dirfrag_t(ino(), leaf
), true);
764 CDir
*CInode::get_approx_dirfrag(frag_t fg
)
766 CDir
*dir
= get_dirfrag(fg
);
770 auto&& p
= get_dirfrags_under(fg
);
771 if (!p
.second
.empty())
772 return p
.second
.front();
775 while (fg
.bits() > 0) {
777 dir
= get_dirfrag(fg
);
783 CDir
*CInode::get_or_open_dirfrag(MDCache
*mdcache
, frag_t fg
)
785 ceph_assert(is_dir());
788 CDir
*dir
= get_dirfrag(fg
);
791 ceph_assert(is_auth() || mdcache
->mds
->is_any_replay());
792 dir
= new CDir(this, fg
, mdcache
, is_auth());
798 CDir
*CInode::add_dirfrag(CDir
*dir
)
800 auto em
= dirfrags
.emplace(std::piecewise_construct
, std::forward_as_tuple(dir
->dirfrag().frag
), std::forward_as_tuple(dir
));
801 ceph_assert(em
.second
);
803 if (stickydir_ref
> 0) {
804 dir
->state_set(CDir::STATE_STICKY
);
805 dir
->get(CDir::PIN_STICKY
);
813 void CInode::close_dirfrag(frag_t fg
)
815 dout(14) << __func__
<< " " << fg
<< dendl
;
816 ceph_assert(dirfrags
.count(fg
));
818 CDir
*dir
= dirfrags
[fg
];
819 dir
->remove_null_dentries();
825 if (stickydir_ref
> 0) {
826 dir
->state_clear(CDir::STATE_STICKY
);
827 dir
->put(CDir::PIN_STICKY
);
830 if (dir
->is_subtree_root())
833 // dump any remaining dentries, for debugging purposes
834 for (const auto &p
: dir
->items
)
835 dout(14) << __func__
<< " LEFTOVER dn " << *p
.second
<< dendl
;
837 ceph_assert(dir
->get_num_ref() == 0);
// Close every open dirfrag of this inode; close_dirfrag erases the map
// entry, so looping on the first element until empty terminates.
// NOTE(review): braces are elided in this extract.
842 void CInode::close_dirfrags()
844 while (!dirfrags
.empty())
845 close_dirfrag(dirfrags
.begin()->first
);
848 bool CInode::has_subtree_root_dirfrag(int auth
)
850 if (num_subtree_roots
> 0) {
853 for (const auto &p
: dirfrags
) {
854 if (p
.second
->is_subtree_root() &&
855 p
.second
->dir_auth
.first
== auth
)
862 bool CInode::has_subtree_or_exporting_dirfrag()
864 if (num_subtree_roots
> 0 || num_exporting_dirs
> 0)
// Take a sticky-dirs reference: on the 0->1 transition, mark and pin
// every open dirfrag STICKY so it stays in cache.
// NOTE(review): the refcount increment and braces are elided in this
// extract.
869 void CInode::get_stickydirs()
871 if (stickydir_ref
== 0) {
873 for (const auto &p
: dirfrags
) {
874 p
.second
->state_set(CDir::STATE_STICKY
);
875 p
.second
->get(CDir::PIN_STICKY
);
// Drop a sticky-dirs reference: when the count reaches zero, clear the
// STICKY state and pin on every open dirfrag.
// NOTE(review): the refcount decrement and braces are elided in this
// extract.
881 void CInode::put_stickydirs()
883 ceph_assert(stickydir_ref
> 0);
885 if (stickydir_ref
== 0) {
887 for (const auto &p
: dirfrags
) {
888 p
.second
->state_clear(CDir::STATE_STICKY
);
889 p
.second
->put(CDir::PIN_STICKY
);
900 void CInode::first_get()
904 parent
->get(CDentry::PIN_INODEPIN
);
907 void CInode::last_put()
911 parent
->put(CDentry::PIN_INODEPIN
);
916 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
917 mdcache
->maybe_eval_stray(this, true);
// Register a remote (hard-link) parent dentry; take the REMOTEPARENT pin
// on the first one.
// NOTE(review): braces are elided in this extract.
920 void CInode::add_remote_parent(CDentry
*p
)
922 if (remote_parents
.empty())
923 get(PIN_REMOTEPARENT
);
924 remote_parents
.insert(p
);
// Unregister a remote (hard-link) parent dentry; drop the REMOTEPARENT
// pin once the last one is gone.
// NOTE(review): braces are elided in this extract.
926 void CInode::remove_remote_parent(CDentry
*p
)
928 remote_parents
.erase(p
);
929 if (remote_parents
.empty())
930 put(PIN_REMOTEPARENT
);
936 CDir
*CInode::get_parent_dir()
942 CDir
*CInode::get_projected_parent_dir()
944 CDentry
*p
= get_projected_parent_dn();
// The inode containing our primary parent dentry's directory.
// NOTE(review): braces and the null-parent branch are elided in this
// extract.
949 CInode
*CInode::get_parent_inode()
952 return parent
->dir
->inode
;
956 bool CInode::is_ancestor_of(const CInode
*other
) const
961 const CDentry
*pdn
= other
->get_oldest_parent_dn();
963 ceph_assert(other
->is_base());
966 other
= pdn
->get_dir()->get_inode();
971 bool CInode::is_projected_ancestor_of(const CInode
*other
) const
976 const CDentry
*pdn
= other
->get_projected_parent_dn();
978 ceph_assert(other
->is_base());
981 other
= pdn
->get_dir()->get_inode();
987 * Because a non-directory inode may have multiple links, the use_parent
988 * argument allows selecting which parent to use for path construction. This
989 * argument is only meaningful for the final component (i.e. the first of the
990 * nested calls) because directories cannot have multiple hard links. If
991 * use_parent is NULL and projected is true, the primary parent's projected
992 * inode is used all the way up the path chain. Otherwise the primary parent
993 * stable inode is used.
995 void CInode::make_path_string(string
& s
, bool projected
, const CDentry
*use_parent
) const
998 use_parent
= projected
? get_projected_parent_dn() : parent
;
1002 use_parent
->make_path_string(s
, projected
);
1003 } else if (is_root()) {
1005 } else if (is_mdsdir()) {
1007 uint64_t eino(ino());
1008 eino
-= MDS_INO_MDSDIR_OFFSET
;
1009 snprintf(t
, sizeof(t
), "~mds%" PRId64
, eino
);
1013 uint64_t eino(ino());
1014 snprintf(n
, sizeof(n
), "#%" PRIx64
, eino
);
1019 void CInode::make_path(filepath
& fp
, bool projected
) const
1021 const CDentry
*use_parent
= projected
? get_projected_parent_dn() : parent
;
1023 ceph_assert(!is_base());
1024 use_parent
->make_path(fp
, projected
);
1026 fp
= filepath(ino());
// Build the stray-directory dentry name for this inode: its inode number
// formatted as lowercase hex.
// NOTE(review): the local buffer declaration, the assignment into dname
// and braces are elided in this extract.
1030 void CInode::name_stray_dentry(string
& dname
)
1033 snprintf(s
, sizeof(s
), "%llx", (unsigned long long)ino().val
);
1037 version_t
CInode::pre_dirty()
1040 CDentry
* _cdentry
= get_projected_parent_dn();
1042 pv
= _cdentry
->pre_dirty(get_projected_version());
1043 dout(10) << "pre_dirty " << pv
<< " (current v " << get_inode()->version
<< ")" << dendl
;
1045 ceph_assert(is_base());
1046 pv
= get_projected_version() + 1;
1048 // force update backtrace for old format inode (see mempool_inode::decode)
1049 if (get_inode()->backtrace_version
== 0 && !projected_nodes
.empty()) {
1050 auto pi
= _get_projected_inode();
1051 if (pi
->backtrace_version
== 0)
1052 pi
->update_backtrace(pv
);
1057 void CInode::_mark_dirty(LogSegment
*ls
)
1059 if (!state_test(STATE_DIRTY
)) {
1060 state_set(STATE_DIRTY
);
1065 // move myself to this segment's dirty list
1067 ls
->dirty_inodes
.push_back(&item_dirty
);
1070 void CInode::mark_dirty(LogSegment
*ls
) {
1072 dout(10) << __func__
<< " " << *this << dendl
;
1075 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1076 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1080 // only auth can get dirty. "dirty" async data in replicas is relative to
1081 // filelock state, not the dirty flag.
1082 ceph_assert(is_auth());
1084 // touch my private version
1089 parent
->mark_dirty(get_version(), ls
);
// Clear the DIRTY state and detach from the log segment's dirty-inode
// list (only on the dirty->clean transition).
// NOTE(review): braces and the reference put are elided in this extract.
1093 void CInode::mark_clean()
1095 dout(10) << __func__
<< " " << *this << dendl
;
1096 if (state_test(STATE_DIRTY
)) {
1097 state_clear(STATE_DIRTY
);
1100 // remove myself from ls dirty list
1101 item_dirty
.remove_myself();
1107 // per-inode storage
1108 // (currently for root inode only)
1110 struct C_IO_Inode_Stored
: public CInodeIOContext
{
1113 C_IO_Inode_Stored(CInode
*i
, version_t v
, Context
*f
) : CInodeIOContext(i
), version(v
), fin(f
) {}
1114 void finish(int r
) override
{
1115 in
->_stored(r
, version
, fin
);
1117 void print(ostream
& out
) const override
{
1118 out
<< "inode_store(" << in
->ino() << ")";
// Compose the RADOS object name for an inode/fragment pair:
// "<ino-hex>.<frag-hex>" plus an optional suffix (e.g. ".inode").
// The assert guarantees the strncat below cannot overflow the buffer.
// NOTE(review): the local buffer declaration, the return statement and
// braces are elided in this extract.
1122 object_t
InodeStoreBase::get_object_name(inodeno_t ino
, frag_t fg
, std::string_view suffix
)
1125 snprintf(n
, sizeof(n
), "%llx.%08llx", (long long unsigned)ino
, (long long unsigned)fg
);
1126 ceph_assert(strlen(n
) + suffix
.size() < sizeof n
);
1127 strncat(n
, suffix
.data(), suffix
.size());
1131 void CInode::store(MDSContext
*fin
)
1133 dout(10) << __func__
<< " " << get_version() << dendl
;
1134 ceph_assert(is_base());
1137 purge_stale_snap_data(snaprealm
->get_snaps());
1141 string magic
= CEPH_FS_ONDISK_MAGIC
;
1144 encode_store(bl
, mdcache
->mds
->mdsmap
->get_up_features());
1151 object_t oid
= CInode::get_object_name(ino(), frag_t(), ".inode");
1152 object_locator_t
oloc(mdcache
->mds
->get_metadata_pool());
1155 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin
),
1156 mdcache
->mds
->finisher
);
1157 mdcache
->mds
->objecter
->mutate(oid
, oloc
, m
, snapc
,
1158 ceph::real_clock::now(), 0,
1162 void CInode::_stored(int r
, version_t v
, Context
*fin
)
1165 dout(1) << "store error " << r
<< " v " << v
<< " on " << *this << dendl
;
1166 mdcache
->mds
->clog
->error() << "failed to store inode " << ino()
1167 << " object: " << cpp_strerror(r
);
1168 mdcache
->mds
->handle_write_error(r
);
1173 dout(10) << __func__
<< " " << v
<< " on " << *this << dendl
;
1174 if (v
== get_projected_version())
1180 void CInode::flush(MDSContext
*fin
)
1182 dout(10) << __func__
<< " " << *this << dendl
;
1183 ceph_assert(is_auth() && can_auth_pin());
1185 MDSGatherBuilder
gather(g_ceph_context
);
1187 if (is_dirty_parent()) {
1188 store_backtrace(gather
.new_sub());
1192 store(gather
.new_sub());
1194 parent
->dir
->commit(0, gather
.new_sub());
1198 if (gather
.has_subs()) {
1199 gather
.set_finisher(fin
);
1206 struct C_IO_Inode_Fetched
: public CInodeIOContext
{
1209 C_IO_Inode_Fetched(CInode
*i
, Context
*f
) : CInodeIOContext(i
), fin(f
) {}
1210 void finish(int r
) override
{
1211 // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT
1212 in
->_fetched(bl
, bl2
, fin
);
1214 void print(ostream
& out
) const override
{
1215 out
<< "inode_fetch(" << in
->ino() << ")";
1219 void CInode::fetch(MDSContext
*fin
)
1221 dout(10) << __func__
<< dendl
;
1223 C_IO_Inode_Fetched
*c
= new C_IO_Inode_Fetched(this, fin
);
1224 C_GatherBuilder
gather(g_ceph_context
, new C_OnFinisher(c
, mdcache
->mds
->finisher
));
1226 object_t oid
= CInode::get_object_name(ino(), frag_t(), "");
1227 object_locator_t
oloc(mdcache
->mds
->get_metadata_pool());
1229 // Old on-disk format: inode stored in xattr of a dirfrag
1231 rd
.getxattr("inode", &c
->bl
, NULL
);
1232 mdcache
->mds
->objecter
->read(oid
, oloc
, rd
, CEPH_NOSNAP
, (bufferlist
*)NULL
, 0, gather
.new_sub());
1234 // Current on-disk format: inode stored in a .inode object
1235 object_t oid2
= CInode::get_object_name(ino(), frag_t(), ".inode");
1236 mdcache
->mds
->objecter
->read(oid2
, oloc
, 0, 0, CEPH_NOSNAP
, &c
->bl2
, 0, gather
.new_sub());
1241 void CInode::_fetched(bufferlist
& bl
, bufferlist
& bl2
, Context
*fin
)
1243 dout(10) << __func__
<< " got " << bl
.length() << " and " << bl2
.length() << dendl
;
1244 bufferlist::const_iterator p
;
1247 } else if (bl
.length()) {
1250 derr
<< "No data while reading inode " << ino() << dendl
;
1251 fin
->complete(-CEPHFS_ENOENT
);
1260 dout(10) << " magic is '" << magic
<< "' (expecting '"
1261 << CEPH_FS_ONDISK_MAGIC
<< "')" << dendl
;
1262 if (magic
!= CEPH_FS_ONDISK_MAGIC
) {
1263 dout(0) << "on disk magic '" << magic
<< "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1265 fin
->complete(-CEPHFS_EINVAL
);
1268 dout(10) << "_fetched " << *this << dendl
;
1271 } catch (buffer::error
&err
) {
1272 derr
<< "Corrupt inode " << ino() << ": " << err
.what() << dendl
;
1273 fin
->complete(-CEPHFS_EINVAL
);
1278 void CInode::build_backtrace(int64_t pool
, inode_backtrace_t
& bt
)
1281 bt
.ancestors
.clear();
1285 CDentry
*pdn
= get_parent_dn();
1287 CInode
*diri
= pdn
->get_dir()->get_inode();
1288 bt
.ancestors
.push_back(inode_backpointer_t(diri
->ino(), pdn
->get_name(), in
->get_inode()->version
));
1290 pdn
= in
->get_parent_dn();
1292 bt
.old_pools
.reserve(get_inode()->old_pools
.size());
1293 for (auto &p
: get_inode()->old_pools
) {
1294 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
1296 bt
.old_pools
.push_back(p
);
1300 struct C_IO_Inode_StoredBacktrace
: public CInodeIOContext
{
1303 C_IO_Inode_StoredBacktrace(CInode
*i
, version_t v
, Context
*f
) : CInodeIOContext(i
), version(v
), fin(f
) {}
1304 void finish(int r
) override
{
1305 in
->_stored_backtrace(r
, version
, fin
);
1307 void print(ostream
& out
) const override
{
1308 out
<< "backtrace_store(" << in
->ino() << ")";
1313 void CInode::_commit_ops(int r
, C_GatherBuilder
&gather_bld
,
1314 std::vector
<CInodeCommitOperation
> &ops_vec
,
1315 inode_backtrace_t
&bt
)
1317 dout(10) << __func__
<< dendl
;
1320 mdcache
->mds
->handle_write_error_with_lock(r
);
1325 object_t oid
= get_object_name(ino(), frag_t(), "");
1327 for (auto &op
: ops_vec
) {
1328 ObjectOperation obj_op
;
1329 object_locator_t
oloc(op
.get_pool());
1330 op
.update(obj_op
, bt
);
1331 mdcache
->mds
->objecter
->mutate(oid
, oloc
, obj_op
, snapc
,
1332 ceph::real_clock::now(),
1333 0, gather_bld
.new_sub());
1337 void CInode::_store_backtrace(std::vector
<CInodeCommitOperation
> &ops_vec
,
1338 inode_backtrace_t
&bt
, int op_prio
)
1340 dout(10) << __func__
<< " on " << *this << dendl
;
1341 ceph_assert(is_dirty_parent());
1344 op_prio
= CEPH_MSG_PRIO_DEFAULT
;
1348 const int64_t pool
= get_backtrace_pool();
1349 build_backtrace(pool
, bt
);
1351 ops_vec
.emplace_back(op_prio
, pool
, get_inode()->layout
,
1352 mdcache
->mds
->mdsmap
->get_up_features());
1354 if (!state_test(STATE_DIRTYPOOL
) || get_inode()->old_pools
.empty()) {
1355 dout(20) << __func__
<< ": no dirtypool or no old pools" << dendl
;
1359 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1360 // such that anyone reading them will see the new pool ID in
1361 // inode_backtrace_t::pool and go read everything else from there.
1362 for (const auto &p
: get_inode()->old_pools
) {
1366 dout(20) << __func__
<< ": updating old pool " << p
<< dendl
;
1368 ops_vec
.emplace_back(op_prio
, p
);
1372 void CInode::store_backtrace(MDSContext
*fin
, int op_prio
)
1374 std::vector
<CInodeCommitOperation
> ops_vec
;
1375 inode_backtrace_t bt
;
1376 auto version
= get_inode()->backtrace_version
;
1378 _store_backtrace(ops_vec
, bt
, op_prio
);
1380 C_GatherBuilder
gather(g_ceph_context
,
1382 new C_IO_Inode_StoredBacktrace(this, version
, fin
),
1383 mdcache
->mds
->finisher
));
1384 _commit_ops(0, gather
, ops_vec
, bt
);
1385 ceph_assert(gather
.has_subs());
1389 void CInode::store_backtrace(CInodeCommitOperations
&op
, int op_prio
)
1391 op
.version
= get_inode()->backtrace_version
;
1394 _store_backtrace(op
.ops_vec
, op
.bt
, op_prio
);
1397 void CInode::_stored_backtrace(int r
, version_t v
, Context
*fin
)
1399 if (r
== -CEPHFS_ENOENT
) {
1400 const int64_t pool
= get_backtrace_pool();
1401 bool exists
= mdcache
->mds
->objecter
->with_osdmap(
1402 [pool
](const OSDMap
&osd_map
) {
1403 return osd_map
.have_pg_pool(pool
);
1406 // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it
1407 // out from under us), so the backtrace can never be written, so pretend
1408 // to succeed so that the user can proceed to e.g. delete the file.
1410 dout(4) << __func__
<< " got CEPHFS_ENOENT: a data pool was deleted "
1411 "beneath us!" << dendl
;
1417 dout(1) << "store backtrace error " << r
<< " v " << v
<< dendl
;
1418 mdcache
->mds
->clog
->error() << "failed to store backtrace on ino "
1419 << ino() << " object"
1420 << ", pool " << get_backtrace_pool()
1422 mdcache
->mds
->handle_write_error(r
);
1428 dout(10) << __func__
<< " v " << v
<< dendl
;
1431 if (v
== get_inode()->backtrace_version
)
1432 clear_dirty_parent();
1437 void CInode::fetch_backtrace(Context
*fin
, bufferlist
*backtrace
)
1439 mdcache
->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace
, fin
);
1442 void CInode::mark_dirty_parent(LogSegment
*ls
, bool dirty_pool
)
1444 if (!state_test(STATE_DIRTYPARENT
)) {
1445 dout(10) << __func__
<< dendl
;
1446 state_set(STATE_DIRTYPARENT
);
1447 get(PIN_DIRTYPARENT
);
1451 state_set(STATE_DIRTYPOOL
);
1453 ls
->dirty_parent_inodes
.push_back(&item_dirty_parent
);
1456 void CInode::clear_dirty_parent()
1458 if (state_test(STATE_DIRTYPARENT
)) {
1459 dout(10) << __func__
<< dendl
;
1460 state_clear(STATE_DIRTYPARENT
);
1461 state_clear(STATE_DIRTYPOOL
);
1462 put(PIN_DIRTYPARENT
);
1463 item_dirty_parent
.remove_myself();
1467 void CInode::verify_diri_backtrace(bufferlist
&bl
, int err
)
1469 if (is_base() || is_dirty_parent() || !is_auth())
1472 dout(10) << __func__
<< dendl
;
1475 inode_backtrace_t backtrace
;
1477 decode(backtrace
, bl
);
1478 CDentry
*pdn
= get_parent_dn();
1479 if (backtrace
.ancestors
.empty() ||
1480 backtrace
.ancestors
[0].dname
!= pdn
->get_name() ||
1481 backtrace
.ancestors
[0].dirino
!= pdn
->get_dir()->ino())
1482 err
= -CEPHFS_EINVAL
;
1486 MDSRank
*mds
= mdcache
->mds
;
1487 mds
->clog
->error() << "bad backtrace on directory inode " << ino();
1488 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace
> 1));
1490 mark_dirty_parent(mds
->mdlog
->get_current_segment(), false);
1491 mds
->mdlog
->flush();
1495 // ------------------
1499 void InodeStoreBase::encode_xattrs(bufferlist
&bl
) const {
1502 encode(*xattrs
, bl
);
1504 encode((__u32
)0, bl
);
1507 void InodeStoreBase::decode_xattrs(bufferlist::const_iterator
&p
) {
1509 mempool_xattr_map tmp
;
1510 decode_noshare(tmp
, p
);
1512 reset_xattrs(xattr_map_ptr());
1514 reset_xattrs(allocate_xattr_map(std::move(tmp
)));
1518 void InodeStoreBase::encode_old_inodes(bufferlist
&bl
, uint64_t features
) const {
1521 encode(*old_inodes
, bl
, features
);
1523 encode((__u32
)0, bl
);
1526 void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator
&p
) {
1528 mempool_old_inode_map tmp
;
1531 reset_old_inodes(old_inode_map_ptr());
1533 reset_old_inodes(allocate_old_inode_map(std::move(tmp
)));
1537 void InodeStoreBase::encode_bare(bufferlist
&bl
, uint64_t features
,
1538 const bufferlist
*snap_blob
) const
1541 encode(*inode
, bl
, features
);
1542 if (inode
->is_symlink())
1543 encode(symlink
, bl
);
1544 encode(dirfragtree
, bl
);
1548 encode(*snap_blob
, bl
);
1550 encode(bufferlist(), bl
);
1551 encode_old_inodes(bl
, features
);
1552 encode(oldest_snap
, bl
);
1553 encode(damage_flags
, bl
);
1556 void InodeStoreBase::encode(bufferlist
&bl
, uint64_t features
,
1557 const bufferlist
*snap_blob
) const
1559 ENCODE_START(6, 4, bl
);
1560 encode_bare(bl
, features
, snap_blob
);
1564 void CInode::encode_store(bufferlist
& bl
, uint64_t features
)
1566 bufferlist snap_blob
;
1567 encode_snap_blob(snap_blob
);
1568 InodeStoreBase::encode(bl
, mdcache
->mds
->mdsmap
->get_up_features(),
1572 void InodeStoreBase::decode_bare(bufferlist::const_iterator
&bl
,
1573 bufferlist
& snap_blob
, __u8 struct_v
)
1577 auto _inode
= allocate_inode();
1578 decode(*_inode
, bl
);
1580 if (_inode
->is_symlink()) {
1583 symlink
= std::string_view(tmp
);
1585 decode(dirfragtree
, bl
);
1587 decode(snap_blob
, bl
);
1589 decode_old_inodes(bl
);
1590 if (struct_v
== 2 && _inode
->is_dir()) {
1591 bool default_layout_exists
;
1592 decode(default_layout_exists
, bl
);
1593 if (default_layout_exists
) {
1594 decode(struct_v
, bl
); // this was a default_file_layout
1595 decode(_inode
->layout
, bl
); // but we only care about the layout portion
1599 if (struct_v
>= 5) {
1600 // InodeStore is embedded in dentries without proper versioning, so
1601 // we consume up to the end of the buffer
1603 decode(oldest_snap
, bl
);
1607 decode(damage_flags
, bl
);
1611 reset_inode(std::move(_inode
));
1615 void InodeStoreBase::decode(bufferlist::const_iterator
&bl
, bufferlist
& snap_blob
)
1617 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl
);
1618 decode_bare(bl
, snap_blob
, struct_v
);
1622 void CInode::decode_store(bufferlist::const_iterator
& bl
)
1624 bufferlist snap_blob
;
1625 InodeStoreBase::decode(bl
, snap_blob
);
1626 decode_snap_blob(snap_blob
);
1629 // ------------------
1632 SimpleLock
* CInode::get_lock(int type
)
1635 case CEPH_LOCK_IVERSION
: return &versionlock
;
1636 case CEPH_LOCK_IFILE
: return &filelock
;
1637 case CEPH_LOCK_IAUTH
: return &authlock
;
1638 case CEPH_LOCK_ILINK
: return &linklock
;
1639 case CEPH_LOCK_IDFT
: return &dirfragtreelock
;
1640 case CEPH_LOCK_IXATTR
: return &xattrlock
;
1641 case CEPH_LOCK_ISNAP
: return &snaplock
;
1642 case CEPH_LOCK_INEST
: return &nestlock
;
1643 case CEPH_LOCK_IFLOCK
: return &flocklock
;
1644 case CEPH_LOCK_IPOLICY
: return &policylock
;
1649 void CInode::set_object_info(MDSCacheObjectInfo
&info
)
1655 void CInode::encode_lock_iauth(bufferlist
& bl
)
1657 ENCODE_START(1, 1, bl
);
1658 encode(get_inode()->version
, bl
);
1659 encode(get_inode()->ctime
, bl
);
1660 encode(get_inode()->mode
, bl
);
1661 encode(get_inode()->uid
, bl
);
1662 encode(get_inode()->gid
, bl
);
1666 void CInode::decode_lock_iauth(bufferlist::const_iterator
& p
)
1668 ceph_assert(!is_auth());
1669 auto _inode
= allocate_inode(*get_inode());
1671 decode(_inode
->version
, p
);
1674 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
1675 decode(_inode
->mode
, p
);
1676 decode(_inode
->uid
, p
);
1677 decode(_inode
->gid
, p
);
1679 reset_inode(std::move(_inode
));
1682 void CInode::encode_lock_ilink(bufferlist
& bl
)
1684 ENCODE_START(1, 1, bl
);
1685 encode(get_inode()->version
, bl
);
1686 encode(get_inode()->ctime
, bl
);
1687 encode(get_inode()->nlink
, bl
);
1691 void CInode::decode_lock_ilink(bufferlist::const_iterator
& p
)
1693 ceph_assert(!is_auth());
1694 auto _inode
= allocate_inode(*get_inode());
1696 decode(_inode
->version
, p
);
1699 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
1700 decode(_inode
->nlink
, p
);
1702 reset_inode(std::move(_inode
));
1705 void CInode::encode_lock_idft(bufferlist
& bl
)
1707 ENCODE_START(1, 1, bl
);
1709 encode(get_inode()->version
, bl
);
1711 // treat flushing as dirty when rejoining cache
1712 bool dirty
= dirfragtreelock
.is_dirty_or_flushing();
1716 // encode the raw tree
1717 encode(dirfragtree
, bl
);
1719 // also specify which frags are mine
1720 set
<frag_t
> myfrags
;
1721 auto&& dfls
= get_dirfrags();
1722 for (const auto& dir
: dfls
) {
1723 if (dir
->is_auth()) {
1724 frag_t fg
= dir
->get_frag();
1728 encode(myfrags
, bl
);
1733 void CInode::decode_lock_idft(bufferlist::const_iterator
& p
)
1740 decode(replica_dirty
, p
);
1741 if (replica_dirty
) {
1742 dout(10) << __func__
<< " setting dftlock dirty flag" << dendl
;
1743 dirfragtreelock
.mark_dirty(); // ok bc we're auth and caller will handle
1746 _inode
= allocate_inode(*get_inode());
1747 decode(_inode
->version
, p
);
1752 set
<frag_t
> authfrags
;
1753 decode(authfrags
, p
);
1755 // auth. believe replica's auth frags only.
1756 for (auto fg
: authfrags
) {
1757 if (!dirfragtree
.is_leaf(fg
)) {
1758 dout(10) << " forcing frag " << fg
<< " to leaf (split|merge)" << dendl
;
1759 dirfragtree
.force_to_leaf(g_ceph_context
, fg
);
1760 dirfragtreelock
.mark_dirty(); // ok bc we're auth and caller will handle
1764 // replica. take the new tree, BUT make sure any open
1765 // dirfrags remain leaves (they may have split _after_ this
1766 // dft was scattered, or we may still be be waiting on the
1767 // notify from the auth)
1768 dirfragtree
.swap(temp
);
1769 for (const auto &p
: dirfrags
) {
1770 if (!dirfragtree
.is_leaf(p
.first
)) {
1771 dout(10) << " forcing open dirfrag " << p
.first
<< " to leaf (racing with split|merge)" << dendl
;
1772 dirfragtree
.force_to_leaf(g_ceph_context
, p
.first
);
1774 if (p
.second
->is_auth())
1775 p
.second
->state_clear(CDir::STATE_DIRTYDFT
);
1778 if (g_conf()->mds_debug_frag
)
1784 reset_inode(std::move(_inode
));
1787 void CInode::encode_lock_ifile(bufferlist
& bl
)
1789 ENCODE_START(1, 1, bl
);
1791 encode(get_inode()->version
, bl
);
1792 encode(get_inode()->ctime
, bl
);
1793 encode(get_inode()->mtime
, bl
);
1794 encode(get_inode()->atime
, bl
);
1795 encode(get_inode()->time_warp_seq
, bl
);
1797 encode(get_inode()->layout
, bl
, mdcache
->mds
->mdsmap
->get_up_features());
1798 encode(get_inode()->size
, bl
);
1799 encode(get_inode()->truncate_seq
, bl
);
1800 encode(get_inode()->truncate_size
, bl
);
1801 encode(get_inode()->client_ranges
, bl
);
1802 encode(get_inode()->inline_data
, bl
);
1805 // treat flushing as dirty when rejoining cache
1806 bool dirty
= filelock
.is_dirty_or_flushing();
1809 dout(15) << __func__
<< " inode.dirstat is " << get_inode()->dirstat
<< dendl
;
1810 encode(get_inode()->dirstat
, bl
); // only meaningful if i am auth.
1813 for (const auto &p
: dirfrags
) {
1814 frag_t fg
= p
.first
;
1815 CDir
*dir
= p
.second
;
1816 if (is_auth() || dir
->is_auth()) {
1817 const auto& pf
= dir
->get_projected_fnode();
1818 dout(15) << fg
<< " " << *dir
<< dendl
;
1819 dout(20) << fg
<< " fragstat " << pf
->fragstat
<< dendl
;
1820 dout(20) << fg
<< " accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
1822 encode(dir
->first
, tmp
);
1823 encode(pf
->fragstat
, tmp
);
1824 encode(pf
->accounted_fragstat
, tmp
);
1829 bl
.claim_append(tmp
);
1833 void CInode::decode_lock_ifile(bufferlist::const_iterator
& p
)
1839 _inode
= allocate_inode(*get_inode());
1841 decode(_inode
->version
, p
);
1844 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
1845 decode(_inode
->mtime
, p
);
1846 decode(_inode
->atime
, p
);
1847 decode(_inode
->time_warp_seq
, p
);
1849 decode(_inode
->layout
, p
);
1850 decode(_inode
->size
, p
);
1851 decode(_inode
->truncate_seq
, p
);
1852 decode(_inode
->truncate_size
, p
);
1853 decode(_inode
->client_ranges
, p
);
1854 decode(_inode
->inline_data
, p
);
1858 decode(replica_dirty
, p
);
1859 if (replica_dirty
) {
1860 dout(10) << __func__
<< " setting filelock dirty flag" << dendl
;
1861 filelock
.mark_dirty(); // ok bc we're auth and caller will handle
1865 frag_info_t dirstat
;
1868 dout(10) << " taking inode dirstat " << dirstat
<< " for " << *this << dendl
;
1869 _inode
->dirstat
= dirstat
; // take inode summation if replica
1873 dout(10) << " ...got " << n
<< " fragstats on " << *this << dendl
;
1877 frag_info_t fragstat
;
1878 frag_info_t accounted_fragstat
;
1881 decode(fragstat
, p
);
1882 decode(accounted_fragstat
, p
);
1883 dout(10) << fg
<< " [" << fgfirst
<< ",head] " << dendl
;
1884 dout(10) << fg
<< " fragstat " << fragstat
<< dendl
;
1885 dout(20) << fg
<< " accounted_fragstat " << accounted_fragstat
<< dendl
;
1887 CDir
*dir
= get_dirfrag(fg
);
1889 ceph_assert(dir
); // i am auth; i had better have this dir open
1890 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1891 << " on " << *dir
<< dendl
;
1892 dir
->first
= fgfirst
;
1893 auto _fnode
= CDir::allocate_fnode(*dir
->get_fnode());
1894 _fnode
->fragstat
= fragstat
;
1895 _fnode
->accounted_fragstat
= accounted_fragstat
;
1896 dir
->reset_fnode(std::move(_fnode
));
1897 if (!(fragstat
== accounted_fragstat
)) {
1898 dout(10) << fg
<< " setting filelock updated flag" << dendl
;
1899 filelock
.mark_dirty(); // ok bc we're auth and caller will handle
1902 if (dir
&& dir
->is_auth()) {
1903 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1904 << " on " << *dir
<< dendl
;
1905 dir
->first
= fgfirst
;
1906 const auto& pf
= dir
->get_projected_fnode();
1907 finish_scatter_update(&filelock
, dir
,
1908 _inode
->dirstat
.version
, pf
->accounted_fragstat
.version
);
1915 reset_inode(std::move(_inode
));
1918 void CInode::encode_lock_inest(bufferlist
& bl
)
1920 ENCODE_START(1, 1, bl
);
1922 encode(get_inode()->version
, bl
);
1924 // treat flushing as dirty when rejoining cache
1925 bool dirty
= nestlock
.is_dirty_or_flushing();
1928 dout(15) << __func__
<< " inode.rstat is " << get_inode()->rstat
<< dendl
;
1929 encode(get_inode()->rstat
, bl
); // only meaningful if i am auth.
1932 for (const auto &p
: dirfrags
) {
1933 frag_t fg
= p
.first
;
1934 CDir
*dir
= p
.second
;
1935 if (is_auth() || dir
->is_auth()) {
1936 const auto& pf
= dir
->get_projected_fnode();
1937 dout(10) << __func__
<< " " << fg
<< " dir " << *dir
<< dendl
;
1938 dout(10) << __func__
<< " " << fg
<< " rstat " << pf
->rstat
<< dendl
;
1939 dout(10) << __func__
<< " " << fg
<< " accounted_rstat " << pf
->rstat
<< dendl
;
1940 dout(10) << __func__
<< " " << fg
<< " dirty_old_rstat " << dir
->dirty_old_rstat
<< dendl
;
1942 encode(dir
->first
, tmp
);
1943 encode(pf
->rstat
, tmp
);
1944 encode(pf
->accounted_rstat
, tmp
);
1945 encode(dir
->dirty_old_rstat
, tmp
);
1950 bl
.claim_append(tmp
);
1954 void CInode::decode_lock_inest(bufferlist::const_iterator
& p
)
1961 decode(replica_dirty
, p
);
1962 if (replica_dirty
) {
1963 dout(10) << __func__
<< " setting nestlock dirty flag" << dendl
;
1964 nestlock
.mark_dirty(); // ok bc we're auth and caller will handle
1967 _inode
= allocate_inode(*get_inode());
1968 decode(_inode
->version
, p
);
1973 dout(10) << __func__
<< " taking inode rstat " << rstat
<< " for " << *this << dendl
;
1974 _inode
->rstat
= rstat
; // take inode summation if replica
1982 nest_info_t accounted_rstat
;
1983 decltype(CDir::dirty_old_rstat
) dirty_old_rstat
;
1987 decode(accounted_rstat
, p
);
1988 decode(dirty_old_rstat
, p
);
1989 dout(10) << __func__
<< " " << fg
<< " [" << fgfirst
<< ",head]" << dendl
;
1990 dout(10) << __func__
<< " " << fg
<< " rstat " << rstat
<< dendl
;
1991 dout(10) << __func__
<< " " << fg
<< " accounted_rstat " << accounted_rstat
<< dendl
;
1992 dout(10) << __func__
<< " " << fg
<< " dirty_old_rstat " << dirty_old_rstat
<< dendl
;
1993 CDir
*dir
= get_dirfrag(fg
);
1995 ceph_assert(dir
); // i am auth; i had better have this dir open
1996 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1997 << " on " << *dir
<< dendl
;
1998 dir
->first
= fgfirst
;
1999 auto _fnode
= CDir::allocate_fnode(*dir
->get_fnode());
2000 _fnode
->rstat
= rstat
;
2001 _fnode
->accounted_rstat
= accounted_rstat
;
2002 dir
->reset_fnode(std::move(_fnode
));
2003 dir
->dirty_old_rstat
.swap(dirty_old_rstat
);
2004 if (!(rstat
== accounted_rstat
) || !dir
->dirty_old_rstat
.empty()) {
2005 dout(10) << fg
<< " setting nestlock updated flag" << dendl
;
2006 nestlock
.mark_dirty(); // ok bc we're auth and caller will handle
2009 if (dir
&& dir
->is_auth()) {
2010 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
2011 << " on " << *dir
<< dendl
;
2012 dir
->first
= fgfirst
;
2013 const auto& pf
= dir
->get_projected_fnode();
2014 finish_scatter_update(&nestlock
, dir
,
2015 _inode
->rstat
.version
, pf
->accounted_rstat
.version
);
2022 reset_inode(std::move(_inode
));
2025 void CInode::encode_lock_ixattr(bufferlist
& bl
)
2027 ENCODE_START(1, 1, bl
);
2028 encode(get_inode()->version
, bl
);
2029 encode(get_inode()->ctime
, bl
);
2034 void CInode::decode_lock_ixattr(bufferlist::const_iterator
& p
)
2036 ceph_assert(!is_auth());
2037 auto _inode
= allocate_inode(*get_inode());
2039 decode(_inode
->version
, p
);
2042 if (_inode
->ctime
< tm
)
2046 reset_inode(std::move(_inode
));
2049 void CInode::encode_lock_isnap(bufferlist
& bl
)
2051 ENCODE_START(1, 1, bl
);
2052 encode(get_inode()->version
, bl
);
2053 encode(get_inode()->ctime
, bl
);
2058 void CInode::decode_lock_isnap(bufferlist::const_iterator
& p
)
2060 ceph_assert(!is_auth());
2061 auto _inode
= allocate_inode(*get_inode());
2063 decode(_inode
->version
, p
);
2066 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
2069 reset_inode(std::move(_inode
));
2072 void CInode::encode_lock_iflock(bufferlist
& bl
)
2074 ENCODE_START(1, 1, bl
);
2075 encode(get_inode()->version
, bl
);
2076 _encode_file_locks(bl
);
2080 void CInode::decode_lock_iflock(bufferlist::const_iterator
& p
)
2082 ceph_assert(!is_auth());
2083 auto _inode
= allocate_inode(*get_inode());
2085 decode(_inode
->version
, p
);
2086 _decode_file_locks(p
);
2088 reset_inode(std::move(_inode
));
2091 void CInode::encode_lock_ipolicy(bufferlist
& bl
)
2093 ENCODE_START(2, 1, bl
);
2095 encode(get_inode()->version
, bl
);
2096 encode(get_inode()->ctime
, bl
);
2097 encode(get_inode()->layout
, bl
, mdcache
->mds
->mdsmap
->get_up_features());
2098 encode(get_inode()->quota
, bl
);
2099 encode(get_inode()->export_pin
, bl
);
2100 encode(get_inode()->export_ephemeral_distributed_pin
, bl
);
2101 encode(get_inode()->export_ephemeral_random_pin
, bl
);
2106 void CInode::decode_lock_ipolicy(bufferlist::const_iterator
& p
)
2108 ceph_assert(!is_auth());
2109 auto _inode
= allocate_inode(*get_inode());
2112 decode(_inode
->version
, p
);
2115 if (_inode
->ctime
< tm
)
2117 decode(_inode
->layout
, p
);
2118 decode(_inode
->quota
, p
);
2119 decode(_inode
->export_pin
, p
);
2120 if (struct_v
>= 2) {
2121 decode(_inode
->export_ephemeral_distributed_pin
, p
);
2122 decode(_inode
->export_ephemeral_random_pin
, p
);
2127 bool pin_updated
= (get_inode()->export_pin
!= _inode
->export_pin
) ||
2128 (get_inode()->export_ephemeral_distributed_pin
!=
2129 _inode
->export_ephemeral_distributed_pin
);
2130 reset_inode(std::move(_inode
));
2131 maybe_export_pin(pin_updated
);
2134 void CInode::encode_lock_state(int type
, bufferlist
& bl
)
2136 ENCODE_START(1, 1, bl
);
2139 encode(parent
->first
, bl
);
2142 case CEPH_LOCK_IAUTH
:
2143 encode_lock_iauth(bl
);
2146 case CEPH_LOCK_ILINK
:
2147 encode_lock_ilink(bl
);
2150 case CEPH_LOCK_IDFT
:
2151 encode_lock_idft(bl
);
2154 case CEPH_LOCK_IFILE
:
2155 encode_lock_ifile(bl
);
2158 case CEPH_LOCK_INEST
:
2159 encode_lock_inest(bl
);
2162 case CEPH_LOCK_IXATTR
:
2163 encode_lock_ixattr(bl
);
2166 case CEPH_LOCK_ISNAP
:
2167 encode_lock_isnap(bl
);
2170 case CEPH_LOCK_IFLOCK
:
2171 encode_lock_iflock(bl
);
2174 case CEPH_LOCK_IPOLICY
:
2175 encode_lock_ipolicy(bl
);
2184 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2186 void CInode::decode_lock_state(int type
, const bufferlist
& bl
)
2188 auto p
= bl
.cbegin();
2195 decode(newfirst
, p
);
2196 if (!is_auth() && newfirst
!= first
) {
2197 dout(10) << __func__
<< " first " << first
<< " -> " << newfirst
<< dendl
;
2201 decode(newfirst
, p
);
2202 if (!parent
->is_auth() && newfirst
!= parent
->first
) {
2203 dout(10) << __func__
<< " parent first " << first
<< " -> " << newfirst
<< dendl
;
2204 parent
->first
= newfirst
;
2209 case CEPH_LOCK_IAUTH
:
2210 decode_lock_iauth(p
);
2213 case CEPH_LOCK_ILINK
:
2214 decode_lock_ilink(p
);
2217 case CEPH_LOCK_IDFT
:
2218 decode_lock_idft(p
);
2221 case CEPH_LOCK_IFILE
:
2222 decode_lock_ifile(p
);
2225 case CEPH_LOCK_INEST
:
2226 decode_lock_inest(p
);
2229 case CEPH_LOCK_IXATTR
:
2230 decode_lock_ixattr(p
);
2233 case CEPH_LOCK_ISNAP
:
2234 decode_lock_isnap(p
);
2237 case CEPH_LOCK_IFLOCK
:
2238 decode_lock_iflock(p
);
2241 case CEPH_LOCK_IPOLICY
:
2242 decode_lock_ipolicy(p
);
2252 bool CInode::is_dirty_scattered()
2255 filelock
.is_dirty_or_flushing() ||
2256 nestlock
.is_dirty_or_flushing() ||
2257 dirfragtreelock
.is_dirty_or_flushing();
2260 void CInode::clear_scatter_dirty()
2262 filelock
.remove_dirty();
2263 nestlock
.remove_dirty();
2264 dirfragtreelock
.remove_dirty();
2267 void CInode::clear_dirty_scattered(int type
)
2269 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2270 ceph_assert(is_dir());
2272 case CEPH_LOCK_IFILE
:
2273 item_dirty_dirfrag_dir
.remove_myself();
2276 case CEPH_LOCK_INEST
:
2277 item_dirty_dirfrag_nest
.remove_myself();
2280 case CEPH_LOCK_IDFT
:
2281 item_dirty_dirfrag_dirfragtree
.remove_myself();
2291 * when we initially scatter a lock, we need to check if any of the dirfrags
2292 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2294 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2295 void CInode::start_scatter(ScatterLock
*lock
)
2297 dout(10) << __func__
<< " " << *lock
<< " on " << *this << dendl
;
2298 ceph_assert(is_auth());
2299 const auto& pi
= get_projected_inode();
2301 for (const auto &p
: dirfrags
) {
2302 frag_t fg
= p
.first
;
2303 CDir
*dir
= p
.second
;
2304 const auto& pf
= dir
->get_projected_fnode();
2305 dout(20) << fg
<< " " << *dir
<< dendl
;
2307 if (!dir
->is_auth())
2310 switch (lock
->get_type()) {
2311 case CEPH_LOCK_IFILE
:
2312 finish_scatter_update(lock
, dir
, pi
->dirstat
.version
, pf
->accounted_fragstat
.version
);
2315 case CEPH_LOCK_INEST
:
2316 finish_scatter_update(lock
, dir
, pi
->rstat
.version
, pf
->accounted_rstat
.version
);
2319 case CEPH_LOCK_IDFT
:
2320 dir
->state_clear(CDir::STATE_DIRTYDFT
);
2327 class C_Inode_FragUpdate
: public MDSLogContextBase
{
2332 MDSRank
*get_mds() override
{return in
->mdcache
->mds
;}
2333 void finish(int r
) override
{
2334 in
->_finish_frag_update(dir
, mut
);
2338 C_Inode_FragUpdate(CInode
*i
, CDir
*d
, MutationRef
& m
) : in(i
), dir(d
), mut(m
) {}
2341 void CInode::finish_scatter_update(ScatterLock
*lock
, CDir
*dir
,
2342 version_t inode_version
, version_t dir_accounted_version
)
2344 frag_t fg
= dir
->get_frag();
2345 ceph_assert(dir
->is_auth());
2347 if (dir
->is_frozen()) {
2348 dout(10) << __func__
<< " " << fg
<< " frozen, marking " << *lock
<< " stale " << *dir
<< dendl
;
2349 } else if (dir
->get_version() == 0) {
2350 dout(10) << __func__
<< " " << fg
<< " not loaded, marking " << *lock
<< " stale " << *dir
<< dendl
;
2352 if (dir_accounted_version
!= inode_version
) {
2353 dout(10) << __func__
<< " " << fg
<< " journaling accounted scatterstat update v" << inode_version
<< dendl
;
2355 MDLog
*mdlog
= mdcache
->mds
->mdlog
;
2356 MutationRef
mut(new MutationImpl());
2357 mut
->ls
= mdlog
->get_current_segment();
2359 auto pf
= dir
->project_fnode(mut
);
2361 std::string_view ename
;
2362 switch (lock
->get_type()) {
2363 case CEPH_LOCK_IFILE
:
2364 pf
->fragstat
.version
= inode_version
;
2365 pf
->accounted_fragstat
= pf
->fragstat
;
2366 ename
= "lock ifile accounted scatter stat update";
2368 case CEPH_LOCK_INEST
:
2369 pf
->rstat
.version
= inode_version
;
2370 pf
->accounted_rstat
= pf
->rstat
;
2371 ename
= "lock inest accounted scatter stat update";
2373 if (!is_auth() && lock
->get_state() == LOCK_MIX
) {
2374 dout(10) << __func__
<< " try to assimilate dirty rstat on "
2376 dir
->assimilate_dirty_rstat_inodes(mut
);
2384 EUpdate
*le
= new EUpdate(mdlog
, ename
);
2385 mdlog
->start_entry(le
);
2386 le
->metablob
.add_dir_context(dir
);
2387 le
->metablob
.add_dir(dir
, true);
2389 ceph_assert(!dir
->is_frozen());
2392 if (lock
->get_type() == CEPH_LOCK_INEST
&&
2393 !is_auth() && lock
->get_state() == LOCK_MIX
) {
2394 dout(10) << __func__
<< " finish assimilating dirty rstat on "
2396 dir
->assimilate_dirty_rstat_inodes_finish(&le
->metablob
);
2398 if (!(pf
->rstat
== pf
->accounted_rstat
)) {
2399 if (!mut
->is_wrlocked(&nestlock
)) {
2400 mdcache
->mds
->locker
->wrlock_force(&nestlock
, mut
);
2403 mdcache
->mds
->locker
->mark_updated_scatterlock(&nestlock
);
2404 mut
->ls
->dirty_dirfrag_nest
.push_back(&item_dirty_dirfrag_nest
);
2408 pf
->version
= dir
->pre_dirty();
2410 mdlog
->submit_entry(le
, new C_Inode_FragUpdate(this, dir
, mut
));
2412 dout(10) << __func__
<< " " << fg
<< " accounted " << *lock
2413 << " scatter stat unchanged at v" << dir_accounted_version
<< dendl
;
2418 void CInode::_finish_frag_update(CDir
*dir
, MutationRef
& mut
)
2420 dout(10) << __func__
<< " on " << *dir
<< dendl
;
2422 mdcache
->mds
->locker
->drop_locks(mut
.get());
2428 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2429 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2430 * because the frag is auth and frozen, or that the replica couldn't for the same
2431 * reason. hopefully it will get updated the next time the lock cycles.
2433 * we have two dimensions of behavior:
2434 * - we may be (auth and !frozen), and able to update, or not.
2435 * - the frag may be stale, or not.
2437 * if the frag is non-stale, we want to assimilate the diff into the
2438 * inode, regardless of whether it's auth or updateable.
2440 * if we update the frag, we want to set accounted_fragstat = frag,
2441 * both if we took the diff or it was stale and we are making it
2444 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2445 void CInode::finish_scatter_gather_update(int type
, MutationRef
& mut
)
2447 LogChannelRef clog
= mdcache
->mds
->clog
;
2449 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2450 ceph_assert(is_auth());
2453 case CEPH_LOCK_IFILE
:
2455 fragtree_t tmpdft
= dirfragtree
;
2456 struct frag_info_t dirstat
;
2457 bool dirstat_valid
= true;
2460 ceph_assert(is_auth());
2461 auto pi
= _get_projected_inode();
2463 bool touched_mtime
= false, touched_chattr
= false;
2464 dout(20) << " orig dirstat " << pi
->dirstat
<< dendl
;
2465 pi
->dirstat
.version
++;
2466 for (const auto &p
: dirfrags
) {
2467 frag_t fg
= p
.first
;
2468 CDir
*dir
= p
.second
;
2469 dout(20) << fg
<< " " << *dir
<< dendl
;
2472 if (dir
->get_version() != 0) {
2473 update
= dir
->is_auth() && !dir
->is_frozen();
2476 dirstat_valid
= false;
2479 CDir::fnode_const_ptr pf
;
2482 pf
= dir
->project_fnode(mut
);
2484 pf
= dir
->get_projected_fnode();
2487 if (pf
->accounted_fragstat
.version
== pi
->dirstat
.version
- 1) {
2488 dout(20) << fg
<< " fragstat " << pf
->fragstat
<< dendl
;
2489 dout(20) << fg
<< " accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
2490 pi
->dirstat
.add_delta(pf
->fragstat
, pf
->accounted_fragstat
, &touched_mtime
, &touched_chattr
);
2492 dout(20) << fg
<< " skipping STALE accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
2495 if (pf
->fragstat
.nfiles
< 0 ||
2496 pf
->fragstat
.nsubdirs
< 0) {
2497 clog
->error() << "bad/negative dir size on "
2498 << dir
->dirfrag() << " " << pf
->fragstat
;
2499 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter
);
2501 auto _pf
= const_cast<fnode_t
*>(pf
.get());
2502 if (pf
->fragstat
.nfiles
< 0)
2503 _pf
->fragstat
.nfiles
= 0;
2504 if (pf
->fragstat
.nsubdirs
< 0)
2505 _pf
->fragstat
.nsubdirs
= 0;
2509 auto _pf
= const_cast<fnode_t
*>(pf
.get());
2510 _pf
->accounted_fragstat
= _pf
->fragstat
;
2511 _pf
->fragstat
.version
= _pf
->accounted_fragstat
.version
= pi
->dirstat
.version
;
2512 _pf
->version
= dir
->pre_dirty();
2513 dout(10) << fg
<< " updated accounted_fragstat " << pf
->fragstat
<< " on " << *dir
<< dendl
;
2516 tmpdft
.force_to_leaf(g_ceph_context
, fg
);
2517 dirstat
.add(pf
->fragstat
);
2520 pi
->mtime
= pi
->ctime
= pi
->dirstat
.mtime
;
2522 pi
->change_attr
= pi
->dirstat
.change_attr
;
2523 dout(20) << " final dirstat " << pi
->dirstat
<< dendl
;
2525 if (dirstat_valid
&& !dirstat
.same_sums(pi
->dirstat
)) {
2527 tmpdft
.get_leaves_under(frag_t(), leaves
);
2528 for (const auto& leaf
: leaves
) {
2529 if (!dirfrags
.count(leaf
)) {
2530 dirstat_valid
= false;
2534 if (dirstat_valid
) {
2535 if (state_test(CInode::STATE_REPAIRSTATS
)) {
2536 dout(20) << " dirstat mismatch, fixing" << dendl
;
2538 clog
->error() << "unmatched fragstat on " << ino() << ", inode has "
2539 << pi
->dirstat
<< ", dirfrags have " << dirstat
;
2540 ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter
);
2542 // trust the dirfrags for now
2543 version_t v
= pi
->dirstat
.version
;
2544 if (pi
->dirstat
.mtime
> dirstat
.mtime
)
2545 dirstat
.mtime
= pi
->dirstat
.mtime
;
2546 if (pi
->dirstat
.change_attr
> dirstat
.change_attr
)
2547 dirstat
.change_attr
= pi
->dirstat
.change_attr
;
2548 pi
->dirstat
= dirstat
;
2549 pi
->dirstat
.version
= v
;
2553 if (pi
->dirstat
.nfiles
< 0 || pi
->dirstat
.nsubdirs
< 0) {
2555 make_path_string(path
);
2556 clog
->error() << "Inconsistent statistics detected: fragstat on inode "
2557 << ino() << " (" << path
<< "), inode has " << pi
->dirstat
;
2558 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter
);
2560 if (pi
->dirstat
.nfiles
< 0)
2561 pi
->dirstat
.nfiles
= 0;
2562 if (pi
->dirstat
.nsubdirs
< 0)
2563 pi
->dirstat
.nsubdirs
= 0;
2568 case CEPH_LOCK_INEST
:
2571 ceph_assert(is_auth());
2573 fragtree_t tmpdft
= dirfragtree
;
2575 bool rstat_valid
= true;
2578 if (const sr_t
*srnode
= get_projected_srnode(); srnode
)
2579 rstat
.rsnaps
= srnode
->snaps
.size();
2581 auto pi
= _get_projected_inode();
2582 dout(20) << " orig rstat " << pi
->rstat
<< dendl
;
2583 pi
->rstat
.version
++;
2584 for (const auto &p
: dirfrags
) {
2585 frag_t fg
= p
.first
;
2586 CDir
*dir
= p
.second
;
2587 dout(20) << fg
<< " " << *dir
<< dendl
;
2590 if (dir
->get_version() != 0) {
2591 update
= dir
->is_auth() && !dir
->is_frozen();
2594 rstat_valid
= false;
2597 CDir::fnode_const_ptr pf
;
2600 pf
= dir
->project_fnode(mut
);
2602 pf
= dir
->get_projected_fnode();
2605 if (pf
->accounted_rstat
.version
== pi
->rstat
.version
-1) {
2606 // only pull this frag's dirty rstat inodes into the frag if
2607 // the frag is non-stale and updateable. if it's stale,
2608 // that info will just get thrown out!
2610 dir
->assimilate_dirty_rstat_inodes(mut
);
2612 dout(20) << fg
<< " rstat " << pf
->rstat
<< dendl
;
2613 dout(20) << fg
<< " accounted_rstat " << pf
->accounted_rstat
<< dendl
;
2614 dout(20) << fg
<< " dirty_old_rstat " << dir
->dirty_old_rstat
<< dendl
;
2615 mdcache
->project_rstat_frag_to_inode(pf
->rstat
, pf
->accounted_rstat
,
2616 dir
->first
, CEPH_NOSNAP
, this, true);
2617 for (auto &p
: dir
->dirty_old_rstat
) {
2618 mdcache
->project_rstat_frag_to_inode(p
.second
.rstat
, p
.second
.accounted_rstat
,
2619 p
.second
.first
, p
.first
, this, true);
2621 if (update
) // dir contents not valid if frozen or non-auth
2622 dir
->check_rstats();
2624 dout(20) << fg
<< " skipping STALE accounted_rstat " << pf
->accounted_rstat
<< dendl
;
2627 auto _pf
= const_cast<fnode_t
*>(pf
.get());
2628 _pf
->accounted_rstat
= pf
->rstat
;
2629 _pf
->rstat
.version
= _pf
->accounted_rstat
.version
= pi
->rstat
.version
;
2630 _pf
->version
= dir
->pre_dirty();
2631 dir
->dirty_old_rstat
.clear();
2632 dir
->check_rstats();
2633 dout(10) << fg
<< " updated accounted_rstat " << pf
->rstat
<< " on " << *dir
<< dendl
;
2636 tmpdft
.force_to_leaf(g_ceph_context
, fg
);
2637 rstat
.add(pf
->rstat
);
2639 dout(20) << " final rstat " << pi
->rstat
<< dendl
;
2641 if (rstat_valid
&& !rstat
.same_sums(pi
->rstat
)) {
2643 tmpdft
.get_leaves_under(frag_t(), leaves
);
2644 for (const auto& leaf
: leaves
) {
2645 if (!dirfrags
.count(leaf
)) {
2646 rstat_valid
= false;
2651 if (state_test(CInode::STATE_REPAIRSTATS
)) {
2652 dout(20) << " rstat mismatch, fixing" << dendl
;
2654 clog
->error() << "inconsistent rstat on inode " << ino()
2655 << ", inode has " << pi
->rstat
2656 << ", directory fragments have " << rstat
;
2657 ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter
);
2659 // trust the dirfrag for now
2660 version_t v
= pi
->rstat
.version
;
2661 if (pi
->rstat
.rctime
> rstat
.rctime
)
2662 rstat
.rctime
= pi
->rstat
.rctime
;
2664 pi
->rstat
.version
= v
;
2668 mdcache
->broadcast_quota_to_client(this);
2672 case CEPH_LOCK_IDFT
:
2680 void CInode::finish_scatter_gather_update_accounted(int type
, EMetaBlob
*metablob
)
2682 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2683 ceph_assert(is_auth());
2685 for (const auto &p
: dirfrags
) {
2686 CDir
*dir
= p
.second
;
2687 if (!dir
->is_auth() || dir
->get_version() == 0 || dir
->is_frozen())
2690 if (type
== CEPH_LOCK_IDFT
)
2691 continue; // nothing to do.
2693 if (type
== CEPH_LOCK_INEST
)
2694 dir
->assimilate_dirty_rstat_inodes_finish(metablob
);
2696 dout(10) << " journaling updated frag accounted_ on " << *dir
<< dendl
;
2697 ceph_assert(dir
->is_projected());
2698 metablob
->add_dir(dir
, true);
2704 bool CInode::is_frozen() const
2706 if (is_frozen_inode()) return true;
2707 if (parent
&& parent
->dir
->is_frozen()) return true;
2711 bool CInode::is_frozen_dir() const
2713 if (parent
&& parent
->dir
->is_frozen_dir()) return true;
2717 bool CInode::is_freezing() const
2719 if (is_freezing_inode()) return true;
2720 if (parent
&& parent
->dir
->is_freezing()) return true;
2724 void CInode::add_dir_waiter(frag_t fg
, MDSContext
*c
)
2726 if (waiting_on_dir
.empty())
2728 waiting_on_dir
[fg
].push_back(c
);
2729 dout(10) << __func__
<< " frag " << fg
<< " " << c
<< " on " << *this << dendl
;
2732 void CInode::take_dir_waiting(frag_t fg
, MDSContext::vec
& ls
)
2734 if (waiting_on_dir
.empty())
2737 auto it
= waiting_on_dir
.find(fg
);
2738 if (it
!= waiting_on_dir
.end()) {
2739 dout(10) << __func__
<< " frag " << fg
<< " on " << *this << dendl
;
2740 auto& waiting
= it
->second
;
2741 ls
.insert(ls
.end(), waiting
.begin(), waiting
.end());
2742 waiting_on_dir
.erase(it
);
2744 if (waiting_on_dir
.empty())
2749 void CInode::add_waiter(uint64_t tag
, MDSContext
*c
)
2751 dout(10) << __func__
<< " tag " << std::hex
<< tag
<< std::dec
<< " " << c
2752 << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH
)
2753 << " !frozen " << !is_frozen_inode()
2754 << " !freezing " << !is_freezing_inode()
2756 // wait on the directory?
2757 // make sure its not the inode that is explicitly ambiguous|freezing|frozen
2758 if (((tag
& WAIT_SINGLEAUTH
) && !state_test(STATE_AMBIGUOUSAUTH
)) ||
2759 ((tag
& WAIT_UNFREEZE
) &&
2760 !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
2761 dout(15) << "passing waiter up tree" << dendl
;
2762 parent
->dir
->add_waiter(tag
, c
);
2765 dout(15) << "taking waiter here" << dendl
;
2766 MDSCacheObject::add_waiter(tag
, c
);
2769 void CInode::take_waiting(uint64_t mask
, MDSContext::vec
& ls
)
2771 if ((mask
& WAIT_DIR
) && !waiting_on_dir
.empty()) {
2772 // take all dentry waiters
2773 while (!waiting_on_dir
.empty()) {
2774 auto it
= waiting_on_dir
.begin();
2775 dout(10) << __func__
<< " dirfrag " << it
->first
<< " on " << *this << dendl
;
2776 auto& waiting
= it
->second
;
2777 ls
.insert(ls
.end(), waiting
.begin(), waiting
.end());
2778 waiting_on_dir
.erase(it
);
2784 MDSCacheObject::take_waiting(mask
, ls
);
2787 void CInode::maybe_finish_freeze_inode()
2789 CDir
*dir
= get_parent_dir();
2790 if (auth_pins
> auth_pin_freeze_allowance
|| dir
->frozen_inode_suppressed
)
2793 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl
;
2794 ceph_assert(auth_pins
== auth_pin_freeze_allowance
);
2797 state_clear(STATE_FREEZING
);
2798 state_set(STATE_FROZEN
);
2800 item_freezing_inode
.remove_myself();
2801 dir
->num_frozen_inodes
++;
2803 finish_waiting(WAIT_FROZEN
);
2806 bool CInode::freeze_inode(int auth_pin_allowance
)
2808 CDir
*dir
= get_parent_dir();
2811 ceph_assert(auth_pin_allowance
> 0); // otherwise we need to adjust parent's nested_auth_pins
2812 ceph_assert(auth_pins
>= auth_pin_allowance
);
2813 if (auth_pins
== auth_pin_allowance
&& !dir
->frozen_inode_suppressed
) {
2814 dout(10) << "freeze_inode - frozen" << dendl
;
2815 if (!state_test(STATE_FROZEN
)) {
2817 state_set(STATE_FROZEN
);
2818 dir
->num_frozen_inodes
++;
2823 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance
<< dendl
;
2824 auth_pin_freeze_allowance
= auth_pin_allowance
;
2825 dir
->freezing_inodes
.push_back(&item_freezing_inode
);
2828 state_set(STATE_FREEZING
);
2830 if (!dir
->lock_caches_with_auth_pins
.empty())
2831 mdcache
->mds
->locker
->invalidate_lock_caches(dir
);
2833 const static int lock_types
[] = {
2834 CEPH_LOCK_IVERSION
, CEPH_LOCK_IFILE
, CEPH_LOCK_IAUTH
, CEPH_LOCK_ILINK
, CEPH_LOCK_IDFT
,
2835 CEPH_LOCK_IXATTR
, CEPH_LOCK_ISNAP
, CEPH_LOCK_INEST
, CEPH_LOCK_IFLOCK
, CEPH_LOCK_IPOLICY
, 0
2837 for (int i
= 0; lock_types
[i
]; ++i
) {
2838 auto lock
= get_lock(lock_types
[i
]);
2839 if (lock
->is_cached())
2840 mdcache
->mds
->locker
->invalidate_lock_caches(lock
);
2842 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2843 // and finish freezing the inode
2844 return state_test(STATE_FROZEN
);
2847 void CInode::unfreeze_inode(MDSContext::vec
& finished
)
2849 dout(10) << __func__
<< dendl
;
2850 if (state_test(STATE_FREEZING
)) {
2851 state_clear(STATE_FREEZING
);
2853 item_freezing_inode
.remove_myself();
2854 } else if (state_test(STATE_FROZEN
)) {
2855 state_clear(STATE_FROZEN
);
2857 get_parent_dir()->num_frozen_inodes
--;
2860 take_waiting(WAIT_UNFREEZE
, finished
);
2863 void CInode::unfreeze_inode()
2865 MDSContext::vec finished
;
2866 unfreeze_inode(finished
);
2867 mdcache
->mds
->queue_waiters(finished
);
2870 void CInode::freeze_auth_pin()
2872 ceph_assert(state_test(CInode::STATE_FROZEN
));
2873 state_set(CInode::STATE_FROZENAUTHPIN
);
2874 get_parent_dir()->num_frozen_inodes
++;
2877 void CInode::unfreeze_auth_pin()
2879 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN
));
2880 state_clear(CInode::STATE_FROZENAUTHPIN
);
2881 get_parent_dir()->num_frozen_inodes
--;
2882 if (!state_test(STATE_FREEZING
|STATE_FROZEN
)) {
2883 MDSContext::vec finished
;
2884 take_waiting(WAIT_UNFREEZE
, finished
);
2885 mdcache
->mds
->queue_waiters(finished
);
2889 void CInode::clear_ambiguous_auth(MDSContext::vec
& finished
)
2891 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH
));
2892 state_clear(CInode::STATE_AMBIGUOUSAUTH
);
2893 take_waiting(CInode::WAIT_SINGLEAUTH
, finished
);
2896 void CInode::clear_ambiguous_auth()
2898 MDSContext::vec finished
;
2899 clear_ambiguous_auth(finished
);
2900 mdcache
->mds
->queue_waiters(finished
);
2904 bool CInode::can_auth_pin(int *err_ret
) const {
2908 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2909 err
= ERR_EXPORTING_INODE
;
2912 return parent
->can_auth_pin(err_ret
);
2920 void CInode::auth_pin(void *by
)
2926 #ifdef MDS_AUTHPIN_SET
2927 auth_pin_set
.insert(by
);
2930 dout(10) << "auth_pin by " << by
<< " on " << *this << " now " << auth_pins
<< dendl
;
2933 parent
->adjust_nested_auth_pins(1, this);
2936 void CInode::auth_unpin(void *by
)
2940 #ifdef MDS_AUTHPIN_SET
2942 auto it
= auth_pin_set
.find(by
);
2943 ceph_assert(it
!= auth_pin_set
.end());
2944 auth_pin_set
.erase(it
);
2951 dout(10) << "auth_unpin by " << by
<< " on " << *this << " now " << auth_pins
<< dendl
;
2953 ceph_assert(auth_pins
>= 0);
2956 parent
->adjust_nested_auth_pins(-1, by
);
2958 if (is_freezing_inode())
2959 maybe_finish_freeze_inode();
2964 mds_authority_t
CInode::authority() const
2966 if (inode_auth
.first
>= 0)
2970 return parent
->dir
->authority();
2972 // new items that are not yet linked in (in the committed plane) belong
2973 // to their first parent.
2974 if (!projected_parent
.empty())
2975 return projected_parent
.front()->dir
->authority();
2977 return CDIR_AUTH_UNDEF
;
2983 snapid_t
CInode::get_oldest_snap()
2986 if (is_any_old_inodes())
2987 t
= get_old_inodes()->begin()->second
.first
;
2988 return std::min(t
, oldest_snap
);
2991 const CInode::mempool_old_inode
& CInode::cow_old_inode(snapid_t follows
, bool cow_head
)
2993 ceph_assert(follows
>= first
);
2995 const auto& pi
= cow_head
? get_projected_inode() : get_previous_projected_inode();
2996 const auto& px
= cow_head
? get_projected_xattrs() : get_previous_projected_xattrs();
2998 auto _old_inodes
= allocate_old_inode_map();
3000 *_old_inodes
= *old_inodes
;
3002 mempool_old_inode
&old
= (*_old_inodes
)[follows
];
3006 dout(10) << " " << px
->size() << " xattrs cowed, " << *px
<< dendl
;
3010 if (first
< oldest_snap
)
3011 oldest_snap
= first
;
3013 old
.inode
.trim_client_ranges(follows
);
3015 if (g_conf()->mds_snap_rstat
&&
3016 !(old
.inode
.rstat
== old
.inode
.accounted_rstat
))
3017 dirty_old_rstats
.insert(follows
);
3021 dout(10) << __func__
<< " " << (cow_head
? "head" : "previous_head" )
3022 << " to [" << old
.first
<< "," << follows
<< "] on "
3025 reset_old_inodes(std::move(_old_inodes
));
3029 void CInode::pre_cow_old_inode()
3031 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
3032 if (first
<= follows
)
3033 cow_old_inode(follows
, true);
3036 bool CInode::has_snap_data(snapid_t snapid
)
3038 bool found
= snapid
>= first
&& snapid
<= last
;
3039 if (!found
&& is_any_old_inodes()) {
3040 auto p
= old_inodes
->lower_bound(snapid
);
3041 if (p
!= old_inodes
->end()) {
3042 if (p
->second
.first
> snapid
) {
3043 if (p
!= old_inodes
->begin())
3046 if (p
->second
.first
<= snapid
&& snapid
<= p
->first
) {
3054 void CInode::purge_stale_snap_data(const set
<snapid_t
>& snaps
)
3056 dout(10) << __func__
<< " " << snaps
<< dendl
;
3058 if (!get_old_inodes())
3061 std::vector
<snapid_t
> to_remove
;
3062 for (auto p
: *get_old_inodes()) {
3063 const snapid_t
&id
= p
.first
;
3064 const auto &s
= snaps
.lower_bound(p
.second
.first
);
3065 if (s
== snaps
.end() || *s
> id
) {
3066 dout(10) << " purging old_inode [" << p
.second
.first
<< "," << id
<< "]" << dendl
;
3067 to_remove
.push_back(id
);
3071 if (to_remove
.size() == get_old_inodes()->size()) {
3072 reset_old_inodes(old_inode_map_ptr());
3073 } else if (!to_remove
.empty()) {
3074 auto _old_inodes
= allocate_old_inode_map(*get_old_inodes());
3075 for (auto id
: to_remove
)
3076 _old_inodes
->erase(id
);
3077 reset_old_inodes(std::move(_old_inodes
));
3082 * pick/create an old_inode
3084 snapid_t
CInode::pick_old_inode(snapid_t snap
) const
3086 if (is_any_old_inodes()) {
3087 auto it
= old_inodes
->lower_bound(snap
); // p is first key >= to snap
3088 if (it
!= old_inodes
->end() && it
->second
.first
<= snap
) {
3089 dout(10) << __func__
<< " snap " << snap
<< " -> [" << it
->second
.first
<< "," << it
->first
<< "]" << dendl
;
3093 dout(10) << __func__
<< " snap " << snap
<< " -> nothing" << dendl
;
3097 void CInode::open_snaprealm(bool nosplit
)
3100 SnapRealm
*parent
= find_snaprealm();
3101 snaprealm
= new SnapRealm(mdcache
, this);
3103 dout(10) << __func__
<< " " << snaprealm
3104 << " parent is " << parent
3106 dout(30) << " siblings are " << parent
->open_children
<< dendl
;
3107 snaprealm
->parent
= parent
;
3109 parent
->split_at(snaprealm
);
3110 parent
->open_children
.insert(snaprealm
);
3114 void CInode::close_snaprealm(bool nojoin
)
3117 dout(15) << __func__
<< " " << *snaprealm
<< dendl
;
3118 if (snaprealm
->parent
) {
3119 snaprealm
->parent
->open_children
.erase(snaprealm
);
3121 //snaprealm->parent->join(snaprealm);
3128 SnapRealm
*CInode::find_snaprealm() const
3130 const CInode
*cur
= this;
3131 while (!cur
->snaprealm
) {
3132 const CDentry
*pdn
= cur
->get_oldest_parent_dn();
3135 cur
= pdn
->get_dir()->get_inode();
3137 return cur
->snaprealm
;
3140 void CInode::encode_snap_blob(bufferlist
&snapbl
)
3144 encode(snaprealm
->srnode
, snapbl
);
3145 dout(20) << __func__
<< " " << *snaprealm
<< dendl
;
3148 void CInode::decode_snap_blob(const bufferlist
& snapbl
)
3151 if (snapbl
.length()) {
3153 auto old_flags
= snaprealm
->srnode
.flags
;
3154 auto p
= snapbl
.cbegin();
3155 decode(snaprealm
->srnode
, p
);
3157 if ((snaprealm
->srnode
.flags
^ old_flags
) & sr_t::PARENT_GLOBAL
) {
3158 snaprealm
->adjust_parent();
3161 dout(20) << __func__
<< " " << *snaprealm
<< dendl
;
3162 } else if (snaprealm
&&
3163 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
3164 ceph_assert(mdcache
->mds
->is_any_replay());
3165 snaprealm
->merge_to(NULL
);
3169 void CInode::encode_snap(bufferlist
& bl
)
3171 ENCODE_START(1, 1, bl
);
3173 encode_snap_blob(snapbl
);
3175 encode(oldest_snap
, bl
);
3179 void CInode::decode_snap(bufferlist::const_iterator
& p
)
3184 decode(oldest_snap
, p
);
3185 decode_snap_blob(snapbl
);
3189 // =============================================
3191 client_t
CInode::calc_ideal_loner()
3193 if (mdcache
->is_readonly())
3195 if (!get_mds_caps_wanted().empty())
3199 client_t loner
= -1;
3200 for (const auto &p
: client_caps
) {
3201 if (!p
.second
.is_stale() &&
3203 !has_subtree_or_exporting_dirfrag() :
3204 (p
.second
.wanted() & (CEPH_CAP_ANY_WR
|CEPH_CAP_FILE_RD
)))) {
3214 bool CInode::choose_ideal_loner()
3216 want_loner_cap
= calc_ideal_loner();
3217 int changed
= false;
3218 if (loner_cap
>= 0 && loner_cap
!= want_loner_cap
) {
3219 if (!try_drop_loner())
3224 if (want_loner_cap
>= 0) {
3225 if (loner_cap
< 0) {
3226 set_loner_cap(want_loner_cap
);
3229 ceph_assert(loner_cap
== want_loner_cap
);
3234 bool CInode::try_set_loner()
3236 ceph_assert(want_loner_cap
>= 0);
3237 if (loner_cap
>= 0 && loner_cap
!= want_loner_cap
)
3239 set_loner_cap(want_loner_cap
);
3243 void CInode::set_loner_cap(client_t l
)
3246 authlock
.set_excl_client(loner_cap
);
3247 filelock
.set_excl_client(loner_cap
);
3248 linklock
.set_excl_client(loner_cap
);
3249 xattrlock
.set_excl_client(loner_cap
);
3252 bool CInode::try_drop_loner()
3257 int other_allowed
= get_caps_allowed_by_type(CAP_ANY
);
3258 Capability
*cap
= get_client_cap(loner_cap
);
3260 (cap
->issued() & ~other_allowed
) == 0) {
3268 // choose new lock state during recovery, based on issued caps
3269 void CInode::choose_lock_state(SimpleLock
*lock
, int allissued
)
3271 int shift
= lock
->get_cap_shift();
3272 int issued
= (allissued
>> shift
) & lock
->get_cap_mask();
3274 if (lock
->is_xlocked()) {
3276 } else if (lock
->get_state() != LOCK_MIX
) {
3277 if (issued
& (CEPH_CAP_GEXCL
| CEPH_CAP_GBUFFER
))
3278 lock
->set_state(LOCK_EXCL
);
3279 else if (issued
& CEPH_CAP_GWR
) {
3280 if (issued
& (CEPH_CAP_GCACHE
| CEPH_CAP_GSHARED
))
3281 lock
->set_state(LOCK_EXCL
);
3283 lock
->set_state(LOCK_MIX
);
3284 } else if (lock
->is_dirty()) {
3285 if (is_replicated())
3286 lock
->set_state(LOCK_MIX
);
3288 lock
->set_state(LOCK_LOCK
);
3290 lock
->set_state(LOCK_SYNC
);
3293 // our states have already been chosen during rejoin.
3294 if (lock
->is_xlocked())
3295 ceph_assert(lock
->get_state() == LOCK_LOCK
);
3299 void CInode::choose_lock_states(int dirty_caps
)
3301 int issued
= get_caps_issued() | dirty_caps
;
3302 if (is_auth() && (issued
& (CEPH_CAP_ANY_EXCL
|CEPH_CAP_ANY_WR
)))
3303 choose_ideal_loner();
3304 choose_lock_state(&filelock
, issued
);
3305 choose_lock_state(&nestlock
, issued
);
3306 choose_lock_state(&dirfragtreelock
, issued
);
3307 choose_lock_state(&authlock
, issued
);
3308 choose_lock_state(&xattrlock
, issued
);
3309 choose_lock_state(&linklock
, issued
);
3312 int CInode::count_nonstale_caps()
3315 for (const auto &p
: client_caps
) {
3316 if (!p
.second
.is_stale())
3322 bool CInode::multiple_nonstale_caps()
3325 for (const auto &p
: client_caps
) {
3326 if (!p
.second
.is_stale()) {
3335 void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map
<int32_t,int32_t>& m
)
3337 bool old_empty
= mds_caps_wanted
.empty();
3338 mds_caps_wanted
.swap(m
);
3339 if (old_empty
!= (bool)mds_caps_wanted
.empty()) {
3341 adjust_num_caps_notable(1);
3343 adjust_num_caps_notable(-1);
3347 void CInode::set_mds_caps_wanted(mds_rank_t mds
, int32_t wanted
)
3349 bool old_empty
= mds_caps_wanted
.empty();
3351 mds_caps_wanted
[mds
] = wanted
;
3353 adjust_num_caps_notable(1);
3354 } else if (!old_empty
) {
3355 mds_caps_wanted
.erase(mds
);
3356 if (mds_caps_wanted
.empty())
3357 adjust_num_caps_notable(-1);
3361 Capability
*CInode::add_client_cap(client_t client
, Session
*session
,
3362 SnapRealm
*conrealm
, bool new_inode
)
3364 ceph_assert(last
== CEPH_NOSNAP
);
3365 if (client_caps
.empty()) {
3368 containing_realm
= conrealm
;
3370 containing_realm
= find_snaprealm();
3371 containing_realm
->inodes_with_caps
.push_back(&item_caps
);
3372 dout(10) << __func__
<< " first cap, joining realm " << *containing_realm
<< dendl
;
3374 mdcache
->num_inodes_with_caps
++;
3376 parent
->dir
->adjust_num_inodes_with_caps(1);
3379 uint64_t cap_id
= new_inode
? 1 : ++mdcache
->last_cap_id
;
3380 auto ret
= client_caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
),
3381 std::forward_as_tuple(this, session
, cap_id
));
3382 ceph_assert(ret
.second
== true);
3383 Capability
*cap
= &ret
.first
->second
;
3385 cap
->client_follows
= first
-1;
3386 containing_realm
->add_cap(client
, cap
);
3391 void CInode::remove_client_cap(client_t client
)
3393 auto it
= client_caps
.find(client
);
3394 ceph_assert(it
!= client_caps
.end());
3395 Capability
*cap
= &it
->second
;
3397 cap
->item_session_caps
.remove_myself();
3398 cap
->item_revoking_caps
.remove_myself();
3399 cap
->item_client_revoking_caps
.remove_myself();
3400 containing_realm
->remove_cap(client
, cap
);
3402 if (client
== loner_cap
)
3405 if (cap
->is_wanted_notable())
3406 adjust_num_caps_notable(-1);
3408 client_caps
.erase(it
);
3409 if (client_caps
.empty()) {
3410 dout(10) << __func__
<< " last cap, leaving realm " << *containing_realm
<< dendl
;
3412 item_caps
.remove_myself();
3413 containing_realm
= NULL
;
3414 mdcache
->num_inodes_with_caps
--;
3416 parent
->dir
->adjust_num_inodes_with_caps(-1);
3419 //clean up advisory locks
3420 bool fcntl_removed
= fcntl_locks
? fcntl_locks
->remove_all_from(client
) : false;
3421 bool flock_removed
= flock_locks
? flock_locks
->remove_all_from(client
) : false;
3422 if (fcntl_removed
|| flock_removed
) {
3423 MDSContext::vec waiters
;
3424 take_waiting(CInode::WAIT_FLOCK
, waiters
);
3425 mdcache
->mds
->queue_waiters(waiters
);
3429 void CInode::move_to_realm(SnapRealm
*realm
)
3431 dout(10) << __func__
<< " joining realm " << *realm
3432 << ", leaving realm " << *containing_realm
<< dendl
;
3433 for (auto& p
: client_caps
) {
3434 containing_realm
->remove_cap(p
.first
, &p
.second
);
3435 realm
->add_cap(p
.first
, &p
.second
);
3437 item_caps
.remove_myself();
3438 realm
->inodes_with_caps
.push_back(&item_caps
);
3439 containing_realm
= realm
;
3442 Capability
*CInode::reconnect_cap(client_t client
, const cap_reconnect_t
& icr
, Session
*session
)
3444 Capability
*cap
= get_client_cap(client
);
3447 cap
->merge(icr
.capinfo
.wanted
, icr
.capinfo
.issued
);
3449 cap
= add_client_cap(client
, session
);
3450 cap
->set_cap_id(icr
.capinfo
.cap_id
);
3451 cap
->set_wanted(icr
.capinfo
.wanted
);
3452 cap
->issue_norevoke(icr
.capinfo
.issued
);
3455 cap
->set_last_issue_stamp(ceph_clock_now());
3459 void CInode::clear_client_caps_after_export()
3461 while (!client_caps
.empty())
3462 remove_client_cap(client_caps
.begin()->first
);
3464 want_loner_cap
= -1;
3465 if (!get_mds_caps_wanted().empty()) {
3466 mempool::mds_co::compact_map
<int32_t,int32_t> empty
;
3467 set_mds_caps_wanted(empty
);
3471 void CInode::export_client_caps(map
<client_t
,Capability::Export
>& cl
)
3473 for (const auto &p
: client_caps
) {
3474 cl
[p
.first
] = p
.second
.make_export();
3479 int CInode::get_caps_liked() const
3482 return CEPH_CAP_PIN
| CEPH_CAP_ANY_EXCL
| CEPH_CAP_ANY_SHARED
; // but not, say, FILE_RD|WR|WRBUFFER
3484 return CEPH_CAP_ANY
& ~CEPH_CAP_FILE_LAZYIO
;
3487 int CInode::get_caps_allowed_ever() const
3491 allowed
= CEPH_CAP_PIN
| CEPH_CAP_ANY_EXCL
| CEPH_CAP_ANY_SHARED
;
3493 allowed
= CEPH_CAP_ANY
;
3496 (filelock
.gcaps_allowed_ever() << filelock
.get_cap_shift()) |
3497 (authlock
.gcaps_allowed_ever() << authlock
.get_cap_shift()) |
3498 (xattrlock
.gcaps_allowed_ever() << xattrlock
.get_cap_shift()) |
3499 (linklock
.gcaps_allowed_ever() << linklock
.get_cap_shift()));
3502 int CInode::get_caps_allowed_by_type(int type
) const
3506 (filelock
.gcaps_allowed(type
) << filelock
.get_cap_shift()) |
3507 (authlock
.gcaps_allowed(type
) << authlock
.get_cap_shift()) |
3508 (xattrlock
.gcaps_allowed(type
) << xattrlock
.get_cap_shift()) |
3509 (linklock
.gcaps_allowed(type
) << linklock
.get_cap_shift());
3512 int CInode::get_caps_careful() const
3515 (filelock
.gcaps_careful() << filelock
.get_cap_shift()) |
3516 (authlock
.gcaps_careful() << authlock
.get_cap_shift()) |
3517 (xattrlock
.gcaps_careful() << xattrlock
.get_cap_shift()) |
3518 (linklock
.gcaps_careful() << linklock
.get_cap_shift());
3521 int CInode::get_xlocker_mask(client_t client
) const
3524 (filelock
.gcaps_xlocker_mask(client
) << filelock
.get_cap_shift()) |
3525 (authlock
.gcaps_xlocker_mask(client
) << authlock
.get_cap_shift()) |
3526 (xattrlock
.gcaps_xlocker_mask(client
) << xattrlock
.get_cap_shift()) |
3527 (linklock
.gcaps_xlocker_mask(client
) << linklock
.get_cap_shift());
3530 int CInode::get_caps_allowed_for_client(Session
*session
, Capability
*cap
,
3531 const mempool_inode
*file_i
) const
3533 client_t client
= session
->get_client();
3535 if (client
== get_loner()) {
3536 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3538 get_caps_allowed_by_type(CAP_LONER
) |
3539 (get_caps_allowed_by_type(CAP_XLOCKER
) & get_xlocker_mask(client
));
3541 allowed
= get_caps_allowed_by_type(CAP_ANY
);
3545 allowed
&= ~CEPH_CAP_ANY_DIR_OPS
;
3546 if (cap
&& (allowed
& CEPH_CAP_FILE_EXCL
))
3547 allowed
|= cap
->get_lock_cache_allowed();
3549 if (file_i
->inline_data
.version
== CEPH_INLINE_NONE
&&
3550 file_i
->layout
.pool_ns
.empty()) {
3553 if ((file_i
->inline_data
.version
!= CEPH_INLINE_NONE
&&
3554 cap
->is_noinline()) ||
3555 (!file_i
->layout
.pool_ns
.empty() &&
3556 cap
->is_nopoolns()))
3557 allowed
&= ~(CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
);
3559 auto& conn
= session
->get_connection();
3560 if ((file_i
->inline_data
.version
!= CEPH_INLINE_NONE
&&
3561 !conn
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) ||
3562 (!file_i
->layout
.pool_ns
.empty() &&
3563 !conn
->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
)))
3564 allowed
&= ~(CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
);
3570 // caps issued, wanted
3571 int CInode::get_caps_issued(int *ploner
, int *pother
, int *pxlocker
,
3572 int shift
, int mask
)
3575 int loner
= 0, other
= 0, xlocker
= 0;
3580 for (const auto &p
: client_caps
) {
3581 int i
= p
.second
.issued();
3583 if (p
.first
== loner_cap
)
3587 xlocker
|= get_xlocker_mask(p
.first
) & i
;
3589 if (ploner
) *ploner
= (loner
>> shift
) & mask
;
3590 if (pother
) *pother
= (other
>> shift
) & mask
;
3591 if (pxlocker
) *pxlocker
= (xlocker
>> shift
) & mask
;
3592 return (c
>> shift
) & mask
;
3595 bool CInode::is_any_caps_wanted() const
3597 for (const auto &p
: client_caps
) {
3598 if (p
.second
.wanted())
3604 int CInode::get_caps_wanted(int *ploner
, int *pother
, int shift
, int mask
) const
3607 int loner
= 0, other
= 0;
3608 for (const auto &p
: client_caps
) {
3609 if (!p
.second
.is_stale()) {
3610 int t
= p
.second
.wanted();
3612 if (p
.first
== loner_cap
)
3617 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3620 for (const auto &p
: mds_caps_wanted
) {
3623 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3625 if (ploner
) *ploner
= (loner
>> shift
) & mask
;
3626 if (pother
) *pother
= (other
>> shift
) & mask
;
3627 return (w
>> shift
) & mask
;
3630 bool CInode::issued_caps_need_gather(SimpleLock
*lock
)
3632 int loner_issued
, other_issued
, xlocker_issued
;
3633 get_caps_issued(&loner_issued
, &other_issued
, &xlocker_issued
,
3634 lock
->get_cap_shift(), lock
->get_cap_mask());
3635 if ((loner_issued
& ~lock
->gcaps_allowed(CAP_LONER
)) ||
3636 (other_issued
& ~lock
->gcaps_allowed(CAP_ANY
)) ||
3637 (xlocker_issued
& ~lock
->gcaps_allowed(CAP_XLOCKER
)))
3642 void CInode::adjust_num_caps_notable(int d
)
3644 if (!is_clientwriteable()) {
3645 if (!num_caps_notable
&& d
> 0)
3646 mdcache
->open_file_table
.add_inode(this);
3647 else if (num_caps_notable
> 0 && num_caps_notable
== -d
)
3648 mdcache
->open_file_table
.remove_inode(this);
3651 num_caps_notable
+=d
;
3652 ceph_assert(num_caps_notable
>= 0);
3655 void CInode::mark_clientwriteable()
3657 if (last
!= CEPH_NOSNAP
)
3659 if (!state_test(STATE_CLIENTWRITEABLE
)) {
3660 if (num_caps_notable
== 0)
3661 mdcache
->open_file_table
.add_inode(this);
3662 state_set(STATE_CLIENTWRITEABLE
);
3666 void CInode::clear_clientwriteable()
3668 if (state_test(STATE_CLIENTWRITEABLE
)) {
3669 if (num_caps_notable
== 0)
3670 mdcache
->open_file_table
.remove_inode(this);
3671 state_clear(STATE_CLIENTWRITEABLE
);
3675 // =============================================
3677 int CInode::encode_inodestat(bufferlist
& bl
, Session
*session
,
3678 SnapRealm
*dir_realm
,
3683 client_t client
= session
->get_client();
3684 ceph_assert(snapid
);
3689 const mempool_inode
*oi
= get_inode().get();
3690 const mempool_inode
*pi
= get_projected_inode().get();
3692 const mempool_xattr_map
*pxattrs
= nullptr;
3694 if (snapid
!= CEPH_NOSNAP
) {
3696 // for now at least, old_inodes is only defined/valid on the auth
3700 if (is_any_old_inodes()) {
3701 auto it
= old_inodes
->lower_bound(snapid
);
3702 if (it
!= old_inodes
->end()) {
3703 if (it
->second
.first
> snapid
) {
3704 if (it
!= old_inodes
->begin())
3707 if (it
->second
.first
<= snapid
&& snapid
<= it
->first
) {
3708 dout(15) << __func__
<< " snapid " << snapid
3709 << " to old_inode [" << it
->second
.first
<< "," << it
->first
<< "]"
3710 << " " << it
->second
.inode
.rstat
3712 pi
= oi
= &it
->second
.inode
;
3713 pxattrs
= &it
->second
.xattrs
;
3715 // snapshoted remote dentry can result this
3716 dout(0) << __func__
<< " old_inode for snapid " << snapid
3717 << " not found" << dendl
;
3720 } else if (snapid
< first
|| snapid
> last
) {
3721 // snapshoted remote dentry can result this
3722 dout(0) << __func__
<< " [" << first
<< "," << last
<< "]"
3723 << " not match snapid " << snapid
<< dendl
;
3728 std::map
<std::string
, std::string
> snap_metadata
;
3729 SnapRealm
*realm
= find_snaprealm();
3730 if (snapid
!= CEPH_NOSNAP
&& realm
) {
3731 // add snapshot timestamp vxattr
3732 map
<snapid_t
,const SnapInfo
*> infomap
;
3733 realm
->get_snap_info(infomap
,
3736 if (!infomap
.empty()) {
3737 ceph_assert(infomap
.size() == 1);
3738 const SnapInfo
*si
= infomap
.begin()->second
;
3739 snap_btime
= si
->stamp
;
3740 snap_metadata
= si
->metadata
;
3745 bool no_caps
= !valid
||
3746 session
->is_stale() ||
3747 (dir_realm
&& realm
!= dir_realm
) ||
3749 state_test(CInode::STATE_EXPORTINGCAPS
);
3751 dout(20) << __func__
<< " no caps"
3752 << (!valid
?", !valid":"")
3753 << (session
->is_stale()?", session stale ":"")
3754 << ((dir_realm
&& realm
!= dir_realm
)?", snaprealm differs ":"")
3755 << (is_frozen()?", frozen inode":"")
3756 << (state_test(CInode::STATE_EXPORTINGCAPS
)?", exporting caps":"")
3760 // "fake" a version that is old (stable) version, +1 if projected.
3761 version_t version
= (oi
->version
* 2) + is_projected();
3763 Capability
*cap
= get_client_cap(client
);
3764 bool pfile
= filelock
.is_xlocked_by_client(client
) || get_loner() == client
;
3765 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3766 bool pauth
= authlock
.is_xlocked_by_client(client
) || get_loner() == client
;
3767 bool plink
= linklock
.is_xlocked_by_client(client
) || get_loner() == client
;
3768 bool pxattr
= xattrlock
.is_xlocked_by_client(client
) || get_loner() == client
;
3770 bool plocal
= versionlock
.get_last_wrlock_client() == client
;
3771 bool ppolicy
= policylock
.is_xlocked_by_client(client
) || get_loner()==client
;
3773 const mempool_inode
*any_i
= (pfile
|pauth
|plink
|pxattr
|plocal
) ? pi
: oi
;
3775 dout(20) << " pfile " << pfile
<< " pauth " << pauth
3776 << " plink " << plink
<< " pxattr " << pxattr
3777 << " plocal " << plocal
3778 << " ctime " << any_i
->ctime
3779 << " valid=" << valid
<< dendl
;
3782 const mempool_inode
*file_i
= pfile
? pi
:oi
;
3783 file_layout_t layout
;
3785 layout
= (ppolicy
? pi
: oi
)->layout
;
3787 layout
= file_i
->layout
;
3790 // max_size is min of projected, actual
3792 std::min(oi
->get_client_range(client
),
3793 pi
->get_client_range(client
));
3796 version_t inline_version
= 0;
3797 bufferlist inline_data
;
3798 if (file_i
->inline_data
.version
== CEPH_INLINE_NONE
) {
3799 inline_version
= CEPH_INLINE_NONE
;
3800 } else if ((!cap
&& !no_caps
) ||
3801 (cap
&& cap
->client_inline_version
< file_i
->inline_data
.version
) ||
3802 (getattr_caps
& CEPH_CAP_FILE_RD
)) { // client requests inline data
3803 inline_version
= file_i
->inline_data
.version
;
3804 if (file_i
->inline_data
.length() > 0)
3805 file_i
->inline_data
.get_data(inline_data
);
3808 // nest (do same as file... :/)
3810 cap
->last_rbytes
= file_i
->rstat
.rbytes
;
3811 cap
->last_rsize
= file_i
->rstat
.rsize();
3815 const mempool_inode
*auth_i
= pauth
? pi
:oi
;
3818 const mempool_inode
*link_i
= plink
? pi
:oi
;
3821 const mempool_inode
*xattr_i
= pxattr
? pi
:oi
;
3825 version_t xattr_version
;
3826 if ((!cap
&& !no_caps
) ||
3827 (cap
&& cap
->client_xattr_version
< xattr_i
->xattr_version
) ||
3828 (getattr_caps
& CEPH_CAP_XATTR_SHARED
)) { // client requests xattrs
3830 pxattrs
= pxattr
? get_projected_xattrs().get() : get_xattrs().get();
3831 xattr_version
= xattr_i
->xattr_version
;
3839 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap
) +
3840 sizeof(struct ceph_file_layout
) +
3841 sizeof(struct ceph_timespec
) * 3 + 4 + // ctime ~ time_warp_seq
3842 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3843 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec
) + // dirstat.nfiles ~ rstat.rctime
3844 sizeof(__u32
) + sizeof(__u32
) * 2 * dirfragtree
._splits
.size() + // dirfragtree
3845 sizeof(__u32
) + symlink
.length() + // symlink
3846 sizeof(struct ceph_dir_layout
); // dir_layout
3848 if (xattr_version
) {
3849 bytes
+= sizeof(__u32
) + sizeof(__u32
); // xattr buffer len + number entries
3851 for (const auto &p
: *pxattrs
)
3852 bytes
+= sizeof(__u32
) * 2 + p
.first
.length() + p
.second
.length();
3855 bytes
+= sizeof(__u32
); // xattr buffer len
3858 sizeof(version_t
) + sizeof(__u32
) + inline_data
.length() + // inline data
3859 1 + 1 + 8 + 8 + 4 + // quota
3860 4 + layout
.pool_ns
.size() + // pool ns
3861 sizeof(struct ceph_timespec
) + 8; // btime + change_attr
3863 if (bytes
> max_bytes
)
3864 return -CEPHFS_ENOSPC
;
3869 struct ceph_mds_reply_cap ecap
;
3870 if (snapid
!= CEPH_NOSNAP
) {
3872 * snapped inodes (files or dirs) only get read-only caps. always
3873 * issue everything possible, since it is read only.
3875 * if a snapped inode has caps, limit issued caps based on the
3878 * if it is a live inode, limit issued caps based on the lock
3881 * do NOT adjust cap issued state, because the client always
3882 * tracks caps per-snap and the mds does either per-interval or
3885 ecap
.caps
= valid
? get_caps_allowed_by_type(CAP_ANY
) : CEPH_STAT_CAP_INODE
;
3886 if (last
== CEPH_NOSNAP
|| is_any_caps())
3887 ecap
.caps
= ecap
.caps
& get_caps_allowed_for_client(session
, nullptr, file_i
);
3892 if (!no_caps
&& !cap
) {
3894 cap
= add_client_cap(client
, session
, realm
);
3896 choose_ideal_loner();
3900 if (!no_caps
&& cap
) {
3901 int likes
= get_caps_liked();
3902 int allowed
= get_caps_allowed_for_client(session
, cap
, file_i
);
3903 issue
= (cap
->wanted() | likes
) & allowed
;
3904 cap
->issue_norevoke(issue
, true);
3905 issue
= cap
->pending();
3906 dout(10) << "encode_inodestat issuing " << ccap_string(issue
)
3907 << " seq " << cap
->get_last_seq() << dendl
;
3908 } else if (cap
&& cap
->is_new() && !dir_realm
) {
3909 // alway issue new caps to client, otherwise the caps get lost
3910 ceph_assert(cap
->is_stale());
3911 ceph_assert(!cap
->pending());
3912 issue
= CEPH_CAP_PIN
;
3913 cap
->issue_norevoke(issue
, true);
3914 dout(10) << "encode_inodestat issuing " << ccap_string(issue
)
3915 << " seq " << cap
->get_last_seq()
3916 << "(stale&new caps)" << dendl
;
3920 cap
->set_last_issue();
3921 cap
->set_last_issue_stamp(ceph_clock_now());
3923 ecap
.wanted
= cap
->wanted();
3924 ecap
.cap_id
= cap
->get_cap_id();
3925 ecap
.seq
= cap
->get_last_seq();
3926 ecap
.mseq
= cap
->get_mseq();
3927 ecap
.realm
= realm
->inode
->ino();
3937 ecap
.flags
= is_auth() ? CEPH_CAP_FLAG_AUTH
: 0;
3938 dout(10) << "encode_inodestat caps " << ccap_string(ecap
.caps
)
3939 << " seq " << ecap
.seq
<< " mseq " << ecap
.mseq
3940 << " xattrv " << xattr_version
<< dendl
;
3942 if (inline_data
.length() && cap
) {
3943 if ((cap
->pending() | getattr_caps
) & CEPH_CAP_FILE_SHARED
) {
3944 dout(10) << "including inline version " << inline_version
<< dendl
;
3945 cap
->client_inline_version
= inline_version
;
3947 dout(10) << "dropping inline version " << inline_version
<< dendl
;
3949 inline_data
.clear();
3953 // include those xattrs?
3954 if (xattr_version
&& cap
) {
3955 if ((cap
->pending() | getattr_caps
) & CEPH_CAP_XATTR_SHARED
) {
3956 dout(10) << "including xattrs version " << xattr_version
<< dendl
;
3957 cap
->client_xattr_version
= xattr_version
;
3959 dout(10) << "dropping xattrs version " << xattr_version
<< dendl
;
3964 // The end result of encode_xattrs() is equivalent to:
3967 // if (xattr_version) {
3969 // encode(*pxattrs, bl);
3971 // encode((__u32)0, bl);
3976 // But encoding xattrs into the 'xbl' requires a memory allocation.
3977 // The 'bl' should have enough pre-allocated memory in most cases.
3978 // Encoding xattrs directly into it can avoid the extra allocation.
3979 auto encode_xattrs
= [xattr_version
, pxattrs
, &bl
]() {
3981 if (xattr_version
) {
3983 auto filler
= bl
.append_hole(sizeof(xbl_len
));
3984 const auto starting_bl_len
= bl
.length();
3986 encode(*pxattrs
, bl
);
3988 encode((__u32
)0, bl
);
3989 xbl_len
= bl
.length() - starting_bl_len
;
3990 filler
.copy_in(sizeof(xbl_len
), (char *)&xbl_len
);
3992 encode((__u32
)0, bl
);
3997 * note: encoding matches MClientReply::InodeStat
3999 if (session
->info
.has_feature(CEPHFS_FEATURE_REPLY_ENCODING
)) {
4000 ENCODE_START(6, 1, bl
);
4001 encode(oi
->ino
, bl
);
4003 encode(oi
->rdev
, bl
);
4004 encode(version
, bl
);
4005 encode(xattr_version
, bl
);
4008 ceph_file_layout legacy_layout
;
4009 layout
.to_legacy(&legacy_layout
);
4010 encode(legacy_layout
, bl
);
4012 encode(any_i
->ctime
, bl
);
4013 encode(file_i
->mtime
, bl
);
4014 encode(file_i
->atime
, bl
);
4015 encode(file_i
->time_warp_seq
, bl
);
4016 encode(file_i
->size
, bl
);
4017 encode(max_size
, bl
);
4018 encode(file_i
->truncate_size
, bl
);
4019 encode(file_i
->truncate_seq
, bl
);
4020 encode(auth_i
->mode
, bl
);
4021 encode((uint32_t)auth_i
->uid
, bl
);
4022 encode((uint32_t)auth_i
->gid
, bl
);
4023 encode(link_i
->nlink
, bl
);
4024 encode(file_i
->dirstat
.nfiles
, bl
);
4025 encode(file_i
->dirstat
.nsubdirs
, bl
);
4026 encode(file_i
->rstat
.rbytes
, bl
);
4027 encode(file_i
->rstat
.rfiles
, bl
);
4028 encode(file_i
->rstat
.rsubdirs
, bl
);
4029 encode(file_i
->rstat
.rctime
, bl
);
4030 dirfragtree
.encode(bl
);
4031 encode(symlink
, bl
);
4032 encode(file_i
->dir_layout
, bl
);
4034 encode(inline_version
, bl
);
4035 encode(inline_data
, bl
);
4036 const mempool_inode
*policy_i
= ppolicy
? pi
: oi
;
4037 encode(policy_i
->quota
, bl
);
4038 encode(layout
.pool_ns
, bl
);
4039 encode(any_i
->btime
, bl
);
4040 encode(any_i
->change_attr
, bl
);
4041 encode(file_i
->export_pin
, bl
);
4042 encode(snap_btime
, bl
);
4043 encode(file_i
->rstat
.rsnaps
, bl
);
4044 encode(snap_metadata
, bl
);
4045 encode(file_i
->fscrypt
, bl
);
4049 ceph_assert(session
->get_connection());
4051 encode(oi
->ino
, bl
);
4053 encode(oi
->rdev
, bl
);
4054 encode(version
, bl
);
4055 encode(xattr_version
, bl
);
4058 ceph_file_layout legacy_layout
;
4059 layout
.to_legacy(&legacy_layout
);
4060 encode(legacy_layout
, bl
);
4062 encode(any_i
->ctime
, bl
);
4063 encode(file_i
->mtime
, bl
);
4064 encode(file_i
->atime
, bl
);
4065 encode(file_i
->time_warp_seq
, bl
);
4066 encode(file_i
->size
, bl
);
4067 encode(max_size
, bl
);
4068 encode(file_i
->truncate_size
, bl
);
4069 encode(file_i
->truncate_seq
, bl
);
4070 encode(auth_i
->mode
, bl
);
4071 encode((uint32_t)auth_i
->uid
, bl
);
4072 encode((uint32_t)auth_i
->gid
, bl
);
4073 encode(link_i
->nlink
, bl
);
4074 encode(file_i
->dirstat
.nfiles
, bl
);
4075 encode(file_i
->dirstat
.nsubdirs
, bl
);
4076 encode(file_i
->rstat
.rbytes
, bl
);
4077 encode(file_i
->rstat
.rfiles
, bl
);
4078 encode(file_i
->rstat
.rsubdirs
, bl
);
4079 encode(file_i
->rstat
.rctime
, bl
);
4080 dirfragtree
.encode(bl
);
4081 encode(symlink
, bl
);
4082 auto& conn
= session
->get_connection();
4083 if (conn
->has_feature(CEPH_FEATURE_DIRLAYOUTHASH
)) {
4084 encode(file_i
->dir_layout
, bl
);
4087 if (conn
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
4088 encode(inline_version
, bl
);
4089 encode(inline_data
, bl
);
4091 if (conn
->has_feature(CEPH_FEATURE_MDS_QUOTA
)) {
4092 const mempool_inode
*policy_i
= ppolicy
? pi
: oi
;
4093 encode(policy_i
->quota
, bl
);
4095 if (conn
->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
)) {
4096 encode(layout
.pool_ns
, bl
);
4098 if (conn
->has_feature(CEPH_FEATURE_FS_BTIME
)) {
4099 encode(any_i
->btime
, bl
);
4100 encode(any_i
->change_attr
, bl
);
4107 void CInode::encode_cap_message(const ref_t
<MClientCaps
> &m
, Capability
*cap
)
4111 client_t client
= cap
->get_client();
4113 bool pfile
= filelock
.is_xlocked_by_client(client
) || (cap
->issued() & CEPH_CAP_FILE_EXCL
);
4114 bool pauth
= authlock
.is_xlocked_by_client(client
);
4115 bool plink
= linklock
.is_xlocked_by_client(client
);
4116 bool pxattr
= xattrlock
.is_xlocked_by_client(client
);
4118 const mempool_inode
*oi
= get_inode().get();
4119 const mempool_inode
*pi
= get_projected_inode().get();
4120 const mempool_inode
*i
= (pfile
|pauth
|plink
|pxattr
) ? pi
: oi
;
4122 dout(20) << __func__
<< " pfile " << pfile
4123 << " pauth " << pauth
<< " plink " << plink
<< " pxattr " << pxattr
4124 << " ctime " << i
->ctime
<< dendl
;
4127 m
->set_layout(i
->layout
);
4129 m
->truncate_seq
= i
->truncate_seq
;
4130 m
->truncate_size
= i
->truncate_size
;
4131 m
->mtime
= i
->mtime
;
4132 m
->atime
= i
->atime
;
4133 m
->ctime
= i
->ctime
;
4134 m
->change_attr
= i
->change_attr
;
4135 m
->time_warp_seq
= i
->time_warp_seq
;
4136 m
->nfiles
= i
->dirstat
.nfiles
;
4137 m
->nsubdirs
= i
->dirstat
.nsubdirs
;
4139 if (cap
->client_inline_version
< i
->inline_data
.version
) {
4140 m
->inline_version
= cap
->client_inline_version
= i
->inline_data
.version
;
4141 if (i
->inline_data
.length() > 0)
4142 i
->inline_data
.get_data(m
->inline_data
);
4144 m
->inline_version
= 0;
4147 // max_size is min of projected, actual.
4148 uint64_t oldms
= oi
->get_client_range(client
);
4149 uint64_t newms
= pi
->get_client_range(client
);
4150 m
->max_size
= std::min(oldms
, newms
);
4153 m
->head
.mode
= i
->mode
;
4154 m
->head
.uid
= i
->uid
;
4155 m
->head
.gid
= i
->gid
;
4158 m
->head
.nlink
= i
->nlink
;
4162 const auto& ix
= pxattr
? get_projected_xattrs() : get_xattrs();
4163 if ((cap
->pending() & CEPH_CAP_XATTR_SHARED
) &&
4164 i
->xattr_version
> cap
->client_xattr_version
) {
4165 dout(10) << " including xattrs v " << i
->xattr_version
<< dendl
;
4167 encode(*ix
, m
->xattrbl
);
4169 encode((__u32
)0, m
->xattrbl
);
4170 m
->head
.xattr_version
= i
->xattr_version
;
4171 cap
->client_xattr_version
= i
->xattr_version
;
4177 void CInode::_encode_base(bufferlist
& bl
, uint64_t features
)
4179 ENCODE_START(1, 1, bl
);
4181 encode(*get_inode(), bl
, features
);
4182 encode(symlink
, bl
);
4183 encode(dirfragtree
, bl
);
4185 encode_old_inodes(bl
, features
);
4186 encode(damage_flags
, bl
);
4190 void CInode::_decode_base(bufferlist::const_iterator
& p
)
4195 auto _inode
= allocate_inode();
4197 reset_inode(std::move(_inode
));
4202 symlink
= std::string_view(tmp
);
4204 decode(dirfragtree
, p
);
4206 decode_old_inodes(p
);
4207 decode(damage_flags
, p
);
4212 void CInode::_encode_locks_full(bufferlist
& bl
)
4215 encode(authlock
, bl
);
4216 encode(linklock
, bl
);
4217 encode(dirfragtreelock
, bl
);
4218 encode(filelock
, bl
);
4219 encode(xattrlock
, bl
);
4220 encode(snaplock
, bl
);
4221 encode(nestlock
, bl
);
4222 encode(flocklock
, bl
);
4223 encode(policylock
, bl
);
4225 encode(loner_cap
, bl
);
4227 void CInode::_decode_locks_full(bufferlist::const_iterator
& p
)
4230 decode(authlock
, p
);
4231 decode(linklock
, p
);
4232 decode(dirfragtreelock
, p
);
4233 decode(filelock
, p
);
4234 decode(xattrlock
, p
);
4235 decode(snaplock
, p
);
4236 decode(nestlock
, p
);
4237 decode(flocklock
, p
);
4238 decode(policylock
, p
);
4240 decode(loner_cap
, p
);
4241 set_loner_cap(loner_cap
);
4242 want_loner_cap
= loner_cap
; // for now, we'll eval() shortly.
4245 void CInode::_encode_locks_state_for_replica(bufferlist
& bl
, bool need_recover
)
4247 ENCODE_START(1, 1, bl
);
4248 authlock
.encode_state_for_replica(bl
);
4249 linklock
.encode_state_for_replica(bl
);
4250 dirfragtreelock
.encode_state_for_replica(bl
);
4251 filelock
.encode_state_for_replica(bl
);
4252 nestlock
.encode_state_for_replica(bl
);
4253 xattrlock
.encode_state_for_replica(bl
);
4254 snaplock
.encode_state_for_replica(bl
);
4255 flocklock
.encode_state_for_replica(bl
);
4256 policylock
.encode_state_for_replica(bl
);
4257 encode(need_recover
, bl
);
4261 void CInode::_encode_locks_state_for_rejoin(bufferlist
& bl
, int rep
)
4263 authlock
.encode_state_for_replica(bl
);
4264 linklock
.encode_state_for_replica(bl
);
4265 dirfragtreelock
.encode_state_for_rejoin(bl
, rep
);
4266 filelock
.encode_state_for_rejoin(bl
, rep
);
4267 nestlock
.encode_state_for_rejoin(bl
, rep
);
4268 xattrlock
.encode_state_for_replica(bl
);
4269 snaplock
.encode_state_for_replica(bl
);
4270 flocklock
.encode_state_for_replica(bl
);
4271 policylock
.encode_state_for_replica(bl
);
4274 void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator
& p
, bool is_new
)
4277 authlock
.decode_state(p
, is_new
);
4278 linklock
.decode_state(p
, is_new
);
4279 dirfragtreelock
.decode_state(p
, is_new
);
4280 filelock
.decode_state(p
, is_new
);
4281 nestlock
.decode_state(p
, is_new
);
4282 xattrlock
.decode_state(p
, is_new
);
4283 snaplock
.decode_state(p
, is_new
);
4284 flocklock
.decode_state(p
, is_new
);
4285 policylock
.decode_state(p
, is_new
);
4288 decode(need_recover
, p
);
4289 if (need_recover
&& is_new
) {
4290 // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
4291 // and change the object when replaying unsafe requests.
4292 authlock
.mark_need_recover();
4293 linklock
.mark_need_recover();
4294 dirfragtreelock
.mark_need_recover();
4295 filelock
.mark_need_recover();
4296 nestlock
.mark_need_recover();
4297 xattrlock
.mark_need_recover();
4298 snaplock
.mark_need_recover();
4299 flocklock
.mark_need_recover();
4300 policylock
.mark_need_recover();
4304 void CInode::_decode_locks_rejoin(bufferlist::const_iterator
& p
, MDSContext::vec
& waiters
,
4305 list
<SimpleLock
*>& eval_locks
, bool survivor
)
4307 authlock
.decode_state_rejoin(p
, waiters
, survivor
);
4308 linklock
.decode_state_rejoin(p
, waiters
, survivor
);
4309 dirfragtreelock
.decode_state_rejoin(p
, waiters
, survivor
);
4310 filelock
.decode_state_rejoin(p
, waiters
, survivor
);
4311 nestlock
.decode_state_rejoin(p
, waiters
, survivor
);
4312 xattrlock
.decode_state_rejoin(p
, waiters
, survivor
);
4313 snaplock
.decode_state_rejoin(p
, waiters
, survivor
);
4314 flocklock
.decode_state_rejoin(p
, waiters
, survivor
);
4315 policylock
.decode_state_rejoin(p
, waiters
, survivor
);
4317 if (!dirfragtreelock
.is_stable() && !dirfragtreelock
.is_wrlocked())
4318 eval_locks
.push_back(&dirfragtreelock
);
4319 if (!filelock
.is_stable() && !filelock
.is_wrlocked())
4320 eval_locks
.push_back(&filelock
);
4321 if (!nestlock
.is_stable() && !nestlock
.is_wrlocked())
4322 eval_locks
.push_back(&nestlock
);
4328 void CInode::encode_export(bufferlist
& bl
)
4330 ENCODE_START(5, 4, bl
);
4331 _encode_base(bl
, mdcache
->mds
->mdsmap
->get_up_features());
4337 encode(get_replicas(), bl
);
4339 // include scatterlock info for any bounding CDirs
4340 bufferlist bounding
;
4341 if (get_inode()->is_dir())
4342 for (const auto &p
: dirfrags
) {
4343 CDir
*dir
= p
.second
;
4344 if (dir
->state_test(CDir::STATE_EXPORTBOUND
)) {
4345 encode(p
.first
, bounding
);
4346 encode(dir
->get_fnode()->fragstat
, bounding
);
4347 encode(dir
->get_fnode()->accounted_fragstat
, bounding
);
4348 encode(dir
->get_fnode()->rstat
, bounding
);
4349 encode(dir
->get_fnode()->accounted_rstat
, bounding
);
4350 dout(10) << " encoded fragstat/rstat info for " << *dir
<< dendl
;
4353 encode(bounding
, bl
);
4355 _encode_locks_full(bl
);
4357 _encode_file_locks(bl
);
4361 get(PIN_TEMPEXPORTING
);
4364 void CInode::finish_export()
4366 state
&= MASK_STATE_EXPORT_KEPT
;
4371 //dirlock.clear_updated();
4375 put(PIN_TEMPEXPORTING
);
4378 void CInode::decode_import(bufferlist::const_iterator
& p
,
4388 s
&= MASK_STATE_EXPORTED
;
4390 set_ephemeral_pin((s
& STATE_DISTEPHEMERALPIN
),
4391 (s
& STATE_RANDEPHEMERALPIN
));
4392 state_set(STATE_AUTH
| s
);
4399 if (is_dirty_parent()) {
4400 get(PIN_DIRTYPARENT
);
4401 mark_dirty_parent(ls
);
4406 decode(get_replicas(), p
);
4407 if (is_replicated())
4408 get(PIN_REPLICATED
);
4411 // decode fragstat info on bounding cdirs
4412 bufferlist bounding
;
4413 decode(bounding
, p
);
4414 auto q
= bounding
.cbegin();
4418 CDir
*dir
= get_dirfrag(fg
);
4419 ceph_assert(dir
); // we should have all bounds open
4421 // Only take the remote's fragstat/rstat if we are non-auth for
4422 // this dirfrag AND the lock is NOT in a scattered (MIX) state.
4423 // We know lock is stable, and MIX is the only state in which
4424 // the inode auth (who sent us this data) may not have the best
4427 // HMM: Are there cases where dir->is_auth() is an insufficient
4428 // check because the dirfrag is under migration? That implies
4429 // it is frozen (and in a SYNC or LOCK state). FIXME.
4431 auto _fnode
= CDir::allocate_fnode(*dir
->get_fnode());
4432 if (dir
->is_auth() ||
4433 filelock
.get_state() == LOCK_MIX
) {
4434 dout(10) << " skipped fragstat info for " << *dir
<< dendl
;
4439 decode(_fnode
->fragstat
, q
);
4440 decode(_fnode
->accounted_fragstat
, q
);
4441 dout(10) << " took fragstat info for " << *dir
<< dendl
;
4443 if (dir
->is_auth() ||
4444 nestlock
.get_state() == LOCK_MIX
) {
4445 dout(10) << " skipped rstat info for " << *dir
<< dendl
;
4450 decode(_fnode
->rstat
, q
);
4451 decode(_fnode
->accounted_rstat
, q
);
4452 dout(10) << " took rstat info for " << *dir
<< dendl
;
4454 dir
->reset_fnode(std::move(_fnode
));
4457 _decode_locks_full(p
);
4459 _decode_file_locks(p
);
4465 void InodeStoreBase::dump(Formatter
*f
) const
4468 f
->dump_string("symlink", symlink
);
4470 f
->open_array_section("xattrs");
4472 for (const auto& [key
, val
] : *xattrs
) {
4473 f
->open_object_section("xattr");
4474 f
->dump_string("key", key
);
4475 std::string
v(val
.c_str(), val
.length());
4476 f
->dump_string("val", v
);
4481 f
->open_object_section("dirfragtree");
4482 dirfragtree
.dump(f
);
4483 f
->close_section(); // dirfragtree
4485 f
->open_array_section("old_inodes");
4487 for (const auto &p
: *old_inodes
) {
4488 f
->open_object_section("old_inode");
4489 // The key is the last snapid, the first is in the mempool_old_inode
4490 f
->dump_int("last", p
.first
);
4492 f
->close_section(); // old_inode
4495 f
->close_section(); // old_inodes
4497 f
->dump_unsigned("oldest_snap", oldest_snap
);
4498 f
->dump_unsigned("damage_flags", damage_flags
);
4502 void decode_json_obj(mempool::mds_co::string
& t
, JSONObj
*obj
){
4504 t
= mempool::mds_co::string(std::string_view(obj
->get_data()));
4507 void InodeStoreBase::decode_json(JSONObj
*obj
)
4510 auto _inode
= allocate_inode();
4511 _inode
->decode_json(obj
);
4512 reset_inode(std::move(_inode
));
4515 JSONDecoder::decode_json("symlink", symlink
, obj
, true);
4516 // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // cann't decode it now
4520 mempool_xattr_map tmp
;
4521 JSONDecoder::decode_json("xattrs", tmp
, xattrs_cb
, obj
, true);
4523 reset_xattrs(xattr_map_ptr());
4525 reset_xattrs(allocate_xattr_map(std::move(tmp
)));
4527 // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // cann't decode old_inodes now
4528 JSONDecoder::decode_json("oldest_snap", oldest_snap
.val
, obj
, true);
4529 JSONDecoder::decode_json("damage_flags", damage_flags
, obj
, true);
4531 //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // cann't decode it now
4532 //snap_blob = srnode;
4535 void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map
& c
, JSONObj
*obj
){
4538 JSONDecoder::decode_json("key", k
, obj
, true);
4540 JSONDecoder::decode_json("val", v
, obj
, true);
4541 c
[k
.c_str()] = buffer::copy(v
.c_str(), v
.size());
4544 void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map
& c
, JSONObj
*obj
){
4547 JSONDecoder::decode_json("last", s
.val
, obj
, true);
4548 InodeStoreBase::mempool_old_inode i
;
4549 // i.decode_json(obj); // cann't decode now, simon
4553 void InodeStore::generate_test_instances(std::list
<InodeStore
*> &ls
)
4555 InodeStore
*populated
= new InodeStore
;
4556 populated
->get_inode()->ino
= 0xdeadbeef;
4557 populated
->symlink
= "rhubarb";
4558 ls
.push_back(populated
);
4561 void InodeStoreBare::generate_test_instances(std::list
<InodeStoreBare
*> &ls
)
4563 InodeStoreBare
*populated
= new InodeStoreBare
;
4564 populated
->get_inode()->ino
= 0xdeadbeef;
4565 populated
->symlink
= "rhubarb";
4566 ls
.push_back(populated
);
4569 void CInode::validate_disk_state(CInode::validated_data
*results
,
4572 class ValidationContinuation
: public MDSContinuation
{
4576 CInode::validated_data
*results
;
4588 ValidationContinuation(CInode
*i
,
4589 CInode::validated_data
*data_r
,
4591 MDSContinuation(i
->mdcache
->mds
->server
),
4596 set_callback(START
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_start
));
4597 set_callback(BACKTRACE
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_backtrace
));
4598 set_callback(INODE
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_inode_disk
));
4599 set_callback(DIRFRAGS
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_dirfrags
));
4602 ~ValidationContinuation() override
{
4605 in
->mdcache
->num_shadow_inodes
--;
4610 * Fetch backtrace and set tag if tag is non-empty
4612 void fetch_backtrace_and_tag(CInode
*in
,
4613 std::string_view tag
, bool is_internal
,
4614 Context
*fin
, int *bt_r
, bufferlist
*bt
)
4616 const int64_t pool
= in
->get_backtrace_pool();
4617 object_t oid
= CInode::get_object_name(in
->ino(), frag_t(), "");
4619 ObjectOperation fetch
;
4620 fetch
.getxattr("parent", bt
, bt_r
);
4621 in
->mdcache
->mds
->objecter
->read(oid
, object_locator_t(pool
), fetch
, CEPH_NOSNAP
,
4623 if (in
->mdcache
->mds
->logger
) {
4624 in
->mdcache
->mds
->logger
->inc(l_mds_openino_backtrace_fetch
);
4625 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_backtrace_fetch
);
4630 ObjectOperation scrub_tag
;
4632 encode(tag
, tag_bl
);
4633 scrub_tag
.setxattr("scrub_tag", tag_bl
);
4635 in
->mdcache
->mds
->objecter
->mutate(oid
, object_locator_t(pool
), scrub_tag
, snapc
,
4636 ceph::real_clock::now(),
4638 if (in
->mdcache
->mds
->logger
)
4639 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_set_tag
);
4643 bool _start(int rval
) {
4644 ceph_assert(in
->can_auth_pin());
4647 if (in
->is_dirty()) {
4648 MDCache
*mdcache
= in
->mdcache
; // For the benefit of dout
4649 auto ino
= [this]() { return in
->ino(); }; // For the benefit of dout
4650 dout(20) << "validating a dirty CInode; results will be inconclusive"
4654 C_OnFinisher
*conf
= new C_OnFinisher(get_io_callback(BACKTRACE
),
4655 in
->mdcache
->mds
->finisher
);
4657 std::string_view tag
= in
->scrub_infop
->header
->get_tag();
4658 bool is_internal
= in
->scrub_infop
->header
->is_internal_tag();
4659 // Rather than using the usual CInode::fetch_backtrace,
4660 // use a special variant that optionally writes a tag in the same
4662 fetch_backtrace_and_tag(in
, tag
, is_internal
, conf
, &results
->backtrace
.ondisk_read_retval
, &bl
);
4666 bool _backtrace(int rval
) {
4667 // set up basic result reporting and make sure we got the data
4668 results
->performed_validation
= true; // at least, some of it!
4669 results
->backtrace
.checked
= true;
4671 const int64_t pool
= in
->get_backtrace_pool();
4672 inode_backtrace_t
& memory_backtrace
= results
->backtrace
.memory_value
;
4673 in
->build_backtrace(pool
, memory_backtrace
);
4674 bool equivalent
, divergent
;
4677 MDCache
*mdcache
= in
->mdcache
; // For the benefit of dout
4678 auto ino
= [this]() { return in
->ino(); }; // For the benefit of dout
4680 // Ignore rval because it's the result of a FAILOK operation
4681 // from fetch_backtrace_and_tag: the real result is in
4682 // backtrace.ondisk_read_retval
4683 dout(20) << "ondisk_read_retval: " << results
->backtrace
.ondisk_read_retval
<< dendl
;
4684 if (results
->backtrace
.ondisk_read_retval
!= 0) {
4685 results
->backtrace
.error_str
<< "failed to read off disk; see retval";
4686 // we probably have a new unwritten file!
4687 // so skip the backtrace scrub for this entry and say that all's well
4688 if (in
->is_dirty_parent()) {
4689 dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl
;
4690 results
->backtrace
.passed
= true;
4695 // extract the backtrace, and compare it to a newly-constructed one
4697 auto p
= bl
.cbegin();
4699 decode(results
->backtrace
.ondisk_value
, p
);
4700 dout(10) << "decoded " << bl
.length() << " bytes of backtrace successfully" << dendl
;
4701 } catch (buffer::error
&) {
4702 if (results
->backtrace
.ondisk_read_retval
== 0 && rval
!= 0) {
4703 // Cases where something has clearly gone wrong with the overall
4704 // fetch op, though we didn't get a nonzero rc from the getxattr
4705 // operation. e.g. object missing.
4706 results
->backtrace
.ondisk_read_retval
= rval
;
4708 results
->backtrace
.error_str
<< "failed to decode on-disk backtrace ("
4709 << bl
.length() << " bytes)!";
4710 // we probably have a new unwritten file!
4711 // so skip the backtrace scrub for this entry and say that all's well
4712 if (in
->is_dirty_parent()) {
4713 dout(20) << "decode failed; forcing backtrace as passed since "
4714 "inode is dirty parent" << dendl
;
4715 results
->backtrace
.passed
= true;
4721 memory_newer
= memory_backtrace
.compare(results
->backtrace
.ondisk_value
,
4722 &equivalent
, &divergent
);
4724 if (divergent
|| memory_newer
< 0) {
4725 // we're divergent, or on-disk version is newer
4726 results
->backtrace
.error_str
<< "On-disk backtrace is divergent or newer";
4727 /* if the backtraces are divergent and the link count is 0, then
4728 * most likely its a stray entry that's being purged and things are
4729 * well and there's no reason for alarm
4731 if (divergent
&& (in
->is_dirty_parent() || in
->get_inode()->nlink
== 0)) {
4732 results
->backtrace
.passed
= true;
4733 dout(20) << "divergent backtraces are acceptable when dn "
4734 "is being purged or has been renamed or moved to a "
4735 "different directory " << *in
<< dendl
;
4738 results
->backtrace
.passed
= true;
4742 if (!results
->backtrace
.passed
&& in
->scrub_infop
->header
->get_repair()) {
4744 in
->make_path_string(path
);
4745 in
->mdcache
->mds
->clog
->warn() << "bad backtrace on inode " << in
->ino()
4746 << "(" << path
<< "), rewriting it";
4747 in
->mark_dirty_parent(in
->mdcache
->mds
->mdlog
->get_current_segment(),
4749 // Flag that we repaired this BT so that it won't go into damagetable
4750 results
->backtrace
.repaired
= true;
4751 if (in
->mdcache
->mds
->logger
)
4752 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_backtrace_repaired
);
4755 // If the inode's number was free in the InoTable, fix that
4758 InoTable
*inotable
= mdcache
->mds
->inotable
;
4760 dout(10) << "scrub: inotable ino = " << in
->ino() << dendl
;
4761 dout(10) << "scrub: inotable free says "
4762 << inotable
->is_marked_free(in
->ino()) << dendl
;
4764 if (inotable
->is_marked_free(in
->ino())) {
4765 LogChannelRef clog
= in
->mdcache
->mds
->clog
;
4766 clog
->error() << "scrub: inode wrongly marked free: " << in
->ino();
4768 if (in
->scrub_infop
->header
->get_repair()) {
4769 bool repaired
= inotable
->repair(in
->ino());
4771 clog
->error() << "inode table repaired for inode: " << in
->ino();
4774 if (in
->mdcache
->mds
->logger
)
4775 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_inotable_repaired
);
4777 clog
->error() << "Cannot repair inotable while other operations"
4786 if (in
->mdcache
->mds
->logger
)
4787 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_dir_inodes
);
4788 return validate_directory_data();
4790 if (in
->mdcache
->mds
->logger
)
4791 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_file_inodes
);
4792 // TODO: validate on-disk inode for normal files
4797 bool validate_directory_data() {
4798 ceph_assert(in
->is_dir());
4800 if (in
->is_base()) {
4802 shadow_in
= new CInode(in
->mdcache
);
4803 in
->mdcache
->create_unlinked_system_inode(shadow_in
, in
->ino(), in
->get_inode()->mode
);
4804 in
->mdcache
->num_shadow_inodes
++;
4806 shadow_in
->fetch(get_internal_callback(INODE
));
4807 if (in
->mdcache
->mds
->logger
)
4808 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_dir_base_inodes
);
4811 // TODO: validate on-disk inode for non-base directories
4812 if (in
->mdcache
->mds
->logger
)
4813 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_dirfrag_rstats
);
4814 results
->inode
.passed
= true;
4815 return check_dirfrag_rstats();
4819 bool _inode_disk(int rval
) {
4820 const auto& si
= shadow_in
->get_inode();
4821 const auto& i
= in
->get_inode();
4823 results
->inode
.checked
= true;
4824 results
->inode
.ondisk_read_retval
= rval
;
4825 results
->inode
.ondisk_value
= *si
;
4826 results
->inode
.memory_value
= *i
;
4828 if (si
->version
> i
->version
) {
4830 results
->inode
.error_str
<< "On-disk inode is newer than in-memory one; ";
4833 bool divergent
= false;
4834 int r
= i
->compare(*si
, &divergent
);
4835 results
->inode
.passed
= !divergent
&& r
>= 0;
4836 if (!results
->inode
.passed
) {
4837 results
->inode
.error_str
<<
4838 "On-disk inode is divergent or newer than in-memory one; ";
4843 return check_dirfrag_rstats();
4846 bool check_dirfrag_rstats() {
4847 if (in
->has_subtree_root_dirfrag()) {
4848 in
->mdcache
->rdlock_dirfrags_stats(in
, get_internal_callback(DIRFRAGS
));
4851 return immediate(DIRFRAGS
, 0);
4855 bool _dirfrags(int rval
) {
4856 // basic reporting setup
4857 results
->raw_stats
.checked
= true;
4858 results
->raw_stats
.ondisk_read_retval
= rval
;
4860 results
->raw_stats
.memory_value
.dirstat
= in
->get_inode()->dirstat
;
4861 results
->raw_stats
.memory_value
.rstat
= in
->get_inode()->rstat
;
4862 frag_info_t
& dir_info
= results
->raw_stats
.ondisk_value
.dirstat
;
4863 nest_info_t
& nest_info
= results
->raw_stats
.ondisk_value
.rstat
;
4866 results
->raw_stats
.error_str
<< "Failed to read dirfrags off disk";
4870 // check each dirfrag...
4871 for (const auto &p
: in
->dirfrags
) {
4872 CDir
*dir
= p
.second
;
4873 ceph_assert(dir
->get_version() > 0);
4874 nest_info
.add(dir
->get_fnode()->accounted_rstat
);
4875 dir_info
.add(dir
->get_fnode()->accounted_fragstat
);
4877 nest_info
.rsubdirs
++; // it gets one to account for self
4878 if (const sr_t
*srnode
= in
->get_projected_srnode(); srnode
)
4879 nest_info
.rsnaps
+= srnode
->snaps
.size();
4881 // ...and that their sum matches our inode settings
4882 if (!dir_info
.same_sums(in
->get_inode()->dirstat
) ||
4883 !nest_info
.same_sums(in
->get_inode()->rstat
)) {
4884 if (in
->scrub_infop
->header
->get_repair()) {
4885 results
->raw_stats
.error_str
4886 << "freshly-calculated rstats don't match existing ones (will be fixed)";
4887 in
->mdcache
->repair_inode_stats(in
);
4888 results
->raw_stats
.repaired
= true;
4890 results
->raw_stats
.error_str
4891 << "freshly-calculated rstats don't match existing ones";
4893 if (in
->is_dirty()) {
4894 MDCache
*mdcache
= in
->mdcache
; // for dout()
4895 auto ino
= [this]() { return in
->ino(); }; // for dout()
4896 dout(20) << "raw stats most likely wont match since inode is dirty; "
4897 "please rerun scrub when system is stable; "
4898 "assuming passed for now;" << dendl
;
4899 results
->raw_stats
.passed
= true;
4904 results
->raw_stats
.passed
= true;
4906 MDCache
*mdcache
= in
->mdcache
; // for dout()
4907 auto ino
= [this]() { return in
->ino(); }; // for dout()
4908 dout(20) << "raw stats check passed on " << *in
<< dendl
;
// Final continuation stage: fold the three sub-check results into
// results->passed_validation, flag repairs, complete the caller's context
// and drop our auth pin.
// NOTE(review): extraction-damaged — the closing brace (and possibly a
// null guard around `fin`) is not visible here. Code tokens are preserved
// as-is; only comments were added.
void _done() override {
  // a sub-check only counts against us if it actually ran ("checked")
  if ((!results->raw_stats.checked || results->raw_stats.passed) &&
      (!results->backtrace.checked || results->backtrace.passed) &&
      (!results->inode.checked || results->inode.passed))
    results->passed_validation = true;

  // Flag that we did some repair work so that our repair operation
  // can be flushed at end of scrub
  if (results->backtrace.repaired ||
      results->inode.repaired ||
      results->raw_stats.repaired)
    in->scrub_infop->header->set_repaired();

  fin->complete(get_rval());

  // release the auth pin taken when validation started
  in->auth_unpin(this);
// Fragment of CInode::validate_disk_state(): logs the start of validation
// and kicks off the ValidationContinuation state machine.
// NOTE(review): extraction-damaged — the constructor's remaining argument
// list and the rest of the enclosing function are truncated in this view.
dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
ValidationContinuation *vc = new ValidationContinuation(this,
4942 void CInode::validated_data::dump(Formatter
*f
) const
4944 f
->open_object_section("results");
4946 f
->dump_bool("performed_validation", performed_validation
);
4947 f
->dump_bool("passed_validation", passed_validation
);
4948 f
->open_object_section("backtrace");
4950 f
->dump_bool("checked", backtrace
.checked
);
4951 f
->dump_bool("passed", backtrace
.passed
);
4952 f
->dump_int("read_ret_val", backtrace
.ondisk_read_retval
);
4953 f
->dump_stream("ondisk_value") << backtrace
.ondisk_value
;
4954 f
->dump_stream("memoryvalue") << backtrace
.memory_value
;
4955 f
->dump_string("error_str", backtrace
.error_str
.str());
4957 f
->close_section(); // backtrace
4958 f
->open_object_section("raw_stats");
4960 f
->dump_bool("checked", raw_stats
.checked
);
4961 f
->dump_bool("passed", raw_stats
.passed
);
4962 f
->dump_int("read_ret_val", raw_stats
.ondisk_read_retval
);
4963 f
->dump_stream("ondisk_value.dirstat") << raw_stats
.ondisk_value
.dirstat
;
4964 f
->dump_stream("ondisk_value.rstat") << raw_stats
.ondisk_value
.rstat
;
4965 f
->dump_stream("memory_value.dirstat") << raw_stats
.memory_value
.dirstat
;
4966 f
->dump_stream("memory_value.rstat") << raw_stats
.memory_value
.rstat
;
4967 f
->dump_string("error_str", raw_stats
.error_str
.str());
4969 f
->close_section(); // raw_stats
4970 // dump failure return code
4972 if (backtrace
.checked
&& backtrace
.ondisk_read_retval
)
4973 rc
= backtrace
.ondisk_read_retval
;
4974 if (inode
.checked
&& inode
.ondisk_read_retval
)
4975 rc
= inode
.ondisk_read_retval
;
4976 if (raw_stats
.checked
&& raw_stats
.ondisk_read_retval
)
4977 rc
= raw_stats
.ondisk_read_retval
;
4978 f
->dump_int("return_code", rc
);
4980 f
->close_section(); // results
4983 bool CInode::validated_data::all_damage_repaired() const
4986 (raw_stats
.checked
&& !raw_stats
.passed
&& !raw_stats
.repaired
)
4988 (backtrace
.checked
&& !backtrace
.passed
&& !backtrace
.repaired
)
4990 (inode
.checked
&& !inode
.passed
&& !inode
.repaired
);
// Dump a Formatter representation of this inode; `flags` selects which
// sections are emitted (path, inode store, cache object, locks, state bits,
// client caps, dirfrags).
// NOTE(review): extraction-damaged — the local `path` declaration, most of
// the per-lock `.dump(f)`/`close_section()` pairs, several brace and
// `close_section()` lines are not visible here. Code tokens are preserved
// as-is; only comments were added.
void CInode::dump(Formatter *f, int flags) const
  if (flags & DUMP_PATH) {
    // (declaration of the local `path` string is missing from this view)
    make_path_string(path, true);
    f->dump_string("path", path);

  if (flags & DUMP_INODE_STORE_BASE)
    InodeStoreBase::dump(f);

  if (flags & DUMP_MDS_CACHE_OBJECT)
    MDSCacheObject::dump(f);

  if (flags & DUMP_LOCKS) {
    // one object section per lock; most `.dump(f)` + close_section() lines
    // are missing from this view
    f->open_object_section("versionlock");
    versionlock.dump(f);
    f->open_object_section("authlock");
    f->open_object_section("linklock");
    f->open_object_section("dirfragtreelock");
    dirfragtreelock.dump(f);
    f->open_object_section("filelock");
    f->open_object_section("xattrlock");
    f->open_object_section("snaplock");
    f->open_object_section("nestlock");
    f->open_object_section("flocklock");
    f->open_object_section("policylock");

  if (flags & DUMP_STATE) {
    // emit one "state" entry per set state bit
    f->open_array_section("states");
    MDSCacheObject::dump_states(f);
    if (state_test(STATE_EXPORTING))
      f->dump_string("state", "exporting");
    if (state_test(STATE_OPENINGDIR))
      f->dump_string("state", "openingdir");
    if (state_test(STATE_FREEZING))
      f->dump_string("state", "freezing");
    if (state_test(STATE_FROZEN))
      f->dump_string("state", "frozen");
    if (state_test(STATE_AMBIGUOUSAUTH))
      f->dump_string("state", "ambiguousauth");
    if (state_test(STATE_EXPORTINGCAPS))
      f->dump_string("state", "exportingcaps");
    if (state_test(STATE_NEEDSRECOVER))
      f->dump_string("state", "needsrecover");
    if (state_test(STATE_PURGING))
      f->dump_string("state", "purging");
    if (state_test(STATE_DIRTYPARENT))
      f->dump_string("state", "dirtyparent");
    if (state_test(STATE_DIRTYRSTAT))
      f->dump_string("state", "dirtyrstat");
    if (state_test(STATE_STRAYPINNED))
      f->dump_string("state", "straypinned");
    if (state_test(STATE_FROZENAUTHPIN))
      f->dump_string("state", "frozenauthpin");
    if (state_test(STATE_DIRTYPOOL))
      f->dump_string("state", "dirtypool");
    if (state_test(STATE_ORPHAN))
      f->dump_string("state", "orphan");
    if (state_test(STATE_MISSINGOBJS))
      f->dump_string("state", "missingobjs");

  if (flags & DUMP_CAPS) {
    // one "client_cap" object per client holding caps on this inode
    f->open_array_section("client_caps");
    for (const auto &p : client_caps) {
      auto &client = p.first;
      auto cap = &p.second;
      f->open_object_section("client_cap");
      f->dump_int("client_id", client.v);
      f->dump_string("pending", ccap_string(cap->pending()));
      f->dump_string("issued", ccap_string(cap->issued()));
      f->dump_string("wanted", ccap_string(cap->wanted()));
      f->dump_int("last_sent", cap->get_last_seq());

    f->dump_int("loner", loner_cap.v);
    f->dump_int("want_loner", want_loner_cap.v);

    f->open_array_section("mds_caps_wanted");
    for (const auto &p : mds_caps_wanted) {
      f->open_object_section("mds_cap_wanted");
      f->dump_int("rank", p.first);
      f->dump_string("cap", ccap_string(p.second));

  if (flags & DUMP_DIRFRAGS) {
    f->open_array_section("dirfrags");
    auto&& dfs = get_dirfrags();
    for(const auto &dir : dfs) {
      f->open_object_section("dir");
      dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
      // sanity-check fragment rstats while we are here
      dir->check_rstats();
5130 /****** Scrub Stuff *****/
5131 void CInode::scrub_info_create() const
5133 dout(25) << __func__
<< dendl
;
5134 ceph_assert(!scrub_infop
);
5136 // break out of const-land to set up implicit initial state
5137 CInode
*me
= const_cast<CInode
*>(this);
5138 const auto& pi
= me
->get_projected_inode();
5140 std::unique_ptr
<scrub_info_t
> si(new scrub_info_t());
5141 si
->last_scrub_stamp
= pi
->last_scrub_stamp
;
5142 si
->last_scrub_version
= pi
->last_scrub_version
;
5144 me
->scrub_infop
.swap(si
);
5147 void CInode::scrub_maybe_delete_info()
5150 !scrub_infop
->scrub_in_progress
&&
5151 !scrub_infop
->last_scrub_dirty
) {
5152 scrub_infop
.reset();
5156 void CInode::scrub_initialize(ScrubHeaderRef
& header
)
5158 dout(20) << __func__
<< " with scrub_version " << get_version() << dendl
;
5161 scrub_infop
->scrub_in_progress
= true;
5162 scrub_infop
->queued_frags
.clear();
5163 scrub_infop
->header
= header
;
5164 header
->inc_num_pending();
5165 // right now we don't handle remote inodes
5168 void CInode::scrub_aborted() {
5169 dout(20) << __func__
<< dendl
;
5170 ceph_assert(scrub_is_in_progress());
5172 scrub_infop
->scrub_in_progress
= false;
5173 scrub_infop
->header
->dec_num_pending();
5174 scrub_maybe_delete_info();
5177 void CInode::scrub_finished() {
5178 dout(20) << __func__
<< dendl
;
5179 ceph_assert(scrub_is_in_progress());
5181 scrub_infop
->last_scrub_version
= get_version();
5182 scrub_infop
->last_scrub_stamp
= ceph_clock_now();
5183 scrub_infop
->last_scrub_dirty
= true;
5184 scrub_infop
->scrub_in_progress
= false;
5185 scrub_infop
->header
->dec_num_pending();
5188 int64_t CInode::get_backtrace_pool() const
5191 return mdcache
->mds
->get_metadata_pool();
5193 // Files are required to have an explicit layout that specifies
5195 ceph_assert(get_inode()->layout
.pool_id
!= -1);
5196 return get_inode()->layout
.pool_id
;
// Queue this inode on mdcache->export_pin_queue when any of its auth
// dirfrags needs (re)export to honor the effective pin.
// NOTE(review): extraction-damaged — the local declarations (`target`,
// `queue`), the early return, per-iteration `continue`/`break` lines,
// `else` keywords and several braces are not visible here. Code tokens are
// preserved as-is; only comments were added.
void CInode::queue_export_pin(mds_rank_t export_pin)
  if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
  // (early return presumably follows — already queued)

  // resolve the effective target rank for this pin kind
  if (export_pin >= 0)
    target = export_pin;
  else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
    target = mdcache->hash_into_rank_bucket(ino());
  // (final else-branch head missing from this view)
    target = MDS_RANK_NONE;

  unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
  for (auto& p : dirfrags) {
    CDir *dir = p.second;
    // only auth fragments are our responsibility to (re)export
    if (!dir->is_auth())
    // (continue presumably follows)

    if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
      if (dir->get_frag().bits() < min_frag_bits) {
      // distributed pins hash each fragment to its own rank bucket
      target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag());

    if (target != MDS_RANK_NONE) {
      if (dir->is_subtree_root()) {
        // set auxsubtree bit or export it
        if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
            target != dir->get_dir_auth().first)
        // create aux subtree or export it
    // clear aux subtrees ?
    queue = dir->state_test(CDir::STATE_AUXSUBTREE);

  state_set(CInode::STATE_QUEUEDEXPORTPIN);
  mdcache->export_pin_queue.insert(this);
5253 void CInode::maybe_export_pin(bool update
)
5255 if (!g_conf()->mds_bal_export_pin
)
5257 if (!is_dir() || !is_normal())
5260 dout(15) << __func__
<< " update=" << update
<< " " << *this << dendl
;
5262 mds_rank_t export_pin
= get_export_pin(false);
5263 if (export_pin
== MDS_RANK_NONE
&& !update
)
5266 check_pin_policy(export_pin
);
5267 queue_export_pin(export_pin
);
5270 void CInode::set_ephemeral_pin(bool dist
, bool rand
)
5274 state
|= STATE_DISTEPHEMERALPIN
;
5276 state
|= STATE_RANDEPHEMERALPIN
;
5280 if (state_test(state
) != state
) {
5281 dout(10) << "set ephemeral (" << (dist
? "dist" : "")
5282 << (rand
? " rand" : "") << ") pin on " << *this << dendl
;
5283 if (!is_ephemerally_pinned()) {
5284 auto p
= mdcache
->export_ephemeral_pins
.insert(this);
5285 ceph_assert(p
.second
);
5291 void CInode::clear_ephemeral_pin(bool dist
, bool rand
)
5295 state
|= STATE_DISTEPHEMERALPIN
;
5297 state
|= STATE_RANDEPHEMERALPIN
;
5299 if (state_test(state
)) {
5300 dout(10) << "clear ephemeral (" << (dist
? "dist" : "")
5301 << (rand
? " rand" : "") << ") pin on " << *this << dendl
;
5303 if (!is_ephemerally_pinned()) {
5304 auto count
= mdcache
->export_ephemeral_pins
.erase(this);
5305 ceph_assert(count
== 1);
5310 void CInode::maybe_ephemeral_rand(double threshold
)
5312 if (!mdcache
->get_export_ephemeral_random_config()) {
5313 dout(15) << __func__
<< " config false: cannot ephemeral random pin " << *this << dendl
;
5314 clear_ephemeral_pin(false, true);
5316 } else if (!is_dir() || !is_normal()) {
5317 dout(15) << __func__
<< " !dir or !normal: cannot ephemeral random pin " << *this << dendl
;
5318 clear_ephemeral_pin(false, true);
5320 } else if (get_inode()->nlink
== 0) {
5321 dout(15) << __func__
<< " unlinked directory: cannot ephemeral random pin " << *this << dendl
;
5322 clear_ephemeral_pin(false, true);
5324 } else if (state_test(CInode::STATE_RANDEPHEMERALPIN
)) {
5325 dout(10) << __func__
<< " already ephemeral random pinned: requeueing " << *this << dendl
;
5326 queue_export_pin(MDS_RANK_EPHEMERAL_RAND
);
5330 /* not precomputed? */
5331 if (threshold
< 0.0) {
5332 threshold
= get_ephemeral_rand();
5334 if (threshold
<= 0.0) {
5337 double n
= ceph::util::generate_random_number(0.0, 1.0);
5339 dout(15) << __func__
<< " rand " << n
<< " <?= " << threshold
5340 << " " << *this << dendl
;
5342 if (n
<= threshold
) {
5343 dout(10) << __func__
<< " randomly export pinning " << *this << dendl
;
5344 set_ephemeral_pin(false, true);
5345 queue_export_pin(MDS_RANK_EPHEMERAL_RAND
);
5349 void CInode::setxattr_ephemeral_rand(double probability
)
5351 ceph_assert(is_dir());
5352 _get_projected_inode()->export_ephemeral_random_pin
= probability
;
5355 void CInode::setxattr_ephemeral_dist(bool val
)
5357 ceph_assert(is_dir());
5358 _get_projected_inode()->export_ephemeral_distributed_pin
= val
;
5361 void CInode::set_export_pin(mds_rank_t rank
)
5363 ceph_assert(is_dir());
5364 _get_projected_inode()->export_pin
= rank
;
5365 maybe_export_pin(true);
// Compute the effective export pin for this inode by walking up the parent
// chain: explicit pin wins, then ephemeral distributed, then ephemeral
// random.
// NOTE(review): extraction-damaged — the traversal loop head, the
// break/return lines after the guards, the `if (dir)` around the hashed
// return, and several braces are not visible here. Code tokens are
// preserved as-is; only comments were added.
mds_rank_t CInode::get_export_pin(bool inherit) const
  if (!g_conf()->mds_bal_export_pin)
    return MDS_RANK_NONE;

  /* An inode that is export pinned may not necessarily be a subtree root, we
   * need to traverse the parents. A base or system inode cannot be pinned.
   * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
   * have a parent yet.
   */
  mds_rank_t r_target = MDS_RANK_NONE;
  const CInode *in = this;
  const CDir *dir = nullptr;
  // (ancestor-walk loop head missing from this view)
  if (in->is_system())
  // (loop exit presumably follows — system inodes cannot be pinned)
  const CDentry *pdn = in->get_parent_dn();
  if (in->get_inode()->nlink == 0) {
    // ignore export pin for unlinked directory

  if (in->get_inode()->export_pin >= 0) {
    return in->get_inode()->export_pin;
  } else if (in->get_inode()->export_ephemeral_distributed_pin &&
             mdcache->get_export_ephemeral_distributed_config()) {
    // (the condition selecting between these two returns — presumably on
    // `dir` being known — is missing from this view)
    return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
    return MDS_RANK_EPHEMERAL_DIST;
  } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) {
  } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() &&
             mdcache->get_export_ephemeral_random_config()) {
    /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
    return MDS_RANK_EPHEMERAL_RAND;
    r_target = MDS_RANK_EPHEMERAL_RAND;
    r_target = mdcache->hash_into_rank_bucket(in->ino());

  // move one level up the tree for the next iteration
  dir = pdn->get_dir();

  return MDS_RANK_NONE;
5420 void CInode::check_pin_policy(mds_rank_t export_pin
)
5422 if (export_pin
== MDS_RANK_EPHEMERAL_DIST
) {
5423 set_ephemeral_pin(true, false);
5424 clear_ephemeral_pin(false, true);
5425 } else if (export_pin
== MDS_RANK_EPHEMERAL_RAND
) {
5426 set_ephemeral_pin(false, true);
5427 clear_ephemeral_pin(true, false);
5428 } else if (is_ephemerally_pinned()) {
5429 // export_pin >= 0 || export_pin == MDS_RANK_NONE
5430 clear_ephemeral_pin(true, true);
5431 if (export_pin
!= get_inode()->export_pin
) // inherited export_pin
5432 queue_export_pin(MDS_RANK_NONE
);
// Effective ephemeral-random-pin probability for this inode, found by
// walking up the parent chain and capped at
// mdcache->export_ephemeral_random_max.
// NOTE(review): extraction-damaged — the traversal loop head, the
// `return 0.0;` lines after the guards, and several braces are not visible
// here. Code tokens are preserved as-is; only comments were added.
double CInode::get_ephemeral_rand() const
  /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
   * have a parent yet.
   */
  const CInode *in = this;
  double max = mdcache->export_ephemeral_random_max;
  // (ancestor-walk loop head missing from this view)
  if (in->is_system())
  // (return presumably follows — system inodes are never pinned)
  const CDentry *pdn = in->get_parent_dn();

  // ignore export pin for unlinked directory
  if (in->get_inode()->nlink == 0)
  // (return presumably follows)

  if (in->get_inode()->export_ephemeral_random_pin > 0.0)
    return std::min(in->get_inode()->export_ephemeral_random_pin, max);

  /* An export_pin overrides only if no closer parent (incl. this one) has a
   * ... (comment continuation truncated in this view) */
  if (in->get_inode()->export_pin >= 0 ||
      in->get_inode()->export_ephemeral_distributed_pin)
  // (return presumably follows)

  // move one level up the tree for the next iteration
  in = pdn->get_dir()->inode;
5468 void CInode::get_nested_dirfrags(std::vector
<CDir
*>& v
) const
5470 for (const auto &p
: dirfrags
) {
5471 const auto& dir
= p
.second
;
5472 if (!dir
->is_subtree_root())
5477 void CInode::get_subtree_dirfrags(std::vector
<CDir
*>& v
) const
5479 for (const auto &p
: dirfrags
) {
5480 const auto& dir
= p
.second
;
5481 if (dir
->is_subtree_root())
// Register CInode with the mds_co mempool (object factory under the
// "co_inode" tag).
MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);