// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
15 #include "include/int_types.h"
16 #include "common/errno.h"
30 #include "events/EUpdate.h"
32 #include "osdc/Objecter.h"
36 #include "LogSegment.h"
38 #include "common/Clock.h"
40 #include "common/config.h"
41 #include "global/global_context.h"
42 #include "include/ceph_assert.h"
44 #include "mds/MDSContinuation.h"
45 #include "mds/InoTable.h"
46 #include "cephfs_features.h"
47 #include "osdc/Objecter.h"
49 #define dout_context g_ceph_context
50 #define dout_subsys ceph_subsys_mds
52 #define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "
56 void CInodeCommitOperation::update(ObjectOperation
&op
, inode_backtrace_t
&bt
) {
59 op
.priority
= priority
;
63 encode(bt
, parent_bl
);
64 op
.setxattr("parent", parent_bl
);
66 // for the old pool there is no need to update the layout and symlink
67 if (!update_layout_symlink
)
71 encode(_layout
, layout_bl
, _features
);
72 op
.setxattr("layout", layout_bl
);
74 if (!_symlink
.empty()) {
75 bufferlist symlink_bl
;
76 encode(_symlink
, symlink_bl
);
77 op
.setxattr("symlink", symlink_bl
);
81 class CInodeIOContext
: public MDSIOContextBase
85 MDSRank
*get_mds() override
{return in
->mdcache
->mds
;}
87 explicit CInodeIOContext(CInode
*in_
) : in(in_
) {
88 ceph_assert(in
!= NULL
);
92 sr_t
* const CInode::projected_inode::UNDEF_SRNODE
= (sr_t
*)(unsigned long)-1;
94 LockType
CInode::versionlock_type(CEPH_LOCK_IVERSION
);
95 LockType
CInode::authlock_type(CEPH_LOCK_IAUTH
);
96 LockType
CInode::linklock_type(CEPH_LOCK_ILINK
);
97 LockType
CInode::dirfragtreelock_type(CEPH_LOCK_IDFT
);
98 LockType
CInode::filelock_type(CEPH_LOCK_IFILE
);
99 LockType
CInode::xattrlock_type(CEPH_LOCK_IXATTR
);
100 LockType
CInode::snaplock_type(CEPH_LOCK_ISNAP
);
101 LockType
CInode::nestlock_type(CEPH_LOCK_INEST
);
102 LockType
CInode::flocklock_type(CEPH_LOCK_IFLOCK
);
103 LockType
CInode::policylock_type(CEPH_LOCK_IPOLICY
);
105 std::string_view
CInode::pin_name(int p
) const
108 case PIN_DIRFRAG
: return "dirfrag";
109 case PIN_CAPS
: return "caps";
110 case PIN_IMPORTING
: return "importing";
111 case PIN_OPENINGDIR
: return "openingdir";
112 case PIN_REMOTEPARENT
: return "remoteparent";
113 case PIN_BATCHOPENJOURNAL
: return "batchopenjournal";
114 case PIN_SCATTERED
: return "scattered";
115 case PIN_STICKYDIRS
: return "stickydirs";
116 //case PIN_PURGING: return "purging";
117 case PIN_FREEZING
: return "freezing";
118 case PIN_FROZEN
: return "frozen";
119 case PIN_IMPORTINGCAPS
: return "importingcaps";
120 case PIN_EXPORTINGCAPS
: return "exportingcaps";
121 case PIN_PASTSNAPPARENT
: return "pastsnapparent";
122 case PIN_OPENINGSNAPPARENTS
: return "openingsnapparents";
123 case PIN_TRUNCATING
: return "truncating";
124 case PIN_STRAY
: return "stray";
125 case PIN_NEEDSNAPFLUSH
: return "needsnapflush";
126 case PIN_DIRTYRSTAT
: return "dirtyrstat";
127 case PIN_DIRTYPARENT
: return "dirtyparent";
128 case PIN_DIRWAITER
: return "dirwaiter";
129 default: return generic_pin_name(p
);
133 //int cinode_pins[CINODE_NUM_PINS]; // counts
134 ostream
& CInode::print_db_line_prefix(ostream
& out
) const
136 return out
<< ceph_clock_now() << " mds." << mdcache
->mds
->get_nodeid() << ".cache.ino(" << ino() << ") ";
140 * write caps and lock ids
142 struct cinode_lock_info_t cinode_lock_info
[] = {
143 { CEPH_LOCK_IFILE
, CEPH_CAP_ANY_FILE_WR
},
144 { CEPH_LOCK_IAUTH
, CEPH_CAP_AUTH_EXCL
},
145 { CEPH_LOCK_ILINK
, CEPH_CAP_LINK_EXCL
},
146 { CEPH_LOCK_IXATTR
, CEPH_CAP_XATTR_EXCL
},
148 int num_cinode_locks
= sizeof(cinode_lock_info
) / sizeof(cinode_lock_info
[0]);
150 ostream
& operator<<(ostream
& out
, const CInode
& in
)
153 in
.make_path_string(path
, true);
155 out
<< "[inode " << in
.ino();
157 << (in
.is_multiversion() ? "...":"")
158 << in
.first
<< "," << in
.last
<< "]";
159 out
<< " " << path
<< (in
.is_dir() ? "/":"");
163 if (in
.is_replicated())
164 out
<< in
.get_replicas();
166 mds_authority_t a
= in
.authority();
167 out
<< " rep@" << a
.first
;
168 if (a
.second
!= CDIR_AUTH_UNKNOWN
)
169 out
<< "," << a
.second
;
170 out
<< "." << in
.get_replica_nonce();
174 out
<< " symlink='" << in
.symlink
<< "'";
175 if (in
.is_dir() && !in
.dirfragtree
.empty())
176 out
<< " " << in
.dirfragtree
;
178 out
<< " v" << in
.get_version();
179 if (in
.get_projected_version() > in
.get_version())
180 out
<< " pv" << in
.get_projected_version();
182 if (in
.get_num_auth_pins()) {
183 out
<< " ap=" << in
.get_num_auth_pins();
184 #ifdef MDS_AUTHPIN_SET
185 in
.print_authpin_set(out
);
190 out
<< " snaprealm=" << in
.snaprealm
;
192 if (in
.state_test(CInode::STATE_AMBIGUOUSAUTH
)) out
<< " AMBIGAUTH";
193 if (in
.state_test(CInode::STATE_NEEDSRECOVER
)) out
<< " NEEDSRECOVER";
194 if (in
.state_test(CInode::STATE_RECOVERING
)) out
<< " RECOVERING";
195 if (in
.state_test(CInode::STATE_DIRTYPARENT
)) out
<< " DIRTYPARENT";
196 if (in
.state_test(CInode::STATE_MISSINGOBJS
)) out
<< " MISSINGOBJS";
197 if (in
.is_ephemeral_dist()) out
<< " DISTEPHEMERALPIN";
198 if (in
.is_ephemeral_rand()) out
<< " RANDEPHEMERALPIN";
199 if (in
.is_freezing_inode()) out
<< " FREEZING=" << in
.auth_pin_freeze_allowance
;
200 if (in
.is_frozen_inode()) out
<< " FROZEN";
201 if (in
.is_frozen_auth_pin()) out
<< " FROZEN_AUTHPIN";
203 const auto& pi
= in
.get_projected_inode();
204 if (pi
->is_truncating())
205 out
<< " truncating(" << pi
->truncate_from
<< " to " << pi
->truncate_size
<< ")";
208 out
<< " " << in
.get_inode()->dirstat
;
209 if (g_conf()->mds_debug_scatterstat
&& in
.is_projected()) {
210 out
<< "->" << pi
->dirstat
;
213 out
<< " s=" << in
.get_inode()->size
;
214 if (in
.get_inode()->nlink
!= 1)
215 out
<< " nl=" << in
.get_inode()->nlink
;
219 out
<< " " << in
.get_inode()->rstat
;
220 if (!(in
.get_inode()->rstat
== in
.get_inode()->accounted_rstat
))
221 out
<< "/" << in
.get_inode()->accounted_rstat
;
222 if (g_conf()->mds_debug_scatterstat
&& in
.is_projected()) {
223 out
<< "->" << pi
->rstat
;
224 if (!(pi
->rstat
== pi
->accounted_rstat
))
225 out
<< "/" << pi
->accounted_rstat
;
228 if (in
.is_any_old_inodes()) {
229 out
<< " old_inodes=" << in
.get_old_inodes()->size();
232 if (!in
.client_need_snapflush
.empty())
233 out
<< " need_snapflush=" << in
.client_need_snapflush
;
236 if (!in
.authlock
.is_sync_and_unlocked())
237 out
<< " " << in
.authlock
;
238 if (!in
.linklock
.is_sync_and_unlocked())
239 out
<< " " << in
.linklock
;
240 if (in
.get_inode()->is_dir()) {
241 if (!in
.dirfragtreelock
.is_sync_and_unlocked())
242 out
<< " " << in
.dirfragtreelock
;
243 if (!in
.snaplock
.is_sync_and_unlocked())
244 out
<< " " << in
.snaplock
;
245 if (!in
.nestlock
.is_sync_and_unlocked())
246 out
<< " " << in
.nestlock
;
247 if (!in
.policylock
.is_sync_and_unlocked())
248 out
<< " " << in
.policylock
;
250 if (!in
.flocklock
.is_sync_and_unlocked())
251 out
<< " " << in
.flocklock
;
253 if (!in
.filelock
.is_sync_and_unlocked())
254 out
<< " " << in
.filelock
;
255 if (!in
.xattrlock
.is_sync_and_unlocked())
256 out
<< " " << in
.xattrlock
;
257 if (!in
.versionlock
.is_sync_and_unlocked())
258 out
<< " " << in
.versionlock
;
260 // hack: spit out crap on which clients have caps
261 if (in
.get_inode()->client_ranges
.size())
262 out
<< " cr=" << in
.get_inode()->client_ranges
;
264 if (!in
.get_client_caps().empty()) {
267 for (const auto &p
: in
.get_client_caps()) {
268 if (!first
) out
<< ",";
269 out
<< p
.first
<< "="
270 << ccap_string(p
.second
.pending());
271 if (p
.second
.issued() != p
.second
.pending())
272 out
<< "/" << ccap_string(p
.second
.issued());
273 out
<< "/" << ccap_string(p
.second
.wanted())
274 << "@" << p
.second
.get_last_seq();
278 if (in
.get_loner() >= 0 || in
.get_wanted_loner() >= 0) {
279 out
<< ",l=" << in
.get_loner();
280 if (in
.get_loner() != in
.get_wanted_loner())
281 out
<< "(" << in
.get_wanted_loner() << ")";
284 if (!in
.get_mds_caps_wanted().empty()) {
287 for (const auto &p
: in
.get_mds_caps_wanted()) {
290 out
<< p
.first
<< '=' << ccap_string(p
.second
);
296 if (in
.get_num_ref()) {
298 in
.print_pin_set(out
);
301 if (in
.get_inode()->export_pin
!= MDS_RANK_NONE
) {
302 out
<< " export_pin=" << in
.get_inode()->export_pin
;
304 if (in
.state_test(CInode::STATE_DISTEPHEMERALPIN
)) {
307 if (in
.state_test(CInode::STATE_RANDEPHEMERALPIN
)) {
316 CInode::CInode(MDCache
*c
, bool auth
, snapid_t f
, snapid_t l
) :
317 mdcache(c
), first(f
), last(l
),
320 item_open_file(this),
321 item_dirty_parent(this),
322 item_dirty_dirfrag_dir(this),
323 item_dirty_dirfrag_nest(this),
324 item_dirty_dirfrag_dirfragtree(this),
326 versionlock(this, &versionlock_type
),
327 authlock(this, &authlock_type
),
328 linklock(this, &linklock_type
),
329 dirfragtreelock(this, &dirfragtreelock_type
),
330 filelock(this, &filelock_type
),
331 xattrlock(this, &xattrlock_type
),
332 snaplock(this, &snaplock_type
),
333 nestlock(this, &nestlock_type
),
334 flocklock(this, &flocklock_type
),
335 policylock(this, &policylock_type
)
338 state_set(STATE_AUTH
);
341 void CInode::print(ostream
& out
) const
346 void CInode::add_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
)
348 dout(10) << __func__
<< " client." << client
<< " snapid " << snapid
<< " on " << snapin
<< dendl
;
350 if (client_need_snapflush
.empty()) {
351 get(CInode::PIN_NEEDSNAPFLUSH
);
353 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
354 // long periods waiting for clients to flush their snaps.
355 auth_pin(this); // pin head get_inode()->..
358 auto &clients
= client_need_snapflush
[snapid
];
360 snapin
->auth_pin(this); // ...and pin snapped/old inode!
362 clients
.insert(client
);
365 void CInode::remove_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
)
367 dout(10) << __func__
<< " client." << client
<< " snapid " << snapid
<< " on " << snapin
<< dendl
;
368 auto it
= client_need_snapflush
.find(snapid
);
369 if (it
== client_need_snapflush
.end()) {
370 dout(10) << " snapid not found" << dendl
;
373 size_t n
= it
->second
.erase(client
);
375 dout(10) << " client not found" << dendl
;
378 if (it
->second
.empty()) {
379 client_need_snapflush
.erase(it
);
380 snapin
->auth_unpin(this);
382 if (client_need_snapflush
.empty()) {
383 put(CInode::PIN_NEEDSNAPFLUSH
);
389 pair
<bool,bool> CInode::split_need_snapflush(CInode
*cowin
, CInode
*in
)
391 dout(10) << __func__
<< " [" << cowin
->first
<< "," << cowin
->last
<< "] for " << *cowin
<< dendl
;
392 bool cowin_need_flush
= false;
393 bool orig_need_flush
= false;
394 auto it
= client_need_snapflush
.lower_bound(cowin
->first
);
395 while (it
!= client_need_snapflush
.end() && it
->first
< in
->first
) {
396 ceph_assert(!it
->second
.empty());
397 if (cowin
->last
>= it
->first
) {
398 cowin
->auth_pin(this);
399 cowin_need_flush
= true;
402 it
= client_need_snapflush
.erase(it
);
404 in
->auth_unpin(this);
407 if (it
!= client_need_snapflush
.end() && it
->first
<= in
->last
)
408 orig_need_flush
= true;
410 return make_pair(cowin_need_flush
, orig_need_flush
);
413 void CInode::mark_dirty_rstat()
415 if (!state_test(STATE_DIRTYRSTAT
)) {
416 dout(10) << __func__
<< dendl
;
417 state_set(STATE_DIRTYRSTAT
);
419 CDentry
*pdn
= get_projected_parent_dn();
420 if (pdn
->is_auth()) {
421 CDir
*pdir
= pdn
->dir
;
422 pdir
->dirty_rstat_inodes
.push_back(&dirty_rstat_item
);
423 mdcache
->mds
->locker
->mark_updated_scatterlock(&pdir
->inode
->nestlock
);
425 // under cross-MDS rename.
426 // DIRTYRSTAT flag will get cleared when rename finishes
427 ceph_assert(state_test(STATE_AMBIGUOUSAUTH
));
431 void CInode::clear_dirty_rstat()
433 if (state_test(STATE_DIRTYRSTAT
)) {
434 dout(10) << __func__
<< dendl
;
435 state_clear(STATE_DIRTYRSTAT
);
437 dirty_rstat_item
.remove_myself();
441 CInode::projected_inode
CInode::project_inode(const MutationRef
& mut
,
442 bool xattr
, bool snap
)
444 if (mut
&& mut
->is_projected(this)) {
445 ceph_assert(!xattr
&& !snap
);
446 auto _inode
= std::const_pointer_cast
<mempool_inode
>(projected_nodes
.back().inode
);
447 return projected_inode(std::move(_inode
), xattr_map_ptr());
450 auto pi
= allocate_inode(*get_projected_inode());
452 if (scrub_infop
&& scrub_infop
->last_scrub_dirty
) {
453 pi
->last_scrub_stamp
= scrub_infop
->last_scrub_stamp
;
454 pi
->last_scrub_version
= scrub_infop
->last_scrub_version
;
455 scrub_infop
->last_scrub_dirty
= false;
456 scrub_maybe_delete_info();
459 const auto& ox
= get_projected_xattrs();
462 px
= allocate_xattr_map();
467 sr_t
* ps
= projected_inode::UNDEF_SRNODE
;
469 ps
= prepare_new_srnode(0);
470 ++num_projected_srnodes
;
473 projected_nodes
.emplace_back(pi
, xattr
? px
: ox
, ps
);
475 mut
->add_projected_node(this);
476 dout(15) << __func__
<< " " << pi
->ino
<< dendl
;
477 return projected_inode(std::move(pi
), std::move(px
), ps
);
480 void CInode::pop_and_dirty_projected_inode(LogSegment
*ls
, const MutationRef
& mut
)
482 ceph_assert(!projected_nodes
.empty());
483 auto front
= std::move(projected_nodes
.front());
484 dout(15) << __func__
<< " v" << front
.inode
->version
<< dendl
;
486 projected_nodes
.pop_front();
488 mut
->remove_projected_node(this);
490 bool pool_updated
= get_inode()->layout
.pool_id
!= front
.inode
->layout
.pool_id
;
491 bool pin_updated
= (get_inode()->export_pin
!= front
.inode
->export_pin
) ||
492 (get_inode()->export_ephemeral_distributed_pin
!=
493 front
.inode
->export_ephemeral_distributed_pin
);
495 reset_inode(std::move(front
.inode
));
496 if (front
.xattrs
!= get_xattrs())
497 reset_xattrs(std::move(front
.xattrs
));
499 if (front
.snapnode
!= projected_inode::UNDEF_SRNODE
) {
500 --num_projected_srnodes
;
501 pop_projected_snaprealm(front
.snapnode
, false);
505 if (get_inode()->is_backtrace_updated())
506 mark_dirty_parent(ls
, pool_updated
);
509 maybe_export_pin(true);
512 sr_t
*CInode::prepare_new_srnode(snapid_t snapid
)
514 const sr_t
*cur_srnode
= get_projected_srnode();
518 new_srnode
= new sr_t(*cur_srnode
);
521 snapid
= mdcache
->get_global_snaprealm()->get_newest_seq();
522 new_srnode
= new sr_t();
523 new_srnode
->seq
= snapid
;
524 new_srnode
->created
= snapid
;
525 new_srnode
->current_parent_since
= get_oldest_snap();
526 SnapRealm
*sr
= find_snaprealm();
527 dout(20) << __func__
<< ": inheriting change_attr from " << *sr
529 new_srnode
->change_attr
= sr
->srnode
.change_attr
;
534 const sr_t
*CInode::get_projected_srnode() const {
535 if (num_projected_srnodes
> 0) {
536 for (auto it
= projected_nodes
.rbegin(); it
!= projected_nodes
.rend(); ++it
)
537 if (it
->snapnode
!= projected_inode::UNDEF_SRNODE
)
541 return &snaprealm
->srnode
;
546 void CInode::project_snaprealm(sr_t
*new_srnode
)
548 dout(10) << __func__
<< " " << new_srnode
<< dendl
;
549 ceph_assert(projected_nodes
.back().snapnode
== projected_inode::UNDEF_SRNODE
);
550 projected_nodes
.back().snapnode
= new_srnode
;
551 ++num_projected_srnodes
;
554 void CInode::mark_snaprealm_global(sr_t
*new_srnode
)
556 ceph_assert(!is_dir());
557 // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
558 new_srnode
->last_destroyed
= new_srnode
->current_parent_since
;
559 new_srnode
->current_parent_since
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
560 new_srnode
->mark_parent_global();
563 void CInode::clear_snaprealm_global(sr_t
*new_srnode
)
565 // restore 'current_parent_since'
566 new_srnode
->current_parent_since
= new_srnode
->last_destroyed
;
567 new_srnode
->last_destroyed
= 0;
568 new_srnode
->seq
= mdcache
->get_global_snaprealm()->get_newest_seq();
569 new_srnode
->clear_parent_global();
572 bool CInode::is_projected_snaprealm_global() const
574 const sr_t
*srnode
= get_projected_srnode();
575 if (srnode
&& srnode
->is_parent_global())
580 void CInode::project_snaprealm_past_parent(SnapRealm
*newparent
)
582 sr_t
*new_snap
= project_snaprealm();
583 record_snaprealm_past_parent(new_snap
, newparent
);
587 /* if newparent != parent, add parent to past_parents
588 if parent DNE, we need to find what the parent actually is and fill that in */
589 void CInode::record_snaprealm_past_parent(sr_t
*new_snap
, SnapRealm
*newparent
)
591 ceph_assert(!new_snap
->is_parent_global());
592 SnapRealm
*oldparent
;
594 oldparent
= find_snaprealm();
596 oldparent
= snaprealm
->parent
;
599 if (newparent
!= oldparent
) {
600 snapid_t oldparentseq
= oldparent
->get_newest_seq();
601 if (oldparentseq
+ 1 > new_snap
->current_parent_since
) {
602 // copy old parent's snaps
603 const set
<snapid_t
>& snaps
= oldparent
->get_snaps();
604 auto p
= snaps
.lower_bound(new_snap
->current_parent_since
);
605 if (p
!= snaps
.end())
606 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
607 if (oldparentseq
> new_snap
->seq
)
608 new_snap
->seq
= oldparentseq
;
610 new_snap
->current_parent_since
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
614 void CInode::record_snaprealm_parent_dentry(sr_t
*new_snap
, SnapRealm
*oldparent
,
615 CDentry
*dn
, bool primary_dn
)
617 ceph_assert(new_snap
->is_parent_global());
620 oldparent
= dn
->get_dir()->inode
->find_snaprealm();
621 auto& snaps
= oldparent
->get_snaps();
624 auto p
= snaps
.lower_bound(dn
->first
);
625 if (p
!= snaps
.end())
626 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
628 // 'last_destroyed' is used as 'current_parent_since'
629 auto p
= snaps
.lower_bound(new_snap
->last_destroyed
);
630 if (p
!= snaps
.end())
631 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
632 new_snap
->last_destroyed
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
636 void CInode::early_pop_projected_snaprealm()
638 ceph_assert(!projected_nodes
.empty());
639 if (projected_nodes
.front().snapnode
!= projected_inode::UNDEF_SRNODE
) {
640 pop_projected_snaprealm(projected_nodes
.front().snapnode
, true);
641 projected_nodes
.front().snapnode
= projected_inode::UNDEF_SRNODE
;
642 --num_projected_srnodes
;
646 void CInode::pop_projected_snaprealm(sr_t
*next_snaprealm
, bool early
)
648 if (next_snaprealm
) {
649 dout(10) << __func__
<< (early
? " (early) " : " ")
650 << next_snaprealm
<< " seq " << next_snaprealm
->seq
<< dendl
;
654 auto old_flags
= snaprealm
->srnode
.flags
;
655 snaprealm
->srnode
= *next_snaprealm
;
656 delete next_snaprealm
;
658 if ((snaprealm
->srnode
.flags
^ old_flags
) & sr_t::PARENT_GLOBAL
) {
659 snaprealm
->adjust_parent();
662 if (snaprealm
->parent
)
663 dout(10) << " realm " << *snaprealm
<< " parent " << *snaprealm
->parent
<< dendl
;
665 dout(10) << __func__
<< (early
? " (early) null" : " null") << dendl
;
666 ceph_assert(snaprealm
);
667 snaprealm
->merge_to(NULL
);
672 // ====== CInode =======
676 InodeStoreBase::inode_const_ptr
InodeStoreBase::empty_inode
= InodeStoreBase::allocate_inode();
678 __u32
InodeStoreBase::hash_dentry_name(std::string_view dn
)
680 int which
= inode
->dir_layout
.dl_dir_hash
;
682 which
= CEPH_STR_HASH_LINUX
;
683 ceph_assert(ceph_str_hash_valid(which
));
684 return ceph_str_hash(which
, dn
.data(), dn
.length());
687 frag_t
InodeStoreBase::pick_dirfrag(std::string_view dn
)
689 if (dirfragtree
.empty())
690 return frag_t(); // avoid the string hash if we can.
692 __u32 h
= hash_dentry_name(dn
);
693 return dirfragtree
[h
];
696 std::pair
<bool, std::vector
<CDir
*>> CInode::get_dirfrags_under(frag_t fg
)
698 std::pair
<bool, std::vector
<CDir
*>> result
;
699 auto& all
= result
.first
;
700 auto& dirs
= result
.second
;
703 if (auto it
= dirfrags
.find(fg
); it
!= dirfrags
.end()){
705 dirs
.push_back(it
->second
);
710 for(auto &[_fg
, _dir
] : dirfrags
){
711 // frag_t.bits() can indicate the depth of the partition in the directory tree
713 // 01* : bit = 2, on the second floor
716 // 00* 01* 10* 11* -- > level 2, bit = 2
717 // so fragA.bits > fragB.bits means fragA is deeper than fragB
719 if (fg
.bits() >= _fg
.bits()) {
720 if (_fg
.contains(fg
)) {
725 if (fg
.contains(_fg
)) {
726 dirs
.push_back(_dir
);
727 // we can calculate how many sub slices a slice can be divided into
728 // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*)
729 // or 2^2 frags belonging to the second layer(00* 01* 10* 11*)
730 // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level
731 total
+= 1 << (24 - _fg
.bits());
736 // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache
737 all
= ((1<<(24-fg
.bits())) == total
);
741 void CInode::verify_dirfrags()
744 for (const auto &p
: dirfrags
) {
745 if (!dirfragtree
.is_leaf(p
.first
)) {
746 dout(0) << "have open dirfrag " << p
.first
<< " but not leaf in " << dirfragtree
747 << ": " << *p
.second
<< dendl
;
754 void CInode::force_dirfrags()
757 for (auto &p
: dirfrags
) {
758 if (!dirfragtree
.is_leaf(p
.first
)) {
759 dout(0) << "have open dirfrag " << p
.first
<< " but not leaf in " << dirfragtree
760 << ": " << *p
.second
<< dendl
;
767 dirfragtree
.get_leaves(leaves
);
768 for (const auto& leaf
: leaves
) {
769 mdcache
->get_force_dirfrag(dirfrag_t(ino(), leaf
), true);
776 CDir
*CInode::get_approx_dirfrag(frag_t fg
)
778 CDir
*dir
= get_dirfrag(fg
);
782 auto&& p
= get_dirfrags_under(fg
);
783 if (!p
.second
.empty())
784 return p
.second
.front();
787 while (fg
.bits() > 0) {
789 dir
= get_dirfrag(fg
);
795 CDir
*CInode::get_or_open_dirfrag(MDCache
*mdcache
, frag_t fg
)
797 ceph_assert(is_dir());
800 CDir
*dir
= get_dirfrag(fg
);
803 ceph_assert(is_auth() || mdcache
->mds
->is_any_replay());
804 dir
= new CDir(this, fg
, mdcache
, is_auth());
810 CDir
*CInode::add_dirfrag(CDir
*dir
)
812 auto em
= dirfrags
.emplace(std::piecewise_construct
, std::forward_as_tuple(dir
->dirfrag().frag
), std::forward_as_tuple(dir
));
813 ceph_assert(em
.second
);
815 if (stickydir_ref
> 0) {
816 dir
->state_set(CDir::STATE_STICKY
);
817 dir
->get(CDir::PIN_STICKY
);
825 void CInode::close_dirfrag(frag_t fg
)
827 dout(14) << __func__
<< " " << fg
<< dendl
;
828 ceph_assert(dirfrags
.count(fg
));
830 CDir
*dir
= dirfrags
[fg
];
831 dir
->remove_null_dentries();
837 if (stickydir_ref
> 0) {
838 dir
->state_clear(CDir::STATE_STICKY
);
839 dir
->put(CDir::PIN_STICKY
);
842 if (dir
->is_subtree_root())
845 // dump any remaining dentries, for debugging purposes
846 for (const auto &p
: dir
->items
)
847 dout(14) << __func__
<< " LEFTOVER dn " << *p
.second
<< dendl
;
849 ceph_assert(dir
->get_num_ref() == 0);
854 void CInode::close_dirfrags()
856 while (!dirfrags
.empty())
857 close_dirfrag(dirfrags
.begin()->first
);
860 bool CInode::has_subtree_root_dirfrag(int auth
)
862 if (num_subtree_roots
> 0) {
865 for (const auto &p
: dirfrags
) {
866 if (p
.second
->is_subtree_root() &&
867 p
.second
->dir_auth
.first
== auth
)
874 bool CInode::has_subtree_or_exporting_dirfrag()
876 if (num_subtree_roots
> 0 || num_exporting_dirs
> 0)
881 void CInode::get_stickydirs()
883 if (stickydir_ref
== 0) {
885 for (const auto &p
: dirfrags
) {
886 p
.second
->state_set(CDir::STATE_STICKY
);
887 p
.second
->get(CDir::PIN_STICKY
);
893 void CInode::put_stickydirs()
895 ceph_assert(stickydir_ref
> 0);
897 if (stickydir_ref
== 0) {
899 for (const auto &p
: dirfrags
) {
900 p
.second
->state_clear(CDir::STATE_STICKY
);
901 p
.second
->put(CDir::PIN_STICKY
);
912 void CInode::first_get()
916 parent
->get(CDentry::PIN_INODEPIN
);
919 void CInode::last_put()
923 parent
->put(CDentry::PIN_INODEPIN
);
928 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
929 mdcache
->maybe_eval_stray(this, true);
932 void CInode::add_remote_parent(CDentry
*p
)
934 if (remote_parents
.empty())
935 get(PIN_REMOTEPARENT
);
936 remote_parents
.insert(p
);
938 void CInode::remove_remote_parent(CDentry
*p
)
940 remote_parents
.erase(p
);
941 if (remote_parents
.empty())
942 put(PIN_REMOTEPARENT
);
948 CDir
*CInode::get_parent_dir()
954 CDir
*CInode::get_projected_parent_dir()
956 CDentry
*p
= get_projected_parent_dn();
961 CInode
*CInode::get_parent_inode()
964 return parent
->dir
->inode
;
968 bool CInode::is_ancestor_of(const CInode
*other
) const
973 const CDentry
*pdn
= other
->get_oldest_parent_dn();
975 ceph_assert(other
->is_base());
978 other
= pdn
->get_dir()->get_inode();
983 bool CInode::is_projected_ancestor_of(const CInode
*other
) const
988 const CDentry
*pdn
= other
->get_projected_parent_dn();
990 ceph_assert(other
->is_base());
993 other
= pdn
->get_dir()->get_inode();
999 * Because a non-directory inode may have multiple links, the use_parent
1000 * argument allows selecting which parent to use for path construction. This
1001 * argument is only meaningful for the final component (i.e. the first of the
1002 * nested calls) because directories cannot have multiple hard links. If
1003 * use_parent is NULL and projected is true, the primary parent's projected
1004 * inode is used all the way up the path chain. Otherwise the primary parent
1005 * stable inode is used.
1007 void CInode::make_path_string(string
& s
, bool projected
, const CDentry
*use_parent
) const
1010 use_parent
= projected
? get_projected_parent_dn() : parent
;
1014 use_parent
->make_path_string(s
, projected
);
1015 } else if (is_root()) {
1017 } else if (is_mdsdir()) {
1019 uint64_t eino(ino());
1020 eino
-= MDS_INO_MDSDIR_OFFSET
;
1021 snprintf(t
, sizeof(t
), "~mds%" PRId64
, eino
);
1025 uint64_t eino(ino());
1026 snprintf(n
, sizeof(n
), "#%" PRIx64
, eino
);
1031 void CInode::make_path(filepath
& fp
, bool projected
) const
1033 const CDentry
*use_parent
= projected
? get_projected_parent_dn() : parent
;
1035 ceph_assert(!is_base());
1036 use_parent
->make_path(fp
, projected
);
1038 fp
= filepath(ino());
1042 void CInode::name_stray_dentry(string
& dname
)
1045 snprintf(s
, sizeof(s
), "%llx", (unsigned long long)ino().val
);
1049 version_t
CInode::pre_dirty()
1052 CDentry
* _cdentry
= get_projected_parent_dn();
1054 pv
= _cdentry
->pre_dirty(get_projected_version());
1055 dout(10) << "pre_dirty " << pv
<< " (current v " << get_inode()->version
<< ")" << dendl
;
1057 ceph_assert(is_base());
1058 pv
= get_projected_version() + 1;
1060 // force update backtrace for old format inode (see mempool_inode::decode)
1061 if (get_inode()->backtrace_version
== 0 && !projected_nodes
.empty()) {
1062 auto pi
= _get_projected_inode();
1063 if (pi
->backtrace_version
== 0)
1064 pi
->update_backtrace(pv
);
1069 void CInode::_mark_dirty(LogSegment
*ls
)
1071 if (!state_test(STATE_DIRTY
)) {
1072 state_set(STATE_DIRTY
);
1077 // move myself to this segment's dirty list
1079 ls
->dirty_inodes
.push_back(&item_dirty
);
1082 void CInode::mark_dirty(LogSegment
*ls
) {
1084 dout(10) << __func__
<< " " << *this << dendl
;
1087 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1088 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1092 // only auth can get dirty. "dirty" async data in replicas is relative to
1093 // filelock state, not the dirty flag.
1094 ceph_assert(is_auth());
1096 // touch my private version
1101 parent
->mark_dirty(get_version(), ls
);
1105 void CInode::mark_clean()
1107 dout(10) << __func__
<< " " << *this << dendl
;
1108 if (state_test(STATE_DIRTY
)) {
1109 state_clear(STATE_DIRTY
);
1112 // remove myself from ls dirty list
1113 item_dirty
.remove_myself();
1119 // per-inode storage
1120 // (currently for root inode only)
1122 struct C_IO_Inode_Stored
: public CInodeIOContext
{
1125 C_IO_Inode_Stored(CInode
*i
, version_t v
, Context
*f
) : CInodeIOContext(i
), version(v
), fin(f
) {}
1126 void finish(int r
) override
{
1127 in
->_stored(r
, version
, fin
);
1129 void print(ostream
& out
) const override
{
1130 out
<< "inode_store(" << in
->ino() << ")";
1134 object_t
InodeStoreBase::get_object_name(inodeno_t ino
, frag_t fg
, std::string_view suffix
)
1137 snprintf(n
, sizeof(n
), "%llx.%08llx", (long long unsigned)ino
, (long long unsigned)fg
);
1138 ceph_assert(strlen(n
) + suffix
.size() < sizeof n
);
1139 strncat(n
, suffix
.data(), suffix
.size());
1143 void CInode::store(MDSContext
*fin
)
1145 dout(10) << __func__
<< " " << get_version() << dendl
;
1146 ceph_assert(is_base());
1149 purge_stale_snap_data(snaprealm
->get_snaps());
1153 string magic
= CEPH_FS_ONDISK_MAGIC
;
1156 encode_store(bl
, mdcache
->mds
->mdsmap
->get_up_features());
1163 object_t oid
= CInode::get_object_name(ino(), frag_t(), ".inode");
1164 object_locator_t
oloc(mdcache
->mds
->get_metadata_pool());
1167 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin
),
1168 mdcache
->mds
->finisher
);
1169 mdcache
->mds
->objecter
->mutate(oid
, oloc
, m
, snapc
,
1170 ceph::real_clock::now(), 0,
1174 void CInode::_stored(int r
, version_t v
, Context
*fin
)
1177 dout(1) << "store error " << r
<< " v " << v
<< " on " << *this << dendl
;
1178 mdcache
->mds
->clog
->error() << "failed to store inode " << ino()
1179 << " object: " << cpp_strerror(r
);
1180 mdcache
->mds
->handle_write_error(r
);
1185 dout(10) << __func__
<< " " << v
<< " on " << *this << dendl
;
1186 if (v
== get_projected_version())
1192 void CInode::flush(MDSContext
*fin
)
1194 dout(10) << __func__
<< " " << *this << dendl
;
1195 ceph_assert(is_auth() && can_auth_pin());
1197 MDSGatherBuilder
gather(g_ceph_context
);
1199 if (is_dirty_parent()) {
1200 store_backtrace(gather
.new_sub());
1204 store(gather
.new_sub());
1206 parent
->dir
->commit(0, gather
.new_sub());
1210 if (gather
.has_subs()) {
1211 gather
.set_finisher(fin
);
1218 struct C_IO_Inode_Fetched
: public CInodeIOContext
{
1221 C_IO_Inode_Fetched(CInode
*i
, Context
*f
) : CInodeIOContext(i
), fin(f
) {}
1222 void finish(int r
) override
{
1223 // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT
1224 in
->_fetched(bl
, bl2
, fin
);
1226 void print(ostream
& out
) const override
{
1227 out
<< "inode_fetch(" << in
->ino() << ")";
1231 void CInode::fetch(MDSContext
*fin
)
1233 dout(10) << __func__
<< dendl
;
1235 C_IO_Inode_Fetched
*c
= new C_IO_Inode_Fetched(this, fin
);
1236 C_GatherBuilder
gather(g_ceph_context
, new C_OnFinisher(c
, mdcache
->mds
->finisher
));
1238 object_t oid
= CInode::get_object_name(ino(), frag_t(), "");
1239 object_locator_t
oloc(mdcache
->mds
->get_metadata_pool());
1241 // Old on-disk format: inode stored in xattr of a dirfrag
1243 rd
.getxattr("inode", &c
->bl
, NULL
);
1244 mdcache
->mds
->objecter
->read(oid
, oloc
, rd
, CEPH_NOSNAP
, (bufferlist
*)NULL
, 0, gather
.new_sub());
1246 // Current on-disk format: inode stored in a .inode object
1247 object_t oid2
= CInode::get_object_name(ino(), frag_t(), ".inode");
1248 mdcache
->mds
->objecter
->read(oid2
, oloc
, 0, 0, CEPH_NOSNAP
, &c
->bl2
, 0, gather
.new_sub());
1253 void CInode::_fetched(bufferlist
& bl
, bufferlist
& bl2
, Context
*fin
)
1255 dout(10) << __func__
<< " got " << bl
.length() << " and " << bl2
.length() << dendl
;
1256 bufferlist::const_iterator p
;
1259 } else if (bl
.length()) {
1262 derr
<< "No data while reading inode " << ino() << dendl
;
1263 fin
->complete(-CEPHFS_ENOENT
);
1272 dout(10) << " magic is '" << magic
<< "' (expecting '"
1273 << CEPH_FS_ONDISK_MAGIC
<< "')" << dendl
;
1274 if (magic
!= CEPH_FS_ONDISK_MAGIC
) {
1275 dout(0) << "on disk magic '" << magic
<< "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1277 fin
->complete(-CEPHFS_EINVAL
);
1280 dout(10) << "_fetched " << *this << dendl
;
1283 } catch (buffer::error
&err
) {
1284 derr
<< "Corrupt inode " << ino() << ": " << err
.what() << dendl
;
1285 fin
->complete(-CEPHFS_EINVAL
);
1290 void CInode::build_backtrace(int64_t pool
, inode_backtrace_t
& bt
)
1293 bt
.ancestors
.clear();
1297 CDentry
*pdn
= get_parent_dn();
1299 CInode
*diri
= pdn
->get_dir()->get_inode();
1300 bt
.ancestors
.push_back(inode_backpointer_t(diri
->ino(), pdn
->get_name(), in
->get_inode()->version
));
1302 pdn
= in
->get_parent_dn();
1304 bt
.old_pools
.reserve(get_inode()->old_pools
.size());
1305 for (auto &p
: get_inode()->old_pools
) {
1306 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
1308 bt
.old_pools
.push_back(p
);
1312 struct C_IO_Inode_StoredBacktrace
: public CInodeIOContext
{
1315 C_IO_Inode_StoredBacktrace(CInode
*i
, version_t v
, Context
*f
) : CInodeIOContext(i
), version(v
), fin(f
) {}
1316 void finish(int r
) override
{
1317 in
->_stored_backtrace(r
, version
, fin
);
1319 void print(ostream
& out
) const override
{
1320 out
<< "backtrace_store(" << in
->ino() << ")";
1325 void CInode::_commit_ops(int r
, C_GatherBuilder
&gather_bld
,
1326 std::vector
<CInodeCommitOperation
> &ops_vec
,
1327 inode_backtrace_t
&bt
)
1329 dout(10) << __func__
<< dendl
;
1332 mdcache
->mds
->handle_write_error_with_lock(r
);
1337 object_t oid
= get_object_name(ino(), frag_t(), "");
1339 for (auto &op
: ops_vec
) {
1340 ObjectOperation obj_op
;
1341 object_locator_t
oloc(op
.get_pool());
1342 op
.update(obj_op
, bt
);
1343 mdcache
->mds
->objecter
->mutate(oid
, oloc
, obj_op
, snapc
,
1344 ceph::real_clock::now(),
1345 0, gather_bld
.new_sub());
1349 void CInode::_store_backtrace(std::vector
<CInodeCommitOperation
> &ops_vec
,
1350 inode_backtrace_t
&bt
, int op_prio
)
1352 dout(10) << __func__
<< " on " << *this << dendl
;
1353 ceph_assert(is_dirty_parent());
1356 op_prio
= CEPH_MSG_PRIO_DEFAULT
;
1360 const int64_t pool
= get_backtrace_pool();
1361 build_backtrace(pool
, bt
);
1363 std::string_view slink
= "";
1364 if (is_symlink() && mdcache
->get_symlink_recovery()) {
1368 ops_vec
.emplace_back(op_prio
, pool
, get_inode()->layout
,
1369 mdcache
->mds
->mdsmap
->get_up_features(), slink
);
1371 if (!state_test(STATE_DIRTYPOOL
) || get_inode()->old_pools
.empty()) {
1372 dout(20) << __func__
<< ": no dirtypool or no old pools" << dendl
;
1376 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1377 // such that anyone reading them will see the new pool ID in
1378 // inode_backtrace_t::pool and go read everything else from there.
1379 for (const auto &p
: get_inode()->old_pools
) {
1383 dout(20) << __func__
<< ": updating old pool " << p
<< dendl
;
1385 ops_vec
.emplace_back(op_prio
, p
);
1389 void CInode::store_backtrace(MDSContext
*fin
, int op_prio
)
1391 std::vector
<CInodeCommitOperation
> ops_vec
;
1392 inode_backtrace_t bt
;
1393 auto version
= get_inode()->backtrace_version
;
1395 _store_backtrace(ops_vec
, bt
, op_prio
);
1397 C_GatherBuilder
gather(g_ceph_context
,
1399 new C_IO_Inode_StoredBacktrace(this, version
, fin
),
1400 mdcache
->mds
->finisher
));
1401 _commit_ops(0, gather
, ops_vec
, bt
);
1402 ceph_assert(gather
.has_subs());
1406 void CInode::store_backtrace(CInodeCommitOperations
&op
, int op_prio
)
1408 op
.version
= get_inode()->backtrace_version
;
1411 _store_backtrace(op
.ops_vec
, op
.bt
, op_prio
);
1414 void CInode::_stored_backtrace(int r
, version_t v
, Context
*fin
)
1416 if (r
== -CEPHFS_ENOENT
) {
1417 const int64_t pool
= get_backtrace_pool();
1418 bool exists
= mdcache
->mds
->objecter
->with_osdmap(
1419 [pool
](const OSDMap
&osd_map
) {
1420 return osd_map
.have_pg_pool(pool
);
1423 // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it
1424 // out from under us), so the backtrace can never be written, so pretend
1425 // to succeed so that the user can proceed to e.g. delete the file.
1427 dout(4) << __func__
<< " got CEPHFS_ENOENT: a data pool was deleted "
1428 "beneath us!" << dendl
;
1434 dout(1) << "store backtrace error " << r
<< " v " << v
<< dendl
;
1435 mdcache
->mds
->clog
->error() << "failed to store backtrace on ino "
1436 << ino() << " object"
1437 << ", pool " << get_backtrace_pool()
1439 mdcache
->mds
->handle_write_error(r
);
1445 dout(10) << __func__
<< " v " << v
<< dendl
;
1448 if (v
== get_inode()->backtrace_version
)
1449 clear_dirty_parent();
1454 void CInode::fetch_backtrace(Context
*fin
, bufferlist
*backtrace
)
1456 mdcache
->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace
, fin
);
1459 void CInode::mark_dirty_parent(LogSegment
*ls
, bool dirty_pool
)
1461 if (!state_test(STATE_DIRTYPARENT
)) {
1462 dout(10) << __func__
<< dendl
;
1463 state_set(STATE_DIRTYPARENT
);
1464 get(PIN_DIRTYPARENT
);
1468 state_set(STATE_DIRTYPOOL
);
1470 ls
->dirty_parent_inodes
.push_back(&item_dirty_parent
);
1473 void CInode::clear_dirty_parent()
1475 if (state_test(STATE_DIRTYPARENT
)) {
1476 dout(10) << __func__
<< dendl
;
1477 state_clear(STATE_DIRTYPARENT
);
1478 state_clear(STATE_DIRTYPOOL
);
1479 put(PIN_DIRTYPARENT
);
1480 item_dirty_parent
.remove_myself();
1484 void CInode::verify_diri_backtrace(bufferlist
&bl
, int err
)
1486 if (is_base() || is_dirty_parent() || !is_auth())
1489 dout(10) << __func__
<< dendl
;
1492 inode_backtrace_t backtrace
;
1494 decode(backtrace
, bl
);
1495 CDentry
*pdn
= get_parent_dn();
1496 if (backtrace
.ancestors
.empty() ||
1497 backtrace
.ancestors
[0].dname
!= pdn
->get_name() ||
1498 backtrace
.ancestors
[0].dirino
!= pdn
->get_dir()->ino())
1499 err
= -CEPHFS_EINVAL
;
1503 MDSRank
*mds
= mdcache
->mds
;
1504 mds
->clog
->error() << "bad backtrace on directory inode " << ino();
1505 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace
> 1));
1507 mark_dirty_parent(mds
->mdlog
->get_current_segment(), false);
1508 mds
->mdlog
->flush();
1512 // ------------------
1516 void InodeStoreBase::encode_xattrs(bufferlist
&bl
) const {
1519 encode(*xattrs
, bl
);
1521 encode((__u32
)0, bl
);
1524 void InodeStoreBase::decode_xattrs(bufferlist::const_iterator
&p
) {
1526 mempool_xattr_map tmp
;
1527 decode_noshare(tmp
, p
);
1529 reset_xattrs(xattr_map_ptr());
1531 reset_xattrs(allocate_xattr_map(std::move(tmp
)));
1535 void InodeStoreBase::encode_old_inodes(bufferlist
&bl
, uint64_t features
) const {
1538 encode(*old_inodes
, bl
, features
);
1540 encode((__u32
)0, bl
);
1543 void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator
&p
) {
1545 mempool_old_inode_map tmp
;
1548 reset_old_inodes(old_inode_map_ptr());
1550 reset_old_inodes(allocate_old_inode_map(std::move(tmp
)));
1554 void InodeStoreBase::encode_bare(bufferlist
&bl
, uint64_t features
,
1555 const bufferlist
*snap_blob
) const
1558 encode(*inode
, bl
, features
);
1559 if (inode
->is_symlink())
1560 encode(symlink
, bl
);
1561 encode(dirfragtree
, bl
);
1565 encode(*snap_blob
, bl
);
1567 encode(bufferlist(), bl
);
1568 encode_old_inodes(bl
, features
);
1569 encode(oldest_snap
, bl
);
1570 encode(damage_flags
, bl
);
1573 void InodeStoreBase::encode(bufferlist
&bl
, uint64_t features
,
1574 const bufferlist
*snap_blob
) const
1576 ENCODE_START(6, 4, bl
);
1577 encode_bare(bl
, features
, snap_blob
);
1581 void CInode::encode_store(bufferlist
& bl
, uint64_t features
)
1583 bufferlist snap_blob
;
1584 encode_snap_blob(snap_blob
);
1585 InodeStoreBase::encode(bl
, mdcache
->mds
->mdsmap
->get_up_features(),
1589 void InodeStoreBase::decode_bare(bufferlist::const_iterator
&bl
,
1590 bufferlist
& snap_blob
, __u8 struct_v
)
1594 auto _inode
= allocate_inode();
1595 decode(*_inode
, bl
);
1597 if (_inode
->is_symlink()) {
1600 symlink
= std::string_view(tmp
);
1602 decode(dirfragtree
, bl
);
1604 decode(snap_blob
, bl
);
1606 decode_old_inodes(bl
);
1607 if (struct_v
== 2 && _inode
->is_dir()) {
1608 bool default_layout_exists
;
1609 decode(default_layout_exists
, bl
);
1610 if (default_layout_exists
) {
1611 decode(struct_v
, bl
); // this was a default_file_layout
1612 decode(_inode
->layout
, bl
); // but we only care about the layout portion
1616 if (struct_v
>= 5) {
1617 // InodeStore is embedded in dentries without proper versioning, so
1618 // we consume up to the end of the buffer
1620 decode(oldest_snap
, bl
);
1624 decode(damage_flags
, bl
);
1628 reset_inode(std::move(_inode
));
1632 void InodeStoreBase::decode(bufferlist::const_iterator
&bl
, bufferlist
& snap_blob
)
1634 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl
);
1635 decode_bare(bl
, snap_blob
, struct_v
);
1639 void CInode::decode_store(bufferlist::const_iterator
& bl
)
1641 bufferlist snap_blob
;
1642 InodeStoreBase::decode(bl
, snap_blob
);
1643 decode_snap_blob(snap_blob
);
1646 // ------------------
1649 SimpleLock
* CInode::get_lock(int type
)
1652 case CEPH_LOCK_IVERSION
: return &versionlock
;
1653 case CEPH_LOCK_IFILE
: return &filelock
;
1654 case CEPH_LOCK_IAUTH
: return &authlock
;
1655 case CEPH_LOCK_ILINK
: return &linklock
;
1656 case CEPH_LOCK_IDFT
: return &dirfragtreelock
;
1657 case CEPH_LOCK_IXATTR
: return &xattrlock
;
1658 case CEPH_LOCK_ISNAP
: return &snaplock
;
1659 case CEPH_LOCK_INEST
: return &nestlock
;
1660 case CEPH_LOCK_IFLOCK
: return &flocklock
;
1661 case CEPH_LOCK_IPOLICY
: return &policylock
;
1666 void CInode::set_object_info(MDSCacheObjectInfo
&info
)
1672 void CInode::encode_lock_iauth(bufferlist
& bl
)
1674 ENCODE_START(2, 1, bl
);
1675 encode(get_inode()->version
, bl
);
1676 encode(get_inode()->ctime
, bl
);
1677 encode(get_inode()->mode
, bl
);
1678 encode(get_inode()->uid
, bl
);
1679 encode(get_inode()->gid
, bl
);
1680 encode(get_inode()->fscrypt_auth
, bl
);
1684 void CInode::decode_lock_iauth(bufferlist::const_iterator
& p
)
1686 ceph_assert(!is_auth());
1687 auto _inode
= allocate_inode(*get_inode());
1689 decode(_inode
->version
, p
);
1692 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
1693 decode(_inode
->mode
, p
);
1694 decode(_inode
->uid
, p
);
1695 decode(_inode
->gid
, p
);
1697 decode(_inode
->fscrypt_auth
, p
);
1699 reset_inode(std::move(_inode
));
1702 void CInode::encode_lock_ilink(bufferlist
& bl
)
1704 ENCODE_START(1, 1, bl
);
1705 encode(get_inode()->version
, bl
);
1706 encode(get_inode()->ctime
, bl
);
1707 encode(get_inode()->nlink
, bl
);
1711 void CInode::decode_lock_ilink(bufferlist::const_iterator
& p
)
1713 ceph_assert(!is_auth());
1714 auto _inode
= allocate_inode(*get_inode());
1716 decode(_inode
->version
, p
);
1719 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
1720 decode(_inode
->nlink
, p
);
1722 reset_inode(std::move(_inode
));
1725 void CInode::encode_lock_idft(bufferlist
& bl
)
1727 ENCODE_START(1, 1, bl
);
1729 encode(get_inode()->version
, bl
);
1731 // treat flushing as dirty when rejoining cache
1732 bool dirty
= dirfragtreelock
.is_dirty_or_flushing();
1736 // encode the raw tree
1737 encode(dirfragtree
, bl
);
1739 // also specify which frags are mine
1740 set
<frag_t
> myfrags
;
1741 auto&& dfls
= get_dirfrags();
1742 for (const auto& dir
: dfls
) {
1743 if (dir
->is_auth()) {
1744 frag_t fg
= dir
->get_frag();
1748 encode(myfrags
, bl
);
1753 void CInode::decode_lock_idft(bufferlist::const_iterator
& p
)
1760 decode(replica_dirty
, p
);
1761 if (replica_dirty
) {
1762 dout(10) << __func__
<< " setting dftlock dirty flag" << dendl
;
1763 dirfragtreelock
.mark_dirty(); // ok bc we're auth and caller will handle
1766 _inode
= allocate_inode(*get_inode());
1767 decode(_inode
->version
, p
);
1772 set
<frag_t
> authfrags
;
1773 decode(authfrags
, p
);
1775 // auth. believe replica's auth frags only.
1776 for (auto fg
: authfrags
) {
1777 if (!dirfragtree
.is_leaf(fg
)) {
1778 dout(10) << " forcing frag " << fg
<< " to leaf (split|merge)" << dendl
;
1779 dirfragtree
.force_to_leaf(g_ceph_context
, fg
);
1780 dirfragtreelock
.mark_dirty(); // ok bc we're auth and caller will handle
1784 // replica. take the new tree, BUT make sure any open
1785 // dirfrags remain leaves (they may have split _after_ this
1786 // dft was scattered, or we may still be be waiting on the
1787 // notify from the auth)
1788 dirfragtree
.swap(temp
);
1789 for (const auto &p
: dirfrags
) {
1790 if (!dirfragtree
.is_leaf(p
.first
)) {
1791 dout(10) << " forcing open dirfrag " << p
.first
<< " to leaf (racing with split|merge)" << dendl
;
1792 dirfragtree
.force_to_leaf(g_ceph_context
, p
.first
);
1794 if (p
.second
->is_auth())
1795 p
.second
->state_clear(CDir::STATE_DIRTYDFT
);
1798 if (g_conf()->mds_debug_frag
)
1804 reset_inode(std::move(_inode
));
1807 void CInode::encode_lock_ifile(bufferlist
& bl
)
1809 ENCODE_START(2, 1, bl
);
1811 encode(get_inode()->version
, bl
);
1812 encode(get_inode()->ctime
, bl
);
1813 encode(get_inode()->mtime
, bl
);
1814 encode(get_inode()->atime
, bl
);
1815 encode(get_inode()->time_warp_seq
, bl
);
1817 encode(get_inode()->layout
, bl
, mdcache
->mds
->mdsmap
->get_up_features());
1818 encode(get_inode()->size
, bl
);
1819 encode(get_inode()->truncate_seq
, bl
);
1820 encode(get_inode()->truncate_size
, bl
);
1821 encode(get_inode()->client_ranges
, bl
);
1822 encode(get_inode()->inline_data
, bl
);
1825 // treat flushing as dirty when rejoining cache
1826 bool dirty
= filelock
.is_dirty_or_flushing();
1829 dout(15) << __func__
<< " inode.dirstat is " << get_inode()->dirstat
<< dendl
;
1830 encode(get_inode()->dirstat
, bl
); // only meaningful if i am auth.
1833 for (const auto &p
: dirfrags
) {
1834 frag_t fg
= p
.first
;
1835 CDir
*dir
= p
.second
;
1836 if (is_auth() || dir
->is_auth()) {
1837 const auto& pf
= dir
->get_projected_fnode();
1838 dout(15) << fg
<< " " << *dir
<< dendl
;
1839 dout(20) << fg
<< " fragstat " << pf
->fragstat
<< dendl
;
1840 dout(20) << fg
<< " accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
1842 encode(dir
->first
, tmp
);
1843 encode(pf
->fragstat
, tmp
);
1844 encode(pf
->accounted_fragstat
, tmp
);
1849 bl
.claim_append(tmp
);
1851 encode(get_inode()->fscrypt_file
, bl
);
1855 void CInode::decode_lock_ifile(bufferlist::const_iterator
& p
)
1861 _inode
= allocate_inode(*get_inode());
1863 decode(_inode
->version
, p
);
1866 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
1867 decode(_inode
->mtime
, p
);
1868 decode(_inode
->atime
, p
);
1869 decode(_inode
->time_warp_seq
, p
);
1871 decode(_inode
->layout
, p
);
1872 decode(_inode
->size
, p
);
1873 decode(_inode
->truncate_seq
, p
);
1874 decode(_inode
->truncate_size
, p
);
1875 decode(_inode
->client_ranges
, p
);
1876 decode(_inode
->inline_data
, p
);
1880 decode(replica_dirty
, p
);
1881 if (replica_dirty
) {
1882 dout(10) << __func__
<< " setting filelock dirty flag" << dendl
;
1883 filelock
.mark_dirty(); // ok bc we're auth and caller will handle
1887 frag_info_t dirstat
;
1890 dout(10) << " taking inode dirstat " << dirstat
<< " for " << *this << dendl
;
1891 _inode
->dirstat
= dirstat
; // take inode summation if replica
1895 dout(10) << " ...got " << n
<< " fragstats on " << *this << dendl
;
1899 frag_info_t fragstat
;
1900 frag_info_t accounted_fragstat
;
1903 decode(fragstat
, p
);
1904 decode(accounted_fragstat
, p
);
1905 dout(10) << fg
<< " [" << fgfirst
<< ",head] " << dendl
;
1906 dout(10) << fg
<< " fragstat " << fragstat
<< dendl
;
1907 dout(20) << fg
<< " accounted_fragstat " << accounted_fragstat
<< dendl
;
1909 CDir
*dir
= get_dirfrag(fg
);
1911 ceph_assert(dir
); // i am auth; i had better have this dir open
1912 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1913 << " on " << *dir
<< dendl
;
1914 dir
->first
= fgfirst
;
1915 auto _fnode
= CDir::allocate_fnode(*dir
->get_fnode());
1916 _fnode
->fragstat
= fragstat
;
1917 _fnode
->accounted_fragstat
= accounted_fragstat
;
1918 dir
->reset_fnode(std::move(_fnode
));
1919 if (!(fragstat
== accounted_fragstat
)) {
1920 dout(10) << fg
<< " setting filelock updated flag" << dendl
;
1921 filelock
.mark_dirty(); // ok bc we're auth and caller will handle
1924 if (dir
&& dir
->is_auth()) {
1925 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1926 << " on " << *dir
<< dendl
;
1927 dir
->first
= fgfirst
;
1928 const auto& pf
= dir
->get_projected_fnode();
1929 finish_scatter_update(&filelock
, dir
,
1930 _inode
->dirstat
.version
, pf
->accounted_fragstat
.version
);
1934 if (!is_auth() && struct_v
>= 2)
1935 decode(_inode
->fscrypt_file
, p
);
1939 reset_inode(std::move(_inode
));
1942 void CInode::encode_lock_inest(bufferlist
& bl
)
1944 ENCODE_START(1, 1, bl
);
1946 encode(get_inode()->version
, bl
);
1948 // treat flushing as dirty when rejoining cache
1949 bool dirty
= nestlock
.is_dirty_or_flushing();
1952 dout(15) << __func__
<< " inode.rstat is " << get_inode()->rstat
<< dendl
;
1953 encode(get_inode()->rstat
, bl
); // only meaningful if i am auth.
1956 for (const auto &p
: dirfrags
) {
1957 frag_t fg
= p
.first
;
1958 CDir
*dir
= p
.second
;
1959 if (is_auth() || dir
->is_auth()) {
1960 const auto& pf
= dir
->get_projected_fnode();
1961 dout(10) << __func__
<< " " << fg
<< " dir " << *dir
<< dendl
;
1962 dout(10) << __func__
<< " " << fg
<< " rstat " << pf
->rstat
<< dendl
;
1963 dout(10) << __func__
<< " " << fg
<< " accounted_rstat " << pf
->rstat
<< dendl
;
1964 dout(10) << __func__
<< " " << fg
<< " dirty_old_rstat " << dir
->dirty_old_rstat
<< dendl
;
1966 encode(dir
->first
, tmp
);
1967 encode(pf
->rstat
, tmp
);
1968 encode(pf
->accounted_rstat
, tmp
);
1969 encode(dir
->dirty_old_rstat
, tmp
);
1974 bl
.claim_append(tmp
);
1978 void CInode::decode_lock_inest(bufferlist::const_iterator
& p
)
1985 decode(replica_dirty
, p
);
1986 if (replica_dirty
) {
1987 dout(10) << __func__
<< " setting nestlock dirty flag" << dendl
;
1988 nestlock
.mark_dirty(); // ok bc we're auth and caller will handle
1991 _inode
= allocate_inode(*get_inode());
1992 decode(_inode
->version
, p
);
1997 dout(10) << __func__
<< " taking inode rstat " << rstat
<< " for " << *this << dendl
;
1998 _inode
->rstat
= rstat
; // take inode summation if replica
2006 nest_info_t accounted_rstat
;
2007 decltype(CDir::dirty_old_rstat
) dirty_old_rstat
;
2011 decode(accounted_rstat
, p
);
2012 decode(dirty_old_rstat
, p
);
2013 dout(10) << __func__
<< " " << fg
<< " [" << fgfirst
<< ",head]" << dendl
;
2014 dout(10) << __func__
<< " " << fg
<< " rstat " << rstat
<< dendl
;
2015 dout(10) << __func__
<< " " << fg
<< " accounted_rstat " << accounted_rstat
<< dendl
;
2016 dout(10) << __func__
<< " " << fg
<< " dirty_old_rstat " << dirty_old_rstat
<< dendl
;
2017 CDir
*dir
= get_dirfrag(fg
);
2019 ceph_assert(dir
); // i am auth; i had better have this dir open
2020 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
2021 << " on " << *dir
<< dendl
;
2022 dir
->first
= fgfirst
;
2023 auto _fnode
= CDir::allocate_fnode(*dir
->get_fnode());
2024 _fnode
->rstat
= rstat
;
2025 _fnode
->accounted_rstat
= accounted_rstat
;
2026 dir
->reset_fnode(std::move(_fnode
));
2027 dir
->dirty_old_rstat
.swap(dirty_old_rstat
);
2028 if (!(rstat
== accounted_rstat
) || !dir
->dirty_old_rstat
.empty()) {
2029 dout(10) << fg
<< " setting nestlock updated flag" << dendl
;
2030 nestlock
.mark_dirty(); // ok bc we're auth and caller will handle
2033 if (dir
&& dir
->is_auth()) {
2034 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
2035 << " on " << *dir
<< dendl
;
2036 dir
->first
= fgfirst
;
2037 const auto& pf
= dir
->get_projected_fnode();
2038 finish_scatter_update(&nestlock
, dir
,
2039 _inode
->rstat
.version
, pf
->accounted_rstat
.version
);
2046 reset_inode(std::move(_inode
));
2049 void CInode::encode_lock_ixattr(bufferlist
& bl
)
2051 ENCODE_START(2, 1, bl
);
2052 encode(get_inode()->version
, bl
);
2053 encode(get_inode()->ctime
, bl
);
2055 encode(get_inode()->xattr_version
, bl
);
2059 void CInode::decode_lock_ixattr(bufferlist::const_iterator
& p
)
2061 ceph_assert(!is_auth());
2062 auto _inode
= allocate_inode(*get_inode());
2064 decode(_inode
->version
, p
);
2067 if (_inode
->ctime
< tm
)
2070 if (struct_v
>= 2) {
2071 decode(_inode
->xattr_version
, p
);
2074 reset_inode(std::move(_inode
));
2077 void CInode::encode_lock_isnap(bufferlist
& bl
)
2079 ENCODE_START(1, 1, bl
);
2080 encode(get_inode()->version
, bl
);
2081 encode(get_inode()->ctime
, bl
);
2086 void CInode::decode_lock_isnap(bufferlist::const_iterator
& p
)
2088 ceph_assert(!is_auth());
2089 auto _inode
= allocate_inode(*get_inode());
2091 decode(_inode
->version
, p
);
2094 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
2097 reset_inode(std::move(_inode
));
2100 void CInode::encode_lock_iflock(bufferlist
& bl
)
2102 ENCODE_START(1, 1, bl
);
2103 encode(get_inode()->version
, bl
);
2104 _encode_file_locks(bl
);
2108 void CInode::decode_lock_iflock(bufferlist::const_iterator
& p
)
2110 ceph_assert(!is_auth());
2111 auto _inode
= allocate_inode(*get_inode());
2113 decode(_inode
->version
, p
);
2114 _decode_file_locks(p
);
2116 reset_inode(std::move(_inode
));
2119 void CInode::encode_lock_ipolicy(bufferlist
& bl
)
2121 ENCODE_START(2, 1, bl
);
2123 encode(get_inode()->version
, bl
);
2124 encode(get_inode()->ctime
, bl
);
2125 encode(get_inode()->layout
, bl
, mdcache
->mds
->mdsmap
->get_up_features());
2126 encode(get_inode()->quota
, bl
);
2127 encode(get_inode()->export_pin
, bl
);
2128 encode(get_inode()->export_ephemeral_distributed_pin
, bl
);
2129 encode(get_inode()->export_ephemeral_random_pin
, bl
);
2134 void CInode::decode_lock_ipolicy(bufferlist::const_iterator
& p
)
2136 ceph_assert(!is_auth());
2137 auto _inode
= allocate_inode(*get_inode());
2140 decode(_inode
->version
, p
);
2143 if (_inode
->ctime
< tm
)
2145 decode(_inode
->layout
, p
);
2146 decode(_inode
->quota
, p
);
2147 decode(_inode
->export_pin
, p
);
2148 if (struct_v
>= 2) {
2149 decode(_inode
->export_ephemeral_distributed_pin
, p
);
2150 decode(_inode
->export_ephemeral_random_pin
, p
);
2155 bool pin_updated
= (get_inode()->export_pin
!= _inode
->export_pin
) ||
2156 (get_inode()->export_ephemeral_distributed_pin
!=
2157 _inode
->export_ephemeral_distributed_pin
);
2158 reset_inode(std::move(_inode
));
2159 maybe_export_pin(pin_updated
);
2162 void CInode::encode_lock_state(int type
, bufferlist
& bl
)
2164 ENCODE_START(1, 1, bl
);
2167 encode(parent
->first
, bl
);
2170 case CEPH_LOCK_IAUTH
:
2171 encode_lock_iauth(bl
);
2174 case CEPH_LOCK_ILINK
:
2175 encode_lock_ilink(bl
);
2178 case CEPH_LOCK_IDFT
:
2179 encode_lock_idft(bl
);
2182 case CEPH_LOCK_IFILE
:
2183 encode_lock_ifile(bl
);
2186 case CEPH_LOCK_INEST
:
2187 encode_lock_inest(bl
);
2190 case CEPH_LOCK_IXATTR
:
2191 encode_lock_ixattr(bl
);
2194 case CEPH_LOCK_ISNAP
:
2195 encode_lock_isnap(bl
);
2198 case CEPH_LOCK_IFLOCK
:
2199 encode_lock_iflock(bl
);
2202 case CEPH_LOCK_IPOLICY
:
2203 encode_lock_ipolicy(bl
);
2212 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2214 void CInode::decode_lock_state(int type
, const bufferlist
& bl
)
2216 auto p
= bl
.cbegin();
2222 decode(newfirst
, p
);
2223 if (!is_auth() && newfirst
!= first
) {
2224 dout(10) << __func__
<< " first " << first
<< " -> " << newfirst
<< dendl
;
2228 decode(newfirst
, p
);
2229 if (!parent
->is_auth() && newfirst
!= parent
->first
) {
2230 dout(10) << __func__
<< " parent first " << first
<< " -> " << newfirst
<< dendl
;
2231 parent
->first
= newfirst
;
2236 case CEPH_LOCK_IAUTH
:
2237 decode_lock_iauth(p
);
2240 case CEPH_LOCK_ILINK
:
2241 decode_lock_ilink(p
);
2244 case CEPH_LOCK_IDFT
:
2245 decode_lock_idft(p
);
2248 case CEPH_LOCK_IFILE
:
2249 decode_lock_ifile(p
);
2252 case CEPH_LOCK_INEST
:
2253 decode_lock_inest(p
);
2256 case CEPH_LOCK_IXATTR
:
2257 decode_lock_ixattr(p
);
2260 case CEPH_LOCK_ISNAP
:
2261 decode_lock_isnap(p
);
2264 case CEPH_LOCK_IFLOCK
:
2265 decode_lock_iflock(p
);
2268 case CEPH_LOCK_IPOLICY
:
2269 decode_lock_ipolicy(p
);
2279 bool CInode::is_dirty_scattered()
2282 filelock
.is_dirty_or_flushing() ||
2283 nestlock
.is_dirty_or_flushing() ||
2284 dirfragtreelock
.is_dirty_or_flushing();
2287 void CInode::clear_scatter_dirty()
2289 filelock
.remove_dirty();
2290 nestlock
.remove_dirty();
2291 dirfragtreelock
.remove_dirty();
2294 void CInode::clear_dirty_scattered(int type
)
2296 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2297 ceph_assert(is_dir());
2299 case CEPH_LOCK_IFILE
:
2300 item_dirty_dirfrag_dir
.remove_myself();
2303 case CEPH_LOCK_INEST
:
2304 item_dirty_dirfrag_nest
.remove_myself();
2307 case CEPH_LOCK_IDFT
:
2308 item_dirty_dirfrag_dirfragtree
.remove_myself();
2318 * when we initially scatter a lock, we need to check if any of the dirfrags
2319 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2321 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2322 void CInode::start_scatter(ScatterLock
*lock
)
2324 dout(10) << __func__
<< " " << *lock
<< " on " << *this << dendl
;
2325 ceph_assert(is_auth());
2326 const auto& pi
= get_projected_inode();
2328 for (const auto &p
: dirfrags
) {
2329 frag_t fg
= p
.first
;
2330 CDir
*dir
= p
.second
;
2331 const auto& pf
= dir
->get_projected_fnode();
2332 dout(20) << fg
<< " " << *dir
<< dendl
;
2334 if (!dir
->is_auth())
2337 switch (lock
->get_type()) {
2338 case CEPH_LOCK_IFILE
:
2339 finish_scatter_update(lock
, dir
, pi
->dirstat
.version
, pf
->accounted_fragstat
.version
);
2342 case CEPH_LOCK_INEST
:
2343 finish_scatter_update(lock
, dir
, pi
->rstat
.version
, pf
->accounted_rstat
.version
);
2346 case CEPH_LOCK_IDFT
:
2347 dir
->state_clear(CDir::STATE_DIRTYDFT
);
2354 class C_Inode_FragUpdate
: public MDSLogContextBase
{
2359 MDSRank
*get_mds() override
{return in
->mdcache
->mds
;}
2360 void finish(int r
) override
{
2361 in
->_finish_frag_update(dir
, mut
);
2365 C_Inode_FragUpdate(CInode
*i
, CDir
*d
, MutationRef
& m
) : in(i
), dir(d
), mut(m
) {}
2368 void CInode::finish_scatter_update(ScatterLock
*lock
, CDir
*dir
,
2369 version_t inode_version
, version_t dir_accounted_version
)
2371 frag_t fg
= dir
->get_frag();
2372 ceph_assert(dir
->is_auth());
2374 if (dir
->is_frozen()) {
2375 dout(10) << __func__
<< " " << fg
<< " frozen, marking " << *lock
<< " stale " << *dir
<< dendl
;
2376 } else if (dir
->get_version() == 0) {
2377 dout(10) << __func__
<< " " << fg
<< " not loaded, marking " << *lock
<< " stale " << *dir
<< dendl
;
2379 if (dir_accounted_version
!= inode_version
) {
2380 dout(10) << __func__
<< " " << fg
<< " journaling accounted scatterstat update v" << inode_version
<< dendl
;
2382 MDLog
*mdlog
= mdcache
->mds
->mdlog
;
2383 MutationRef
mut(new MutationImpl());
2384 mut
->ls
= mdlog
->get_current_segment();
2386 auto pf
= dir
->project_fnode(mut
);
2388 std::string_view ename
;
2389 switch (lock
->get_type()) {
2390 case CEPH_LOCK_IFILE
:
2391 pf
->fragstat
.version
= inode_version
;
2392 pf
->accounted_fragstat
= pf
->fragstat
;
2393 ename
= "lock ifile accounted scatter stat update";
2395 case CEPH_LOCK_INEST
:
2396 pf
->rstat
.version
= inode_version
;
2397 pf
->accounted_rstat
= pf
->rstat
;
2398 ename
= "lock inest accounted scatter stat update";
2400 if (!is_auth() && lock
->get_state() == LOCK_MIX
) {
2401 dout(10) << __func__
<< " try to assimilate dirty rstat on "
2403 dir
->assimilate_dirty_rstat_inodes(mut
);
2411 EUpdate
*le
= new EUpdate(mdlog
, ename
);
2412 mdlog
->start_entry(le
);
2413 le
->metablob
.add_dir_context(dir
);
2414 le
->metablob
.add_dir(dir
, true);
2416 ceph_assert(!dir
->is_frozen());
2419 if (lock
->get_type() == CEPH_LOCK_INEST
&&
2420 !is_auth() && lock
->get_state() == LOCK_MIX
) {
2421 dout(10) << __func__
<< " finish assimilating dirty rstat on "
2423 dir
->assimilate_dirty_rstat_inodes_finish(&le
->metablob
);
2425 if (!(pf
->rstat
== pf
->accounted_rstat
)) {
2426 if (!mut
->is_wrlocked(&nestlock
)) {
2427 mdcache
->mds
->locker
->wrlock_force(&nestlock
, mut
);
2430 mdcache
->mds
->locker
->mark_updated_scatterlock(&nestlock
);
2431 mut
->ls
->dirty_dirfrag_nest
.push_back(&item_dirty_dirfrag_nest
);
2435 pf
->version
= dir
->pre_dirty();
2437 mdlog
->submit_entry(le
, new C_Inode_FragUpdate(this, dir
, mut
));
2439 dout(10) << __func__
<< " " << fg
<< " accounted " << *lock
2440 << " scatter stat unchanged at v" << dir_accounted_version
<< dendl
;
2445 void CInode::_finish_frag_update(CDir
*dir
, MutationRef
& mut
)
2447 dout(10) << __func__
<< " on " << *dir
<< dendl
;
2449 mdcache
->mds
->locker
->drop_locks(mut
.get());
2455 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2456 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2457 * because the frag is auth and frozen, or that the replica couldn't for the same
2458 * reason. hopefully it will get updated the next time the lock cycles.
2460 * we have two dimensions of behavior:
2461 * - we may be (auth and !frozen), and able to update, or not.
2462 * - the frag may be stale, or not.
2464 * if the frag is non-stale, we want to assimilate the diff into the
2465 * inode, regardless of whether it's auth or updateable.
2467 * if we update the frag, we want to set accounted_fragstat = frag,
2468 * both if we took the diff or it was stale and we are making it
2471 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2472 void CInode::finish_scatter_gather_update(int type
, MutationRef
& mut
)
2474 LogChannelRef clog
= mdcache
->mds
->clog
;
2476 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2477 ceph_assert(is_auth());
2480 case CEPH_LOCK_IFILE
:
2482 fragtree_t tmpdft
= dirfragtree
;
2483 struct frag_info_t dirstat
;
2484 bool dirstat_valid
= true;
2487 ceph_assert(is_auth());
2488 auto pi
= _get_projected_inode();
2490 bool touched_mtime
= false, touched_chattr
= false;
2491 dout(20) << " orig dirstat " << pi
->dirstat
<< dendl
;
2492 pi
->dirstat
.version
++;
2493 for (const auto &p
: dirfrags
) {
2494 frag_t fg
= p
.first
;
2495 CDir
*dir
= p
.second
;
2496 dout(20) << fg
<< " " << *dir
<< dendl
;
2499 if (dir
->get_version() != 0) {
2500 update
= dir
->is_auth() && !dir
->is_frozen();
2503 dirstat_valid
= false;
2506 CDir::fnode_const_ptr pf
;
2509 pf
= dir
->project_fnode(mut
);
2511 pf
= dir
->get_projected_fnode();
2514 if (pf
->accounted_fragstat
.version
== pi
->dirstat
.version
- 1) {
2515 dout(20) << fg
<< " fragstat " << pf
->fragstat
<< dendl
;
2516 dout(20) << fg
<< " accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
2517 pi
->dirstat
.add_delta(pf
->fragstat
, pf
->accounted_fragstat
, &touched_mtime
, &touched_chattr
);
2519 dout(20) << fg
<< " skipping STALE accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
2522 if (pf
->fragstat
.nfiles
< 0 ||
2523 pf
->fragstat
.nsubdirs
< 0) {
2524 clog
->error() << "bad/negative dir size on "
2525 << dir
->dirfrag() << " " << pf
->fragstat
;
2526 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter
);
2528 auto _pf
= const_cast<fnode_t
*>(pf
.get());
2529 if (pf
->fragstat
.nfiles
< 0)
2530 _pf
->fragstat
.nfiles
= 0;
2531 if (pf
->fragstat
.nsubdirs
< 0)
2532 _pf
->fragstat
.nsubdirs
= 0;
2536 auto _pf
= const_cast<fnode_t
*>(pf
.get());
2537 _pf
->accounted_fragstat
= _pf
->fragstat
;
2538 _pf
->fragstat
.version
= _pf
->accounted_fragstat
.version
= pi
->dirstat
.version
;
2539 _pf
->version
= dir
->pre_dirty();
2540 dout(10) << fg
<< " updated accounted_fragstat " << pf
->fragstat
<< " on " << *dir
<< dendl
;
2543 tmpdft
.force_to_leaf(g_ceph_context
, fg
);
2544 dirstat
.add(pf
->fragstat
);
2547 pi
->mtime
= pi
->ctime
= pi
->dirstat
.mtime
;
2551 dout(20) << " final dirstat " << pi
->dirstat
<< dendl
;
2553 if (dirstat_valid
&& !dirstat
.same_sums(pi
->dirstat
)) {
2555 tmpdft
.get_leaves_under(frag_t(), leaves
);
2556 for (const auto& leaf
: leaves
) {
2557 if (!dirfrags
.count(leaf
)) {
2558 dirstat_valid
= false;
2562 if (dirstat_valid
) {
2563 if (state_test(CInode::STATE_REPAIRSTATS
)) {
2564 dout(20) << " dirstat mismatch, fixing" << dendl
;
2566 clog
->error() << "unmatched fragstat on " << ino() << ", inode has "
2567 << pi
->dirstat
<< ", dirfrags have " << dirstat
;
2568 ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter
);
2570 // trust the dirfrags for now
2571 version_t v
= pi
->dirstat
.version
;
2572 if (pi
->dirstat
.mtime
> dirstat
.mtime
)
2573 dirstat
.mtime
= pi
->dirstat
.mtime
;
2574 if (pi
->dirstat
.change_attr
> dirstat
.change_attr
)
2575 dirstat
.change_attr
= pi
->dirstat
.change_attr
;
2576 pi
->dirstat
= dirstat
;
2577 pi
->dirstat
.version
= v
;
2581 if (pi
->dirstat
.nfiles
< 0 || pi
->dirstat
.nsubdirs
< 0) {
2583 make_path_string(path
);
2584 clog
->error() << "Inconsistent statistics detected: fragstat on inode "
2585 << ino() << " (" << path
<< "), inode has " << pi
->dirstat
;
2586 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter
);
2588 if (pi
->dirstat
.nfiles
< 0)
2589 pi
->dirstat
.nfiles
= 0;
2590 if (pi
->dirstat
.nsubdirs
< 0)
2591 pi
->dirstat
.nsubdirs
= 0;
2596 case CEPH_LOCK_INEST
:
2599 ceph_assert(is_auth());
2601 fragtree_t tmpdft
= dirfragtree
;
2603 bool rstat_valid
= true;
2606 if (const sr_t
*srnode
= get_projected_srnode(); srnode
)
2607 rstat
.rsnaps
= srnode
->snaps
.size();
2609 auto pi
= _get_projected_inode();
2610 dout(20) << " orig rstat " << pi
->rstat
<< dendl
;
2611 pi
->rstat
.version
++;
2612 for (const auto &p
: dirfrags
) {
2613 frag_t fg
= p
.first
;
2614 CDir
*dir
= p
.second
;
2615 dout(20) << fg
<< " " << *dir
<< dendl
;
2618 if (dir
->get_version() != 0) {
2619 update
= dir
->is_auth() && !dir
->is_frozen();
2622 rstat_valid
= false;
2625 CDir::fnode_const_ptr pf
;
2628 pf
= dir
->project_fnode(mut
);
2630 pf
= dir
->get_projected_fnode();
2633 if (pf
->accounted_rstat
.version
== pi
->rstat
.version
-1) {
2634 // only pull this frag's dirty rstat inodes into the frag if
2635 // the frag is non-stale and updateable. if it's stale,
2636 // that info will just get thrown out!
2638 dir
->assimilate_dirty_rstat_inodes(mut
);
2640 dout(20) << fg
<< " rstat " << pf
->rstat
<< dendl
;
2641 dout(20) << fg
<< " accounted_rstat " << pf
->accounted_rstat
<< dendl
;
2642 dout(20) << fg
<< " dirty_old_rstat " << dir
->dirty_old_rstat
<< dendl
;
2643 mdcache
->project_rstat_frag_to_inode(pf
->rstat
, pf
->accounted_rstat
,
2644 dir
->first
, CEPH_NOSNAP
, this, true);
2645 for (auto &p
: dir
->dirty_old_rstat
) {
2646 mdcache
->project_rstat_frag_to_inode(p
.second
.rstat
, p
.second
.accounted_rstat
,
2647 p
.second
.first
, p
.first
, this, true);
2649 if (update
) // dir contents not valid if frozen or non-auth
2650 dir
->check_rstats();
2652 dout(20) << fg
<< " skipping STALE accounted_rstat " << pf
->accounted_rstat
<< dendl
;
2655 auto _pf
= const_cast<fnode_t
*>(pf
.get());
2656 _pf
->accounted_rstat
= pf
->rstat
;
2657 _pf
->rstat
.version
= _pf
->accounted_rstat
.version
= pi
->rstat
.version
;
2658 _pf
->version
= dir
->pre_dirty();
2659 dir
->dirty_old_rstat
.clear();
2660 dir
->check_rstats();
2661 dout(10) << fg
<< " updated accounted_rstat " << pf
->rstat
<< " on " << *dir
<< dendl
;
2664 tmpdft
.force_to_leaf(g_ceph_context
, fg
);
2665 rstat
.add(pf
->rstat
);
2667 dout(20) << " final rstat " << pi
->rstat
<< dendl
;
2669 if (rstat_valid
&& !rstat
.same_sums(pi
->rstat
)) {
2671 tmpdft
.get_leaves_under(frag_t(), leaves
);
2672 for (const auto& leaf
: leaves
) {
2673 if (!dirfrags
.count(leaf
)) {
2674 rstat_valid
= false;
2679 if (state_test(CInode::STATE_REPAIRSTATS
)) {
2680 dout(20) << " rstat mismatch, fixing" << dendl
;
2682 clog
->error() << "inconsistent rstat on inode " << ino()
2683 << ", inode has " << pi
->rstat
2684 << ", directory fragments have " << rstat
;
2685 ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter
);
2687 // trust the dirfrag for now
2688 version_t v
= pi
->rstat
.version
;
2689 if (pi
->rstat
.rctime
> rstat
.rctime
)
2690 rstat
.rctime
= pi
->rstat
.rctime
;
2692 pi
->rstat
.version
= v
;
2696 mdcache
->broadcast_quota_to_client(this);
2700 case CEPH_LOCK_IDFT
:
// Second phase of a scatter-gather update: after finish_scatter_gather_update()
// has projected the per-frag accounted_* stats, journal each affected dirfrag
// into 'metablob'.  'type' is the scatterlock type that was gathered
// (CEPH_LOCK_IFILE / CEPH_LOCK_INEST / CEPH_LOCK_IDFT).
2708 void CInode::finish_scatter_gather_update_accounted(int type
, EMetaBlob
*metablob
)
2710 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
// only the auth MDS gathers and journals accounted stats
2711 ceph_assert(is_auth());
// walk every open dirfrag of this inode
2713 for (const auto &p
: dirfrags
) {
2714 CDir
*dir
= p
.second
;
// skip replicas, unfetched (version 0) and frozen dirfrags -- the gather
// pass did not project fnodes for those
2715 if (!dir
->is_auth() || dir
->get_version() == 0 || dir
->is_frozen())
// the dirfragtree lock carries no per-frag accounted state to journal
2718 if (type
== CEPH_LOCK_IDFT
)
2719 continue; // nothing to do.
// for nestlock, fold the previously assimilated dirty-rstat inodes
// into the journal entry as well
2721 if (type
== CEPH_LOCK_INEST
)
2722 dir
->assimilate_dirty_rstat_inodes_finish(metablob
);
2724 dout(10) << " journaling updated frag accounted_ on " << *dir
<< dendl
;
// the gather pass must have left this dirfrag projected
2725 ceph_assert(dir
->is_projected());
2726 metablob
->add_dir(dir
, true);
2732 bool CInode::is_frozen() const
2734 if (is_frozen_inode()) return true;
2735 if (parent
&& parent
->dir
->is_frozen()) return true;
2739 bool CInode::is_frozen_dir() const
2741 if (parent
&& parent
->dir
->is_frozen_dir()) return true;
2745 bool CInode::is_freezing() const
2747 if (is_freezing_inode()) return true;
2748 if (parent
&& parent
->dir
->is_freezing()) return true;
// Register a context to be woken once dirfrag 'fg' of this inode becomes
// available (e.g. once the CDir is opened).  Waiters are drained by
// take_dir_waiting()/take_waiting().
2752 void CInode::add_dir_waiter(frag_t fg
, MDSContext
*c
)
// first dir waiter on this inode; NOTE(review): upstream takes a pin here
// (the line between 2754 and 2756 is not visible in this extract) --
// confirm the get(PIN_DIRWAITER) against the full source
2754 if (waiting_on_dir
.empty())
2756 waiting_on_dir
[fg
].push_back(c
);
2757 dout(10) << __func__
<< " frag " << fg
<< " " << c
<< " on " << *this << dendl
;
// Move all contexts waiting on dirfrag 'fg' into 'ls'; the caller is
// responsible for completing them.  No-op when nothing is waiting.
2760 void CInode::take_dir_waiting(frag_t fg
, MDSContext::vec
& ls
)
// fast path: no dir waiters at all
2762 if (waiting_on_dir
.empty())
2765 auto it
= waiting_on_dir
.find(fg
);
2766 if (it
!= waiting_on_dir
.end()) {
2767 dout(10) << __func__
<< " frag " << fg
<< " on " << *this << dendl
;
// splice this frag's waiter list onto the caller's vector, then drop
// the map entry
2768 auto& waiting
= it
->second
;
2769 ls
.insert(ls
.end(), waiting
.begin(), waiting
.end());
2770 waiting_on_dir
.erase(it
);
// last dir waiter gone; NOTE(review): upstream drops the matching pin in
// this branch (line not visible in this extract) -- confirm against the
// full source
2772 if (waiting_on_dir
.empty())
// Register a generic waiter on this inode.  SINGLEAUTH/UNFREEZE waits that
// are not caused by *this* inode's own ambiguous/freezing/frozen state are
// delegated up to the parent dirfrag; everything else is queued locally on
// the MDSCacheObject waiter list.
2777 void CInode::add_waiter(uint64_t tag
, MDSContext
*c
)
2779 dout(10) << __func__
<< " tag " << std::hex
<< tag
<< std::dec
<< " " << c
2780 << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH
)
2781 << " !frozen " << !is_frozen_inode()
2782 << " !freezing " << !is_freezing_inode()
2784 // wait on the directory?
2785 // make sure its not the inode that is explicitly ambiguous|freezing|frozen
2786 if (((tag
& WAIT_SINGLEAUTH
) && !state_test(STATE_AMBIGUOUSAUTH
)) ||
2787 ((tag
& WAIT_UNFREEZE
) &&
2788 !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
// the condition is an ancestor's, not ours: let the parent dirfrag hold it
2789 dout(15) << "passing waiter up tree" << dendl
;
2790 parent
->dir
->add_waiter(tag
, c
);
// otherwise this inode itself is the blocker: queue the waiter here
2793 dout(15) << "taking waiter here" << dendl
;
2794 MDSCacheObject::add_waiter(tag
, c
);
// Collect waiters matching 'mask' into 'ls'.  WAIT_DIR drains *all* per-frag
// dir waiters (regardless of frag); the generic mask is then handled by the
// MDSCacheObject base implementation.
2797 void CInode::take_waiting(uint64_t mask
, MDSContext::vec
& ls
)
2799 if ((mask
& WAIT_DIR
) && !waiting_on_dir
.empty()) {
2800 // take all dentry waiters
2801 while (!waiting_on_dir
.empty()) {
2802 auto it
= waiting_on_dir
.begin();
2803 dout(10) << __func__
<< " dirfrag " << it
->first
<< " on " << *this << dendl
;
// splice each frag's waiter list onto the caller's vector
2804 auto& waiting
= it
->second
;
2805 ls
.insert(ls
.end(), waiting
.begin(), waiting
.end());
2806 waiting_on_dir
.erase(it
);
// delegate the remaining (non-dir) wait bits to the base class list
2812 MDSCacheObject::take_waiting(mask
, ls
);
// Complete an in-progress inode freeze once the auth-pin count has dropped
// to the allowance granted in freeze_inode() and the parent dirfrag is no
// longer suppressing inode freezes.
2815 void CInode::maybe_finish_freeze_inode()
2817 CDir
*dir
= get_parent_dir();
// not yet: still more auth pins than allowed, or parent suppresses freezing
2818 if (auth_pins
> auth_pin_freeze_allowance
|| dir
->frozen_inode_suppressed
)
2821 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl
;
2822 ceph_assert(auth_pins
== auth_pin_freeze_allowance
);
// transition FREEZING -> FROZEN
2825 state_clear(STATE_FREEZING
);
2826 state_set(STATE_FROZEN
);
// leave the parent's freezing_inodes list and count ourselves as frozen
2828 item_freezing_inode
.remove_myself();
2829 dir
->num_frozen_inodes
++;
// wake anyone waiting for the freeze to complete
2831 finish_waiting(WAIT_FROZEN
);
// Try to freeze this inode (used e.g. for migration/export), tolerating up
// to 'auth_pin_allowance' outstanding auth pins.  Returns true when the
// inode ends up FROZEN; otherwise it is left FREEZING and the freeze
// completes later via maybe_finish_freeze_inode().
2834 bool CInode::freeze_inode(int auth_pin_allowance
)
2836 CDir
*dir
= get_parent_dir();
2839 ceph_assert(auth_pin_allowance
> 0); // otherwise we need to adjust parent's nested_auth_pins
2840 ceph_assert(auth_pins
>= auth_pin_allowance
);
// immediate freeze: pins already at the allowance and parent not suppressing
2841 if (auth_pins
== auth_pin_allowance
&& !dir
->frozen_inode_suppressed
) {
2842 dout(10) << "freeze_inode - frozen" << dendl
;
2843 if (!state_test(STATE_FROZEN
)) {
2845 state_set(STATE_FROZEN
);
2846 dir
->num_frozen_inodes
++;
// slow path: record the allowance and register as FREEZING on the parent
2851 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance
<< dendl
;
2852 auth_pin_freeze_allowance
= auth_pin_allowance
;
2853 dir
->freezing_inodes
.push_back(&item_freezing_inode
);
2856 state_set(STATE_FREEZING
);
// kick out lock caches holding auth pins on the parent dirfrag so the pin
// count can drain
2858 if (!dir
->lock_caches_with_auth_pins
.empty())
2859 mdcache
->mds
->locker
->invalidate_lock_caches(dir
);
// likewise invalidate lock caches on every cached lock of this inode;
// the list is zero-terminated
2861 const static int lock_types
[] = {
2862 CEPH_LOCK_IVERSION
, CEPH_LOCK_IFILE
, CEPH_LOCK_IAUTH
, CEPH_LOCK_ILINK
, CEPH_LOCK_IDFT
,
2863 CEPH_LOCK_IXATTR
, CEPH_LOCK_ISNAP
, CEPH_LOCK_INEST
, CEPH_LOCK_IFLOCK
, CEPH_LOCK_IPOLICY
, 0
2865 for (int i
= 0; lock_types
[i
]; ++i
) {
2866 auto lock
= get_lock(lock_types
[i
]);
2867 if (lock
->is_cached())
2868 mdcache
->mds
->locker
->invalidate_lock_caches(lock
);
2870 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2871 // and finish freezing the inode
2872 return state_test(STATE_FROZEN
);
// Undo a freeze (whether still FREEZING or fully FROZEN) and collect the
// contexts waiting on WAIT_UNFREEZE into 'finished' for the caller to run.
2875 void CInode::unfreeze_inode(MDSContext::vec
& finished
)
2877 dout(10) << __func__
<< dendl
;
2878 if (state_test(STATE_FREEZING
)) {
// freeze never completed: just leave the parent's freezing_inodes list
2879 state_clear(STATE_FREEZING
);
2881 item_freezing_inode
.remove_myself();
2882 } else if (state_test(STATE_FROZEN
)) {
// fully frozen: also drop the parent's frozen-inode count
2883 state_clear(STATE_FROZEN
);
2885 get_parent_dir()->num_frozen_inodes
--;
2888 take_waiting(WAIT_UNFREEZE
, finished
);
2891 void CInode::unfreeze_inode()
2893 MDSContext::vec finished
;
2894 unfreeze_inode(finished
);
2895 mdcache
->mds
->queue_waiters(finished
);
// Pin the frozen state: only valid on an already-FROZEN inode.  Also bumps
// the parent dirfrag's frozen-inode count.
2898 void CInode::freeze_auth_pin()
2900 ceph_assert(state_test(CInode::STATE_FROZEN
));
2901 state_set(CInode::STATE_FROZENAUTHPIN
);
2902 get_parent_dir()->num_frozen_inodes
++;
// Release a freeze_auth_pin().  If the inode is no longer freezing/frozen
// afterwards, wake everything blocked on WAIT_UNFREEZE.
2905 void CInode::unfreeze_auth_pin()
2907 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN
));
2908 state_clear(CInode::STATE_FROZENAUTHPIN
);
2909 get_parent_dir()->num_frozen_inodes
--;
// if neither FREEZING nor FROZEN remains, the unfreeze waiters can run now
2910 if (!state_test(STATE_FREEZING
|STATE_FROZEN
)) {
2911 MDSContext::vec finished
;
2912 take_waiting(WAIT_UNFREEZE
, finished
);
2913 mdcache
->mds
->queue_waiters(finished
);
// Resolve ambiguous authority on this inode and hand the WAIT_SINGLEAUTH
// waiters back to the caller via 'finished'.
2917 void CInode::clear_ambiguous_auth(MDSContext::vec
& finished
)
2919 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH
));
2920 state_clear(CInode::STATE_AMBIGUOUSAUTH
);
2921 take_waiting(CInode::WAIT_SINGLEAUTH
, finished
);
2924 void CInode::clear_ambiguous_auth()
2926 MDSContext::vec finished
;
2927 clear_ambiguous_auth(finished
);
2928 mdcache
->mds
->queue_waiters(finished
);
2932 bool CInode::can_auth_pin(int *err_ret
) const {
2936 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2937 err
= ERR_EXPORTING_INODE
;
2940 return parent
->can_auth_pin(err_ret
);
2948 void CInode::auth_pin(void *by
)
2954 #ifdef MDS_AUTHPIN_SET
2955 auth_pin_set
.insert(by
);
2958 dout(10) << "auth_pin by " << by
<< " on " << *this << " now " << auth_pins
<< dendl
;
2961 parent
->adjust_nested_auth_pins(1, this);
2964 void CInode::auth_unpin(void *by
)
2968 #ifdef MDS_AUTHPIN_SET
2970 auto it
= auth_pin_set
.find(by
);
2971 ceph_assert(it
!= auth_pin_set
.end());
2972 auth_pin_set
.erase(it
);
2979 dout(10) << "auth_unpin by " << by
<< " on " << *this << " now " << auth_pins
<< dendl
;
2981 ceph_assert(auth_pins
>= 0);
2984 parent
->adjust_nested_auth_pins(-1, by
);
2986 if (is_freezing_inode())
2987 maybe_finish_freeze_inode();
2992 mds_authority_t
CInode::authority() const
2994 if (inode_auth
.first
>= 0)
2998 return parent
->dir
->authority();
3000 // new items that are not yet linked in (in the committed plane) belong
3001 // to their first parent.
3002 if (!projected_parent
.empty())
3003 return projected_parent
.front()->dir
->authority();
3005 return CDIR_AUTH_UNDEF
;
3011 snapid_t
CInode::get_oldest_snap()
3014 if (is_any_old_inodes())
3015 t
= get_old_inodes()->begin()->second
.first
;
3016 return std::min(t
, oldest_snap
);
3019 const CInode::mempool_old_inode
& CInode::cow_old_inode(snapid_t follows
, bool cow_head
)
3021 ceph_assert(follows
>= first
);
3023 const auto& pi
= cow_head
? get_projected_inode() : get_previous_projected_inode();
3024 const auto& px
= cow_head
? get_projected_xattrs() : get_previous_projected_xattrs();
3026 auto _old_inodes
= allocate_old_inode_map();
3028 *_old_inodes
= *old_inodes
;
3030 mempool_old_inode
&old
= (*_old_inodes
)[follows
];
3034 dout(10) << " " << px
->size() << " xattrs cowed, " << *px
<< dendl
;
3038 if (first
< oldest_snap
)
3039 oldest_snap
= first
;
3041 old
.inode
.trim_client_ranges(follows
);
3043 if (g_conf()->mds_snap_rstat
&&
3044 !(old
.inode
.rstat
== old
.inode
.accounted_rstat
))
3045 dirty_old_rstats
.insert(follows
);
3049 dout(10) << __func__
<< " " << (cow_head
? "head" : "previous_head" )
3050 << " to [" << old
.first
<< "," << follows
<< "] on "
3053 reset_old_inodes(std::move(_old_inodes
));
3057 void CInode::pre_cow_old_inode()
3059 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
3060 dout(20) << __func__
<< " follows " << follows
<< " on " << *this << dendl
;
3061 if (first
<= follows
)
3062 cow_old_inode(follows
, true);
3065 bool CInode::has_snap_data(snapid_t snapid
)
3067 bool found
= snapid
>= first
&& snapid
<= last
;
3068 if (!found
&& is_any_old_inodes()) {
3069 auto p
= old_inodes
->lower_bound(snapid
);
3070 if (p
!= old_inodes
->end()) {
3071 if (p
->second
.first
> snapid
) {
3072 if (p
!= old_inodes
->begin())
3075 if (p
->second
.first
<= snapid
&& snapid
<= p
->first
) {
3083 void CInode::purge_stale_snap_data(const set
<snapid_t
>& snaps
)
3085 dout(10) << __func__
<< " " << snaps
<< dendl
;
3087 if (!get_old_inodes())
3090 std::vector
<snapid_t
> to_remove
;
3091 for (auto p
: *get_old_inodes()) {
3092 const snapid_t
&id
= p
.first
;
3093 const auto &s
= snaps
.lower_bound(p
.second
.first
);
3094 if (s
== snaps
.end() || *s
> id
) {
3095 dout(10) << " purging old_inode [" << p
.second
.first
<< "," << id
<< "]" << dendl
;
3096 to_remove
.push_back(id
);
3100 if (to_remove
.size() == get_old_inodes()->size()) {
3101 reset_old_inodes(old_inode_map_ptr());
3102 } else if (!to_remove
.empty()) {
3103 auto _old_inodes
= allocate_old_inode_map(*get_old_inodes());
3104 for (auto id
: to_remove
)
3105 _old_inodes
->erase(id
);
3106 reset_old_inodes(std::move(_old_inodes
));
3111 * pick/create an old_inode
3113 snapid_t
CInode::pick_old_inode(snapid_t snap
) const
3115 if (is_any_old_inodes()) {
3116 auto it
= old_inodes
->lower_bound(snap
); // p is first key >= to snap
3117 if (it
!= old_inodes
->end() && it
->second
.first
<= snap
) {
3118 dout(10) << __func__
<< " snap " << snap
<< " -> [" << it
->second
.first
<< "," << it
->first
<< "]" << dendl
;
3122 dout(10) << __func__
<< " snap " << snap
<< " -> nothing" << dendl
;
3126 void CInode::open_snaprealm(bool nosplit
)
3129 SnapRealm
*parent
= find_snaprealm();
3130 snaprealm
= new SnapRealm(mdcache
, this);
3132 dout(10) << __func__
<< " " << snaprealm
3133 << " parent is " << parent
3135 dout(30) << " siblings are " << parent
->open_children
<< dendl
;
3136 snaprealm
->parent
= parent
;
3138 parent
->split_at(snaprealm
);
3139 parent
->open_children
.insert(snaprealm
);
3143 void CInode::close_snaprealm(bool nojoin
)
3146 dout(15) << __func__
<< " " << *snaprealm
<< dendl
;
3147 if (snaprealm
->parent
) {
3148 snaprealm
->parent
->open_children
.erase(snaprealm
);
3150 //snaprealm->parent->join(snaprealm);
3157 SnapRealm
*CInode::find_snaprealm() const
3159 const CInode
*cur
= this;
3160 while (!cur
->snaprealm
) {
3161 const CDentry
*pdn
= cur
->get_oldest_parent_dn();
3164 cur
= pdn
->get_dir()->get_inode();
3166 return cur
->snaprealm
;
3169 void CInode::encode_snap_blob(bufferlist
&snapbl
)
3173 encode(snaprealm
->srnode
, snapbl
);
3174 dout(20) << __func__
<< " " << *snaprealm
<< dendl
;
3177 void CInode::decode_snap_blob(const bufferlist
& snapbl
)
3180 if (snapbl
.length()) {
3182 auto old_flags
= snaprealm
->srnode
.flags
;
3183 auto p
= snapbl
.cbegin();
3184 decode(snaprealm
->srnode
, p
);
3186 if ((snaprealm
->srnode
.flags
^ old_flags
) & sr_t::PARENT_GLOBAL
) {
3187 snaprealm
->adjust_parent();
3190 dout(20) << __func__
<< " " << *snaprealm
<< dendl
;
3191 } else if (snaprealm
&&
3192 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
3193 ceph_assert(mdcache
->mds
->is_any_replay());
3194 snaprealm
->merge_to(NULL
);
3198 void CInode::encode_snap(bufferlist
& bl
)
3200 ENCODE_START(1, 1, bl
);
3202 encode_snap_blob(snapbl
);
3204 encode(oldest_snap
, bl
);
3208 void CInode::decode_snap(bufferlist::const_iterator
& p
)
3213 decode(oldest_snap
, p
);
3214 decode_snap_blob(snapbl
);
3218 // =============================================
3220 client_t
CInode::calc_ideal_loner()
3222 if (mdcache
->is_readonly())
3224 if (!get_mds_caps_wanted().empty())
3228 client_t loner
= -1;
3229 for (const auto &p
: client_caps
) {
3230 if (!p
.second
.is_stale() &&
3232 !has_subtree_or_exporting_dirfrag() :
3233 (p
.second
.wanted() & (CEPH_CAP_ANY_WR
|CEPH_CAP_FILE_RD
)))) {
3243 bool CInode::choose_ideal_loner()
3245 want_loner_cap
= calc_ideal_loner();
3246 int changed
= false;
3247 if (loner_cap
>= 0 && loner_cap
!= want_loner_cap
) {
3248 if (!try_drop_loner())
3253 if (want_loner_cap
>= 0) {
3254 if (loner_cap
< 0) {
3255 set_loner_cap(want_loner_cap
);
3258 ceph_assert(loner_cap
== want_loner_cap
);
3263 bool CInode::try_set_loner()
3265 ceph_assert(want_loner_cap
>= 0);
3266 if (loner_cap
>= 0 && loner_cap
!= want_loner_cap
)
3268 set_loner_cap(want_loner_cap
);
3272 void CInode::set_loner_cap(client_t l
)
3275 authlock
.set_excl_client(loner_cap
);
3276 filelock
.set_excl_client(loner_cap
);
3277 linklock
.set_excl_client(loner_cap
);
3278 xattrlock
.set_excl_client(loner_cap
);
3281 bool CInode::try_drop_loner()
3286 int other_allowed
= get_caps_allowed_by_type(CAP_ANY
);
3287 Capability
*cap
= get_client_cap(loner_cap
);
3289 (cap
->issued() & ~other_allowed
) == 0) {
3297 // choose new lock state during recovery, based on issued caps
3298 void CInode::choose_lock_state(SimpleLock
*lock
, int allissued
)
3300 int shift
= lock
->get_cap_shift();
3301 int issued
= (allissued
>> shift
) & lock
->get_cap_mask();
3303 if (lock
->is_xlocked()) {
3305 } else if (lock
->get_state() != LOCK_MIX
) {
3306 if (issued
& (CEPH_CAP_GEXCL
| CEPH_CAP_GBUFFER
))
3307 lock
->set_state(LOCK_EXCL
);
3308 else if (issued
& CEPH_CAP_GWR
) {
3309 if (issued
& (CEPH_CAP_GCACHE
| CEPH_CAP_GSHARED
))
3310 lock
->set_state(LOCK_EXCL
);
3312 lock
->set_state(LOCK_MIX
);
3313 } else if (lock
->is_dirty()) {
3314 if (is_replicated())
3315 lock
->set_state(LOCK_MIX
);
3317 lock
->set_state(LOCK_LOCK
);
3319 lock
->set_state(LOCK_SYNC
);
3322 // our states have already been chosen during rejoin.
3323 if (lock
->is_xlocked())
3324 ceph_assert(lock
->get_state() == LOCK_LOCK
);
3328 void CInode::choose_lock_states(int dirty_caps
)
3330 int issued
= get_caps_issued() | dirty_caps
;
3331 if (is_auth() && (issued
& (CEPH_CAP_ANY_EXCL
|CEPH_CAP_ANY_WR
)))
3332 choose_ideal_loner();
3333 choose_lock_state(&filelock
, issued
);
3334 choose_lock_state(&nestlock
, issued
);
3335 choose_lock_state(&dirfragtreelock
, issued
);
3336 choose_lock_state(&authlock
, issued
);
3337 choose_lock_state(&xattrlock
, issued
);
3338 choose_lock_state(&linklock
, issued
);
3341 int CInode::count_nonstale_caps()
3344 for (const auto &p
: client_caps
) {
3345 if (!p
.second
.is_stale())
3351 bool CInode::multiple_nonstale_caps()
3354 for (const auto &p
: client_caps
) {
3355 if (!p
.second
.is_stale()) {
3364 void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map
<int32_t,int32_t>& m
)
3366 bool old_empty
= mds_caps_wanted
.empty();
3367 mds_caps_wanted
.swap(m
);
3368 if (old_empty
!= (bool)mds_caps_wanted
.empty()) {
3370 adjust_num_caps_notable(1);
3372 adjust_num_caps_notable(-1);
3376 void CInode::set_mds_caps_wanted(mds_rank_t mds
, int32_t wanted
)
3378 bool old_empty
= mds_caps_wanted
.empty();
3380 mds_caps_wanted
[mds
] = wanted
;
3382 adjust_num_caps_notable(1);
3383 } else if (!old_empty
) {
3384 mds_caps_wanted
.erase(mds
);
3385 if (mds_caps_wanted
.empty())
3386 adjust_num_caps_notable(-1);
3390 Capability
*CInode::add_client_cap(client_t client
, Session
*session
,
3391 SnapRealm
*conrealm
, bool new_inode
)
3393 ceph_assert(last
== CEPH_NOSNAP
);
3394 if (client_caps
.empty()) {
3397 containing_realm
= conrealm
;
3399 containing_realm
= find_snaprealm();
3400 containing_realm
->inodes_with_caps
.push_back(&item_caps
);
3401 dout(10) << __func__
<< " first cap, joining realm " << *containing_realm
<< dendl
;
3403 mdcache
->num_inodes_with_caps
++;
3405 parent
->dir
->adjust_num_inodes_with_caps(1);
3408 uint64_t cap_id
= new_inode
? 1 : ++mdcache
->last_cap_id
;
3409 auto ret
= client_caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
),
3410 std::forward_as_tuple(this, session
, cap_id
));
3411 ceph_assert(ret
.second
== true);
3412 Capability
*cap
= &ret
.first
->second
;
3414 cap
->client_follows
= first
-1;
3415 containing_realm
->add_cap(client
, cap
);
3420 void CInode::remove_client_cap(client_t client
)
3422 auto it
= client_caps
.find(client
);
3423 ceph_assert(it
!= client_caps
.end());
3424 Capability
*cap
= &it
->second
;
3426 cap
->item_session_caps
.remove_myself();
3427 cap
->item_revoking_caps
.remove_myself();
3428 cap
->item_client_revoking_caps
.remove_myself();
3429 containing_realm
->remove_cap(client
, cap
);
3431 if (client
== loner_cap
)
3434 if (cap
->is_wanted_notable())
3435 adjust_num_caps_notable(-1);
3437 client_caps
.erase(it
);
3438 if (client_caps
.empty()) {
3439 dout(10) << __func__
<< " last cap, leaving realm " << *containing_realm
<< dendl
;
3441 item_caps
.remove_myself();
3442 containing_realm
= NULL
;
3443 mdcache
->num_inodes_with_caps
--;
3445 parent
->dir
->adjust_num_inodes_with_caps(-1);
3448 //clean up advisory locks
3449 bool fcntl_removed
= fcntl_locks
? fcntl_locks
->remove_all_from(client
) : false;
3450 bool flock_removed
= flock_locks
? flock_locks
->remove_all_from(client
) : false;
3451 if (fcntl_removed
|| flock_removed
) {
3452 MDSContext::vec waiters
;
3453 take_waiting(CInode::WAIT_FLOCK
, waiters
);
3454 mdcache
->mds
->queue_waiters(waiters
);
3458 void CInode::move_to_realm(SnapRealm
*realm
)
3460 dout(10) << __func__
<< " joining realm " << *realm
3461 << ", leaving realm " << *containing_realm
<< dendl
;
3462 for (auto& p
: client_caps
) {
3463 containing_realm
->remove_cap(p
.first
, &p
.second
);
3464 realm
->add_cap(p
.first
, &p
.second
);
3466 item_caps
.remove_myself();
3467 realm
->inodes_with_caps
.push_back(&item_caps
);
3468 containing_realm
= realm
;
3471 Capability
*CInode::reconnect_cap(client_t client
, const cap_reconnect_t
& icr
, Session
*session
)
3473 Capability
*cap
= get_client_cap(client
);
3476 cap
->merge(icr
.capinfo
.wanted
, icr
.capinfo
.issued
);
3478 cap
= add_client_cap(client
, session
);
3479 cap
->set_cap_id(icr
.capinfo
.cap_id
);
3480 cap
->set_wanted(icr
.capinfo
.wanted
);
3481 cap
->issue_norevoke(icr
.capinfo
.issued
);
3484 cap
->set_last_issue_stamp(ceph_clock_now());
3488 void CInode::clear_client_caps_after_export()
3490 while (!client_caps
.empty())
3491 remove_client_cap(client_caps
.begin()->first
);
3493 want_loner_cap
= -1;
3494 if (!get_mds_caps_wanted().empty()) {
3495 mempool::mds_co::compact_map
<int32_t,int32_t> empty
;
3496 set_mds_caps_wanted(empty
);
3500 void CInode::export_client_caps(map
<client_t
,Capability::Export
>& cl
)
3502 for (const auto &p
: client_caps
) {
3503 cl
[p
.first
] = p
.second
.make_export();
3508 int CInode::get_caps_liked() const
3511 return CEPH_CAP_PIN
| CEPH_CAP_ANY_EXCL
| CEPH_CAP_ANY_SHARED
; // but not, say, FILE_RD|WR|WRBUFFER
3513 return CEPH_CAP_ANY
& ~CEPH_CAP_FILE_LAZYIO
;
3516 int CInode::get_caps_allowed_ever() const
3520 allowed
= CEPH_CAP_PIN
| CEPH_CAP_ANY_EXCL
| CEPH_CAP_ANY_SHARED
;
3522 allowed
= CEPH_CAP_ANY
;
3525 (filelock
.gcaps_allowed_ever() << filelock
.get_cap_shift()) |
3526 (authlock
.gcaps_allowed_ever() << authlock
.get_cap_shift()) |
3527 (xattrlock
.gcaps_allowed_ever() << xattrlock
.get_cap_shift()) |
3528 (linklock
.gcaps_allowed_ever() << linklock
.get_cap_shift()));
3531 int CInode::get_caps_allowed_by_type(int type
) const
3535 (filelock
.gcaps_allowed(type
) << filelock
.get_cap_shift()) |
3536 (authlock
.gcaps_allowed(type
) << authlock
.get_cap_shift()) |
3537 (xattrlock
.gcaps_allowed(type
) << xattrlock
.get_cap_shift()) |
3538 (linklock
.gcaps_allowed(type
) << linklock
.get_cap_shift());
3541 int CInode::get_caps_careful() const
3544 (filelock
.gcaps_careful() << filelock
.get_cap_shift()) |
3545 (authlock
.gcaps_careful() << authlock
.get_cap_shift()) |
3546 (xattrlock
.gcaps_careful() << xattrlock
.get_cap_shift()) |
3547 (linklock
.gcaps_careful() << linklock
.get_cap_shift());
3550 int CInode::get_xlocker_mask(client_t client
) const
3553 (filelock
.gcaps_xlocker_mask(client
) << filelock
.get_cap_shift()) |
3554 (authlock
.gcaps_xlocker_mask(client
) << authlock
.get_cap_shift()) |
3555 (xattrlock
.gcaps_xlocker_mask(client
) << xattrlock
.get_cap_shift()) |
3556 (linklock
.gcaps_xlocker_mask(client
) << linklock
.get_cap_shift());
3559 int CInode::get_caps_allowed_for_client(Session
*session
, Capability
*cap
,
3560 const mempool_inode
*file_i
) const
3562 client_t client
= session
->get_client();
3564 if (client
== get_loner()) {
3565 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3567 get_caps_allowed_by_type(CAP_LONER
) |
3568 (get_caps_allowed_by_type(CAP_XLOCKER
) & get_xlocker_mask(client
));
3570 allowed
= get_caps_allowed_by_type(CAP_ANY
);
3574 allowed
&= ~CEPH_CAP_ANY_DIR_OPS
;
3575 if (cap
&& (allowed
& CEPH_CAP_FILE_EXCL
))
3576 allowed
|= cap
->get_lock_cache_allowed();
3578 if (file_i
->inline_data
.version
== CEPH_INLINE_NONE
&&
3579 file_i
->layout
.pool_ns
.empty()) {
3582 if ((file_i
->inline_data
.version
!= CEPH_INLINE_NONE
&&
3583 cap
->is_noinline()) ||
3584 (!file_i
->layout
.pool_ns
.empty() &&
3585 cap
->is_nopoolns()))
3586 allowed
&= ~(CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
);
3588 auto& conn
= session
->get_connection();
3589 if ((file_i
->inline_data
.version
!= CEPH_INLINE_NONE
&&
3590 !conn
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) ||
3591 (!file_i
->layout
.pool_ns
.empty() &&
3592 !conn
->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
)))
3593 allowed
&= ~(CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
);
3599 // caps issued, wanted
3600 int CInode::get_caps_issued(int *ploner
, int *pother
, int *pxlocker
,
3601 int shift
, int mask
)
3604 int loner
= 0, other
= 0, xlocker
= 0;
3609 for (const auto &p
: client_caps
) {
3610 int i
= p
.second
.issued();
3612 if (p
.first
== loner_cap
)
3616 xlocker
|= get_xlocker_mask(p
.first
) & i
;
3618 if (ploner
) *ploner
= (loner
>> shift
) & mask
;
3619 if (pother
) *pother
= (other
>> shift
) & mask
;
3620 if (pxlocker
) *pxlocker
= (xlocker
>> shift
) & mask
;
3621 return (c
>> shift
) & mask
;
3624 bool CInode::is_any_caps_wanted() const
3626 for (const auto &p
: client_caps
) {
3627 if (p
.second
.wanted())
3633 int CInode::get_caps_wanted(int *ploner
, int *pother
, int shift
, int mask
) const
3636 int loner
= 0, other
= 0;
3637 for (const auto &p
: client_caps
) {
3638 if (!p
.second
.is_stale()) {
3639 int t
= p
.second
.wanted();
3641 if (p
.first
== loner_cap
)
3646 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3649 for (const auto &p
: mds_caps_wanted
) {
3652 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3654 if (ploner
) *ploner
= (loner
>> shift
) & mask
;
3655 if (pother
) *pother
= (other
>> shift
) & mask
;
3656 return (w
>> shift
) & mask
;
3659 bool CInode::issued_caps_need_gather(SimpleLock
*lock
)
3661 int loner_issued
, other_issued
, xlocker_issued
;
3662 get_caps_issued(&loner_issued
, &other_issued
, &xlocker_issued
,
3663 lock
->get_cap_shift(), lock
->get_cap_mask());
3664 if ((loner_issued
& ~lock
->gcaps_allowed(CAP_LONER
)) ||
3665 (other_issued
& ~lock
->gcaps_allowed(CAP_ANY
)) ||
3666 (xlocker_issued
& ~lock
->gcaps_allowed(CAP_XLOCKER
)))
// Adjust the count of "notable" caps by delta 'd', keeping the open file
// table membership in sync: the inode is tracked while it has notable caps
// (unless it is already tracked for being client-writeable).
3671 void CInode::adjust_num_caps_notable(int d
)
// when client-writeable, the open file table already tracks this inode,
// so only toggle membership on the notable-cap count edges otherwise
3673 if (!is_clientwriteable()) {
// 0 -> positive: start tracking
3674 if (!num_caps_notable
&& d
> 0)
3675 mdcache
->open_file_table
.add_inode(this);
// positive -> 0 (d is negative and cancels the count): stop tracking
3676 else if (num_caps_notable
> 0 && num_caps_notable
== -d
)
3677 mdcache
->open_file_table
.remove_inode(this);
3680 num_caps_notable
+=d
;
3681 ceph_assert(num_caps_notable
>= 0);
// Mark the (head) inode as client-writeable and make sure the open file
// table tracks it; snapshotted inodes (last != CEPH_NOSNAP) are skipped.
3684 void CInode::mark_clientwriteable()
3686 if (last
!= CEPH_NOSNAP
)
3688 if (!state_test(STATE_CLIENTWRITEABLE
)) {
// not yet tracked via notable caps: writeability alone makes it tracked
3689 if (num_caps_notable
== 0)
3690 mdcache
->open_file_table
.add_inode(this);
3691 state_set(STATE_CLIENTWRITEABLE
);
3695 void CInode::clear_clientwriteable()
3697 if (state_test(STATE_CLIENTWRITEABLE
)) {
3698 if (num_caps_notable
== 0)
3699 mdcache
->open_file_table
.remove_inode(this);
3700 state_clear(STATE_CLIENTWRITEABLE
);
3704 // =============================================
3706 int CInode::encode_inodestat(bufferlist
& bl
, Session
*session
,
3707 SnapRealm
*dir_realm
,
3712 client_t client
= session
->get_client();
3713 ceph_assert(snapid
);
3718 const mempool_inode
*oi
= get_inode().get();
3719 const mempool_inode
*pi
= get_projected_inode().get();
3721 const mempool_xattr_map
*pxattrs
= nullptr;
3723 if (snapid
!= CEPH_NOSNAP
) {
3725 // for now at least, old_inodes is only defined/valid on the auth
3729 if (is_any_old_inodes()) {
3730 auto it
= old_inodes
->lower_bound(snapid
);
3731 if (it
!= old_inodes
->end()) {
3732 if (it
->second
.first
> snapid
) {
3733 if (it
!= old_inodes
->begin())
3736 if (it
->second
.first
<= snapid
&& snapid
<= it
->first
) {
3737 dout(15) << __func__
<< " snapid " << snapid
3738 << " to old_inode [" << it
->second
.first
<< "," << it
->first
<< "]"
3739 << " " << it
->second
.inode
.rstat
3741 pi
= oi
= &it
->second
.inode
;
3742 pxattrs
= &it
->second
.xattrs
;
3744 // snapshoted remote dentry can result this
3745 dout(0) << __func__
<< " old_inode for snapid " << snapid
3746 << " not found" << dendl
;
3749 } else if (snapid
< first
|| snapid
> last
) {
3750 // snapshoted remote dentry can result this
3751 dout(0) << __func__
<< " [" << first
<< "," << last
<< "]"
3752 << " not match snapid " << snapid
<< dendl
;
3757 std::map
<std::string
, std::string
> snap_metadata
;
3758 SnapRealm
*realm
= find_snaprealm();
3759 if (snapid
!= CEPH_NOSNAP
&& realm
) {
3760 // add snapshot timestamp vxattr
3761 map
<snapid_t
,const SnapInfo
*> infomap
;
3762 realm
->get_snap_info(infomap
,
3765 if (!infomap
.empty()) {
3766 ceph_assert(infomap
.size() == 1);
3767 const SnapInfo
*si
= infomap
.begin()->second
;
3768 snap_btime
= si
->stamp
;
3769 snap_metadata
= si
->metadata
;
3774 bool no_caps
= !valid
||
3775 session
->is_stale() ||
3776 (dir_realm
&& realm
!= dir_realm
) ||
3778 state_test(CInode::STATE_EXPORTINGCAPS
);
3780 dout(20) << __func__
<< " no caps"
3781 << (!valid
?", !valid":"")
3782 << (session
->is_stale()?", session stale ":"")
3783 << ((dir_realm
&& realm
!= dir_realm
)?", snaprealm differs ":"")
3784 << (is_frozen()?", frozen inode":"")
3785 << (state_test(CInode::STATE_EXPORTINGCAPS
)?", exporting caps":"")
3789 // "fake" a version that is odd (stable) version, +1 if projected.
3790 version_t version
= (oi
->version
* 2) + is_projected();
3792 Capability
*cap
= get_client_cap(client
);
3793 bool pfile
= filelock
.is_xlocked_by_client(client
) || get_loner() == client
;
3794 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3795 bool pauth
= authlock
.is_xlocked_by_client(client
) || get_loner() == client
;
3796 bool plink
= linklock
.is_xlocked_by_client(client
) || get_loner() == client
;
3797 bool pxattr
= xattrlock
.is_xlocked_by_client(client
) || get_loner() == client
;
3799 bool plocal
= versionlock
.get_last_wrlock_client() == client
;
3800 bool ppolicy
= policylock
.is_xlocked_by_client(client
) || get_loner()==client
;
3802 const mempool_inode
*any_i
= (pfile
|pauth
|plink
|pxattr
|plocal
) ? pi
: oi
;
3804 dout(20) << " pfile " << pfile
<< " pauth " << pauth
3805 << " plink " << plink
<< " pxattr " << pxattr
3806 << " plocal " << plocal
3807 << " mtime " << any_i
->mtime
3808 << " ctime " << any_i
->ctime
3809 << " change_attr " << any_i
->change_attr
3810 << " valid=" << valid
<< dendl
;
3813 const mempool_inode
*file_i
= pfile
? pi
:oi
;
3814 file_layout_t layout
;
3816 layout
= (ppolicy
? pi
: oi
)->layout
;
3818 layout
= file_i
->layout
;
3821 // max_size is min of projected, actual
3823 std::min(oi
->get_client_range(client
),
3824 pi
->get_client_range(client
));
3827 version_t inline_version
= 0;
3828 bufferlist inline_data
;
3829 if (file_i
->inline_data
.version
== CEPH_INLINE_NONE
) {
3830 inline_version
= CEPH_INLINE_NONE
;
3831 } else if ((!cap
&& !no_caps
) ||
3832 (cap
&& cap
->client_inline_version
< file_i
->inline_data
.version
) ||
3833 (getattr_caps
& CEPH_CAP_FILE_RD
)) { // client requests inline data
3834 inline_version
= file_i
->inline_data
.version
;
3835 if (file_i
->inline_data
.length() > 0)
3836 file_i
->inline_data
.get_data(inline_data
);
3839 // nest (do same as file... :/)
3841 cap
->last_rbytes
= file_i
->rstat
.rbytes
;
3842 cap
->last_rsize
= file_i
->rstat
.rsize();
3846 const mempool_inode
*auth_i
= pauth
? pi
:oi
;
3849 const mempool_inode
*link_i
= plink
? pi
:oi
;
3852 const mempool_inode
*xattr_i
= pxattr
? pi
:oi
;
3856 version_t xattr_version
;
3857 if ((!cap
&& !no_caps
) ||
3858 (cap
&& cap
->client_xattr_version
< xattr_i
->xattr_version
) ||
3859 (getattr_caps
& CEPH_CAP_XATTR_SHARED
)) { // client requests xattrs
3861 pxattrs
= pxattr
? get_projected_xattrs().get() : get_xattrs().get();
3862 xattr_version
= xattr_i
->xattr_version
;
3870 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap
) +
3871 sizeof(struct ceph_file_layout
) +
3872 sizeof(struct ceph_timespec
) * 3 + 4 + // ctime ~ time_warp_seq
3873 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3874 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec
) + // dirstat.nfiles ~ rstat.rctime
3875 sizeof(__u32
) + sizeof(__u32
) * 2 * dirfragtree
._splits
.size() + // dirfragtree
3876 sizeof(__u32
) + symlink
.length() + // symlink
3877 sizeof(struct ceph_dir_layout
); // dir_layout
3879 if (xattr_version
) {
3880 bytes
+= sizeof(__u32
) + sizeof(__u32
); // xattr buffer len + number entries
3882 for (const auto &p
: *pxattrs
)
3883 bytes
+= sizeof(__u32
) * 2 + p
.first
.length() + p
.second
.length();
3886 bytes
+= sizeof(__u32
); // xattr buffer len
3889 sizeof(version_t
) + sizeof(__u32
) + inline_data
.length() + // inline data
3890 1 + 1 + 8 + 8 + 4 + // quota
3891 4 + layout
.pool_ns
.size() + // pool ns
3892 sizeof(struct ceph_timespec
) + 8; // btime + change_attr
3894 if (bytes
> max_bytes
)
3895 return -CEPHFS_ENOSPC
;
3900 struct ceph_mds_reply_cap ecap
;
3901 if (snapid
!= CEPH_NOSNAP
) {
3903 * snapped inodes (files or dirs) only get read-only caps. always
3904 * issue everything possible, since it is read only.
3906 * if a snapped inode has caps, limit issued caps based on the
3909 * if it is a live inode, limit issued caps based on the lock
3912 * do NOT adjust cap issued state, because the client always
3913 * tracks caps per-snap and the mds does either per-interval or
3916 ecap
.caps
= valid
? get_caps_allowed_by_type(CAP_ANY
) : CEPH_STAT_CAP_INODE
;
3917 if (last
== CEPH_NOSNAP
|| is_any_caps())
3918 ecap
.caps
= ecap
.caps
& get_caps_allowed_for_client(session
, nullptr, file_i
);
3923 if (!no_caps
&& !cap
) {
3925 cap
= add_client_cap(client
, session
, realm
);
3927 choose_ideal_loner();
3931 if (!no_caps
&& cap
) {
3932 int likes
= get_caps_liked();
3933 int allowed
= get_caps_allowed_for_client(session
, cap
, file_i
);
3934 issue
= (cap
->wanted() | likes
) & allowed
;
3935 cap
->issue_norevoke(issue
, true);
3936 issue
= cap
->pending();
3937 dout(10) << "encode_inodestat issuing " << ccap_string(issue
)
3938 << " seq " << cap
->get_last_seq() << dendl
;
3939 } else if (cap
&& cap
->is_new() && !dir_realm
) {
3940 // alway issue new caps to client, otherwise the caps get lost
3941 ceph_assert(cap
->is_stale());
3942 ceph_assert(!cap
->pending());
3943 issue
= CEPH_CAP_PIN
;
3944 cap
->issue_norevoke(issue
, true);
3945 dout(10) << "encode_inodestat issuing " << ccap_string(issue
)
3946 << " seq " << cap
->get_last_seq()
3947 << "(stale&new caps)" << dendl
;
3951 cap
->set_last_issue();
3952 cap
->set_last_issue_stamp(ceph_clock_now());
3954 ecap
.wanted
= cap
->wanted();
3955 ecap
.cap_id
= cap
->get_cap_id();
3956 ecap
.seq
= cap
->get_last_seq();
3957 ecap
.mseq
= cap
->get_mseq();
3958 ecap
.realm
= realm
->inode
->ino();
3968 ecap
.flags
= is_auth() ? CEPH_CAP_FLAG_AUTH
: 0;
3969 dout(10) << "encode_inodestat caps " << ccap_string(ecap
.caps
)
3970 << " seq " << ecap
.seq
<< " mseq " << ecap
.mseq
3971 << " xattrv " << xattr_version
<< dendl
;
3973 if (inline_data
.length() && cap
) {
3974 if ((cap
->pending() | getattr_caps
) & CEPH_CAP_FILE_SHARED
) {
3975 dout(10) << "including inline version " << inline_version
<< dendl
;
3976 cap
->client_inline_version
= inline_version
;
3978 dout(10) << "dropping inline version " << inline_version
<< dendl
;
3980 inline_data
.clear();
3984 // include those xattrs?
3985 if (xattr_version
&& cap
) {
3986 if ((cap
->pending() | getattr_caps
) & CEPH_CAP_XATTR_SHARED
) {
3987 dout(10) << "including xattrs version " << xattr_version
<< dendl
;
3988 cap
->client_xattr_version
= xattr_version
;
3990 dout(10) << "dropping xattrs version " << xattr_version
<< dendl
;
3995 // The end result of encode_xattrs() is equivalent to:
3998 // if (xattr_version) {
4000 // encode(*pxattrs, bl);
4002 // encode((__u32)0, bl);
4007 // But encoding xattrs into the 'xbl' requires a memory allocation.
4008 // The 'bl' should have enough pre-allocated memory in most cases.
4009 // Encoding xattrs directly into it can avoid the extra allocation.
4010 auto encode_xattrs
= [xattr_version
, pxattrs
, &bl
]() {
4012 if (xattr_version
) {
4014 auto filler
= bl
.append_hole(sizeof(xbl_len
));
4015 const auto starting_bl_len
= bl
.length();
4017 encode(*pxattrs
, bl
);
4019 encode((__u32
)0, bl
);
4020 xbl_len
= bl
.length() - starting_bl_len
;
4021 filler
.copy_in(sizeof(xbl_len
), (char *)&xbl_len
);
4023 encode((__u32
)0, bl
);
4028 * note: encoding matches MClientReply::InodeStat
4030 if (session
->info
.has_feature(CEPHFS_FEATURE_REPLY_ENCODING
)) {
4031 ENCODE_START(7, 1, bl
);
4032 encode(oi
->ino
, bl
);
4034 encode(oi
->rdev
, bl
);
4035 encode(version
, bl
);
4036 encode(xattr_version
, bl
);
4039 ceph_file_layout legacy_layout
;
4040 layout
.to_legacy(&legacy_layout
);
4041 encode(legacy_layout
, bl
);
4043 encode(any_i
->ctime
, bl
);
4044 encode(file_i
->mtime
, bl
);
4045 encode(file_i
->atime
, bl
);
4046 encode(file_i
->time_warp_seq
, bl
);
4047 encode(file_i
->size
, bl
);
4048 encode(max_size
, bl
);
4049 encode(file_i
->truncate_size
, bl
);
4050 encode(file_i
->truncate_seq
, bl
);
4051 encode(auth_i
->mode
, bl
);
4052 encode((uint32_t)auth_i
->uid
, bl
);
4053 encode((uint32_t)auth_i
->gid
, bl
);
4054 encode(link_i
->nlink
, bl
);
4055 encode(file_i
->dirstat
.nfiles
, bl
);
4056 encode(file_i
->dirstat
.nsubdirs
, bl
);
4057 encode(file_i
->rstat
.rbytes
, bl
);
4058 encode(file_i
->rstat
.rfiles
, bl
);
4059 encode(file_i
->rstat
.rsubdirs
, bl
);
4060 encode(file_i
->rstat
.rctime
, bl
);
4061 dirfragtree
.encode(bl
);
4062 encode(symlink
, bl
);
4063 encode(file_i
->dir_layout
, bl
);
4065 encode(inline_version
, bl
);
4066 encode(inline_data
, bl
);
4067 const mempool_inode
*policy_i
= ppolicy
? pi
: oi
;
4068 encode(policy_i
->quota
, bl
);
4069 encode(layout
.pool_ns
, bl
);
4070 encode(any_i
->btime
, bl
);
4071 encode(any_i
->change_attr
, bl
);
4072 encode(file_i
->export_pin
, bl
);
4073 encode(snap_btime
, bl
);
4074 encode(file_i
->rstat
.rsnaps
, bl
);
4075 encode(snap_metadata
, bl
);
4076 encode(!file_i
->fscrypt_auth
.empty(), bl
);
4077 encode(file_i
->fscrypt_auth
, bl
);
4078 encode(file_i
->fscrypt_file
, bl
);
4082 ceph_assert(session
->get_connection());
4084 encode(oi
->ino
, bl
);
4086 encode(oi
->rdev
, bl
);
4087 encode(version
, bl
);
4088 encode(xattr_version
, bl
);
4091 ceph_file_layout legacy_layout
;
4092 layout
.to_legacy(&legacy_layout
);
4093 encode(legacy_layout
, bl
);
4095 encode(any_i
->ctime
, bl
);
4096 encode(file_i
->mtime
, bl
);
4097 encode(file_i
->atime
, bl
);
4098 encode(file_i
->time_warp_seq
, bl
);
4099 encode(file_i
->size
, bl
);
4100 encode(max_size
, bl
);
4101 encode(file_i
->truncate_size
, bl
);
4102 encode(file_i
->truncate_seq
, bl
);
4103 encode(auth_i
->mode
, bl
);
4104 encode((uint32_t)auth_i
->uid
, bl
);
4105 encode((uint32_t)auth_i
->gid
, bl
);
4106 encode(link_i
->nlink
, bl
);
4107 encode(file_i
->dirstat
.nfiles
, bl
);
4108 encode(file_i
->dirstat
.nsubdirs
, bl
);
4109 encode(file_i
->rstat
.rbytes
, bl
);
4110 encode(file_i
->rstat
.rfiles
, bl
);
4111 encode(file_i
->rstat
.rsubdirs
, bl
);
4112 encode(file_i
->rstat
.rctime
, bl
);
4113 dirfragtree
.encode(bl
);
4114 encode(symlink
, bl
);
4115 auto& conn
= session
->get_connection();
4116 if (conn
->has_feature(CEPH_FEATURE_DIRLAYOUTHASH
)) {
4117 encode(file_i
->dir_layout
, bl
);
4120 if (conn
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
4121 encode(inline_version
, bl
);
4122 encode(inline_data
, bl
);
4124 if (conn
->has_feature(CEPH_FEATURE_MDS_QUOTA
)) {
4125 const mempool_inode
*policy_i
= ppolicy
? pi
: oi
;
4126 encode(policy_i
->quota
, bl
);
4128 if (conn
->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
)) {
4129 encode(layout
.pool_ns
, bl
);
4131 if (conn
->has_feature(CEPH_FEATURE_FS_BTIME
)) {
4132 encode(any_i
->btime
, bl
);
4133 encode(any_i
->change_attr
, bl
);
4140 void CInode::encode_cap_message(const ref_t
<MClientCaps
> &m
, Capability
*cap
)
4144 client_t client
= cap
->get_client();
4146 bool pfile
= filelock
.is_xlocked_by_client(client
) || (cap
->issued() & CEPH_CAP_FILE_EXCL
);
4147 bool pauth
= authlock
.is_xlocked_by_client(client
);
4148 bool plink
= linklock
.is_xlocked_by_client(client
);
4149 bool pxattr
= xattrlock
.is_xlocked_by_client(client
);
4151 const mempool_inode
*oi
= get_inode().get();
4152 const mempool_inode
*pi
= get_projected_inode().get();
4153 const mempool_inode
*i
= (pfile
|pauth
|plink
|pxattr
) ? pi
: oi
;
4155 dout(20) << __func__
<< " pfile " << pfile
4156 << " pauth " << pauth
<< " plink " << plink
<< " pxattr " << pxattr
4157 << " mtime " << i
->mtime
<< " ctime " << i
->ctime
<< " change_attr " << i
->change_attr
<< dendl
;
4160 m
->set_layout(i
->layout
);
4162 m
->truncate_seq
= i
->truncate_seq
;
4163 m
->truncate_size
= i
->truncate_size
;
4164 m
->fscrypt_file
= i
->fscrypt_file
;
4165 m
->fscrypt_auth
= i
->fscrypt_auth
;
4166 m
->mtime
= i
->mtime
;
4167 m
->atime
= i
->atime
;
4168 m
->ctime
= i
->ctime
;
4169 m
->btime
= i
->btime
;
4170 m
->change_attr
= i
->change_attr
;
4171 m
->time_warp_seq
= i
->time_warp_seq
;
4172 m
->nfiles
= i
->dirstat
.nfiles
;
4173 m
->nsubdirs
= i
->dirstat
.nsubdirs
;
4175 if (cap
->client_inline_version
< i
->inline_data
.version
) {
4176 m
->inline_version
= cap
->client_inline_version
= i
->inline_data
.version
;
4177 if (i
->inline_data
.length() > 0)
4178 i
->inline_data
.get_data(m
->inline_data
);
4180 m
->inline_version
= 0;
4183 // max_size is min of projected, actual.
4184 uint64_t oldms
= oi
->get_client_range(client
);
4185 uint64_t newms
= pi
->get_client_range(client
);
4186 m
->max_size
= std::min(oldms
, newms
);
4189 m
->head
.mode
= i
->mode
;
4190 m
->head
.uid
= i
->uid
;
4191 m
->head
.gid
= i
->gid
;
4194 m
->head
.nlink
= i
->nlink
;
4198 const auto& ix
= pxattr
? get_projected_xattrs() : get_xattrs();
4199 if ((cap
->pending() & CEPH_CAP_XATTR_SHARED
) &&
4200 i
->xattr_version
> cap
->client_xattr_version
) {
4201 dout(10) << " including xattrs v " << i
->xattr_version
<< dendl
;
4203 encode(*ix
, m
->xattrbl
);
4205 encode((__u32
)0, m
->xattrbl
);
4206 m
->head
.xattr_version
= i
->xattr_version
;
4207 cap
->client_xattr_version
= i
->xattr_version
;
// Serialize the inode's persistent "base" state (inode fields, symlink
// target, dirfragtree, old/snapshotted inodes, damage flags) into a
// versioned encode block; mirrored by _decode_base().
// NOTE(review): this listing is missing some original lines (gaps in the
// embedded numbering, e.g. 4214-4216, 4220, 4223+) — the encode sequence
// below is NOT complete; confirm against the upstream file before relying
// on the exact wire order.
4213 void CInode::_encode_base(bufferlist
& bl
, uint64_t features
)
// Opens a versioned encoding envelope (v1, compat v1).
4215 ENCODE_START(1, 1, bl
);
// Encode the current (head) inode, feature-gated for the peer.
4217 encode(*get_inode(), bl
, features
);
4218 encode(symlink
, bl
);
4219 encode(dirfragtree
, bl
);
// Old (snapshotted) inode versions, also feature-gated.
4221 encode_old_inodes(bl
, features
);
4222 encode(damage_flags
, bl
);
// Inverse of _encode_base(): rebuild the inode's base state from a
// bufferlist iterator.
// NOTE(review): lines are missing from this listing (embedded numbering
// jumps 4226→4231, 4233→4238, 4238→4240, 4243→…) — the decode of the
// inode payload and of 'tmp' is not fully visible here; confirm against
// the upstream file.
4226 void CInode::_decode_base(bufferlist::const_iterator
& p
)
// A fresh inode object is allocated, (presumably) decoded into, then
// swapped in wholesale.
4231 auto _inode
= allocate_inode();
4233 reset_inode(std::move(_inode
));
// 'tmp' is decoded above (not visible in this listing) and holds the
// symlink target string.
4238 symlink
= std::string_view(tmp
);
4240 decode(dirfragtree
, p
);
4242 decode_old_inodes(p
);
4243 decode(damage_flags
, p
);
4248 void CInode::_encode_locks_full(bufferlist
& bl
)
4251 encode(authlock
, bl
);
4252 encode(linklock
, bl
);
4253 encode(dirfragtreelock
, bl
);
4254 encode(filelock
, bl
);
4255 encode(xattrlock
, bl
);
4256 encode(snaplock
, bl
);
4257 encode(nestlock
, bl
);
4258 encode(flocklock
, bl
);
4259 encode(policylock
, bl
);
4261 encode(loner_cap
, bl
);
4263 void CInode::_decode_locks_full(bufferlist::const_iterator
& p
)
4266 decode(authlock
, p
);
4267 decode(linklock
, p
);
4268 decode(dirfragtreelock
, p
);
4269 decode(filelock
, p
);
4270 decode(xattrlock
, p
);
4271 decode(snaplock
, p
);
4272 decode(nestlock
, p
);
4273 decode(flocklock
, p
);
4274 decode(policylock
, p
);
4276 decode(loner_cap
, p
);
4277 set_loner_cap(loner_cap
);
4278 want_loner_cap
= loner_cap
; // for now, we'll eval() shortly.
4281 void CInode::_encode_locks_state_for_replica(bufferlist
& bl
, bool need_recover
)
4283 ENCODE_START(1, 1, bl
);
4284 authlock
.encode_state_for_replica(bl
);
4285 linklock
.encode_state_for_replica(bl
);
4286 dirfragtreelock
.encode_state_for_replica(bl
);
4287 filelock
.encode_state_for_replica(bl
);
4288 nestlock
.encode_state_for_replica(bl
);
4289 xattrlock
.encode_state_for_replica(bl
);
4290 snaplock
.encode_state_for_replica(bl
);
4291 flocklock
.encode_state_for_replica(bl
);
4292 policylock
.encode_state_for_replica(bl
);
4293 encode(need_recover
, bl
);
4297 void CInode::_encode_locks_state_for_rejoin(bufferlist
& bl
, int rep
)
4299 authlock
.encode_state_for_replica(bl
);
4300 linklock
.encode_state_for_replica(bl
);
4301 dirfragtreelock
.encode_state_for_rejoin(bl
, rep
);
4302 filelock
.encode_state_for_rejoin(bl
, rep
);
4303 nestlock
.encode_state_for_rejoin(bl
, rep
);
4304 xattrlock
.encode_state_for_replica(bl
);
4305 snaplock
.encode_state_for_replica(bl
);
4306 flocklock
.encode_state_for_replica(bl
);
4307 policylock
.encode_state_for_replica(bl
);
4310 void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator
& p
, bool is_new
)
4313 authlock
.decode_state(p
, is_new
);
4314 linklock
.decode_state(p
, is_new
);
4315 dirfragtreelock
.decode_state(p
, is_new
);
4316 filelock
.decode_state(p
, is_new
);
4317 nestlock
.decode_state(p
, is_new
);
4318 xattrlock
.decode_state(p
, is_new
);
4319 snaplock
.decode_state(p
, is_new
);
4320 flocklock
.decode_state(p
, is_new
);
4321 policylock
.decode_state(p
, is_new
);
4324 decode(need_recover
, p
);
4325 if (need_recover
&& is_new
) {
4326 // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
4327 // and change the object when replaying unsafe requests.
4328 authlock
.mark_need_recover();
4329 linklock
.mark_need_recover();
4330 dirfragtreelock
.mark_need_recover();
4331 filelock
.mark_need_recover();
4332 nestlock
.mark_need_recover();
4333 xattrlock
.mark_need_recover();
4334 snaplock
.mark_need_recover();
4335 flocklock
.mark_need_recover();
4336 policylock
.mark_need_recover();
4340 void CInode::_decode_locks_rejoin(bufferlist::const_iterator
& p
, MDSContext::vec
& waiters
,
4341 list
<SimpleLock
*>& eval_locks
, bool survivor
)
4343 authlock
.decode_state_rejoin(p
, waiters
, survivor
);
4344 linklock
.decode_state_rejoin(p
, waiters
, survivor
);
4345 dirfragtreelock
.decode_state_rejoin(p
, waiters
, survivor
);
4346 filelock
.decode_state_rejoin(p
, waiters
, survivor
);
4347 nestlock
.decode_state_rejoin(p
, waiters
, survivor
);
4348 xattrlock
.decode_state_rejoin(p
, waiters
, survivor
);
4349 snaplock
.decode_state_rejoin(p
, waiters
, survivor
);
4350 flocklock
.decode_state_rejoin(p
, waiters
, survivor
);
4351 policylock
.decode_state_rejoin(p
, waiters
, survivor
);
4353 if (!dirfragtreelock
.is_stable() && !dirfragtreelock
.is_wrlocked())
4354 eval_locks
.push_back(&dirfragtreelock
);
4355 if (!filelock
.is_stable() && !filelock
.is_wrlocked())
4356 eval_locks
.push_back(&filelock
);
4357 if (!nestlock
.is_stable() && !nestlock
.is_wrlocked())
4358 eval_locks
.push_back(&nestlock
);
4364 void CInode::encode_export(bufferlist
& bl
)
4366 ENCODE_START(5, 4, bl
);
4367 _encode_base(bl
, mdcache
->mds
->mdsmap
->get_up_features());
4373 encode(get_replicas(), bl
);
4375 // include scatterlock info for any bounding CDirs
4376 bufferlist bounding
;
4377 if (get_inode()->is_dir())
4378 for (const auto &p
: dirfrags
) {
4379 CDir
*dir
= p
.second
;
4380 if (dir
->state_test(CDir::STATE_EXPORTBOUND
)) {
4381 encode(p
.first
, bounding
);
4382 encode(dir
->get_fnode()->fragstat
, bounding
);
4383 encode(dir
->get_fnode()->accounted_fragstat
, bounding
);
4384 encode(dir
->get_fnode()->rstat
, bounding
);
4385 encode(dir
->get_fnode()->accounted_rstat
, bounding
);
4386 dout(10) << " encoded fragstat/rstat info for " << *dir
<< dendl
;
4389 encode(bounding
, bl
);
4391 _encode_locks_full(bl
);
4393 _encode_file_locks(bl
);
4397 get(PIN_TEMPEXPORTING
);
// Finalize a completed export of this inode: drop all state bits except
// those explicitly kept across export, and release the temporary pin
// taken when the export began (see encode_export's PIN_TEMPEXPORTING get).
// NOTE(review): this listing is missing original lines 4403-4406 and
// 4408-4410 — additional cleanup steps are not visible here; confirm
// against the upstream file.
4400 void CInode::finish_export()
4402 state
&= MASK_STATE_EXPORT_KEPT
;
4407 //dirlock.clear_updated();
4411 put(PIN_TEMPEXPORTING
);
4414 void CInode::decode_import(bufferlist::const_iterator
& p
,
4424 s
&= MASK_STATE_EXPORTED
;
4426 set_ephemeral_pin((s
& STATE_DISTEPHEMERALPIN
),
4427 (s
& STATE_RANDEPHEMERALPIN
));
4428 state_set(STATE_AUTH
| s
);
4435 if (is_dirty_parent()) {
4436 get(PIN_DIRTYPARENT
);
4437 mark_dirty_parent(ls
);
4442 decode(get_replicas(), p
);
4443 if (is_replicated())
4444 get(PIN_REPLICATED
);
4447 // decode fragstat info on bounding cdirs
4448 bufferlist bounding
;
4449 decode(bounding
, p
);
4450 auto q
= bounding
.cbegin();
4454 CDir
*dir
= get_dirfrag(fg
);
4455 ceph_assert(dir
); // we should have all bounds open
4457 // Only take the remote's fragstat/rstat if we are non-auth for
4458 // this dirfrag AND the lock is NOT in a scattered (MIX) state.
4459 // We know lock is stable, and MIX is the only state in which
4460 // the inode auth (who sent us this data) may not have the best
4463 // HMM: Are there cases where dir->is_auth() is an insufficient
4464 // check because the dirfrag is under migration? That implies
4465 // it is frozen (and in a SYNC or LOCK state). FIXME.
4467 auto _fnode
= CDir::allocate_fnode(*dir
->get_fnode());
4468 if (dir
->is_auth() ||
4469 filelock
.get_state() == LOCK_MIX
) {
4470 dout(10) << " skipped fragstat info for " << *dir
<< dendl
;
4475 decode(_fnode
->fragstat
, q
);
4476 decode(_fnode
->accounted_fragstat
, q
);
4477 dout(10) << " took fragstat info for " << *dir
<< dendl
;
4479 if (dir
->is_auth() ||
4480 nestlock
.get_state() == LOCK_MIX
) {
4481 dout(10) << " skipped rstat info for " << *dir
<< dendl
;
4486 decode(_fnode
->rstat
, q
);
4487 decode(_fnode
->accounted_rstat
, q
);
4488 dout(10) << " took rstat info for " << *dir
<< dendl
;
4490 dir
->reset_fnode(std::move(_fnode
));
4493 _decode_locks_full(p
);
4495 _decode_file_locks(p
);
// Emit this inode's stored state to a Formatter (used by asok/ceph tooling):
// symlink target, xattrs, dirfragtree, old (snapshotted) inodes, the oldest
// snap id, and damage flags.
// NOTE(review): this listing is missing original lines (e.g. 4502-4503,
// 4513-4516, 4527, 4529-4530) — guard conditions and close_section calls
// for the xattr/old_inode loops are not visible here; confirm against the
// upstream file.
4501 void InodeStoreBase::dump(Formatter
*f
) const
4504 f
->dump_string("symlink", symlink
);
4506 f
->open_array_section("xattrs");
// One "xattr" object per entry; values are raw bytes, re-wrapped as a
// std::string for dumping.
4508 for (const auto& [key
, val
] : *xattrs
) {
4509 f
->open_object_section("xattr");
4510 f
->dump_string("key", key
);
4511 std::string
v(val
.c_str(), val
.length());
4512 f
->dump_string("val", v
);
4517 f
->open_object_section("dirfragtree");
4518 dirfragtree
.dump(f
);
4519 f
->close_section(); // dirfragtree
4521 f
->open_array_section("old_inodes");
4523 for (const auto &p
: *old_inodes
) {
4524 f
->open_object_section("old_inode");
4525 // The key is the last snapid, the first is in the mempool_old_inode
4526 f
->dump_int("last", p
.first
);
4528 f
->close_section(); // old_inode
4531 f
->close_section(); // old_inodes
4533 f
->dump_unsigned("oldest_snap", oldest_snap
);
4534 f
->dump_unsigned("damage_flags", damage_flags
);
4538 void decode_json_obj(mempool::mds_co::string
& t
, JSONObj
*obj
){
4540 t
= mempool::mds_co::string(std::string_view(obj
->get_data()));
// Rebuild this inode's stored state from JSON (inverse of dump(), as far
// as currently supported — dirfragtree, old_inodes and snap_blob decoding
// are still TODO, see the commented-out lines below).
// NOTE(review): this listing is missing original lines (e.g. 4558 and
// 4560) — the conditional selecting between the two reset_xattrs calls is
// not visible here; confirm against the upstream file.
4543 void InodeStoreBase::decode_json(JSONObj
*obj
)
// Decode into a freshly allocated inode, then install it wholesale.
4546 auto _inode
= allocate_inode();
4547 _inode
->decode_json(obj
);
4548 reset_inode(std::move(_inode
));
4551 JSONDecoder::decode_json("symlink", symlink
, obj
, true);
4552 // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // can't decode it now
// Xattrs decode into a temporary map (via xattrs_cb), which is then
// installed — empty input resets to a null map, otherwise the temporary
// is moved into a newly allocated map.
4556 mempool_xattr_map tmp
;
4557 JSONDecoder::decode_json("xattrs", tmp
, xattrs_cb
, obj
, true);
4559 reset_xattrs(xattr_map_ptr());
4561 reset_xattrs(allocate_xattr_map(std::move(tmp
)));
4563 // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // can't decode old_inodes now
4564 JSONDecoder::decode_json("oldest_snap", oldest_snap
.val
, obj
, true);
4565 JSONDecoder::decode_json("damage_flags", damage_flags
, obj
, true);
4567 //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // can't decode it now
4568 //snap_blob = srnode;
// JSON callback for one xattr entry: read its "key"/"val" strings and
// insert the pair into the xattr map, copying the value into a bufferptr.
// NOTE(review): this listing is missing original lines 4572-4573 and 4575
// — the declarations of the locals k and v are not visible here; confirm
// against the upstream file.
4571 void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map
& c
, JSONObj
*obj
){
4574 JSONDecoder::decode_json("key", k
, obj
, true);
4576 JSONDecoder::decode_json("val", v
, obj
, true);
// Value bytes are deep-copied; the map owns its own buffer.
4577 c
[k
.c_str()] = buffer::copy(v
.c_str(), v
.size());
// JSON callback for one old_inode entry: reads the "last" snapid (the
// map key); decoding the mempool_old_inode payload itself is still TODO.
// NOTE(review): this listing is missing original lines 4581-4582 and
// 4586+ — the declaration of the local s and the map insertion are not
// visible here; confirm against the upstream file. (Name keeps the
// upstream typo "indoes".)
4580 void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map
& c
, JSONObj
*obj
){
4583 JSONDecoder::decode_json("last", s
.val
, obj
, true);
4584 InodeStoreBase::mempool_old_inode i
;
4585 // i.decode_json(obj); // can't decode now, simon
4589 void InodeStore::generate_test_instances(std::list
<InodeStore
*> &ls
)
4591 InodeStore
*populated
= new InodeStore
;
4592 populated
->get_inode()->ino
= 0xdeadbeef;
4593 populated
->symlink
= "rhubarb";
4594 ls
.push_back(populated
);
4597 void InodeStoreBare::generate_test_instances(std::list
<InodeStoreBare
*> &ls
)
4599 InodeStoreBare
*populated
= new InodeStoreBare
;
4600 populated
->get_inode()->ino
= 0xdeadbeef;
4601 populated
->symlink
= "rhubarb";
4602 ls
.push_back(populated
);
4605 void CInode::validate_disk_state(CInode::validated_data
*results
,
4608 class ValidationContinuation
: public MDSContinuation
{
4612 CInode::validated_data
*results
;
4624 ValidationContinuation(CInode
*i
,
4625 CInode::validated_data
*data_r
,
4627 MDSContinuation(i
->mdcache
->mds
->server
),
4632 set_callback(START
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_start
));
4633 set_callback(BACKTRACE
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_backtrace
));
4634 set_callback(INODE
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_inode_disk
));
4635 set_callback(DIRFRAGS
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_dirfrags
));
4638 ~ValidationContinuation() override
{
4641 in
->mdcache
->num_shadow_inodes
--;
4646 * Fetch backtrace and set tag if tag is non-empty
4648 void fetch_backtrace_and_tag(CInode
*in
,
4649 std::string_view tag
, bool is_internal
,
4650 Context
*fin
, int *bt_r
, bufferlist
*bt
)
4652 const int64_t pool
= in
->get_backtrace_pool();
4653 object_t oid
= CInode::get_object_name(in
->ino(), frag_t(), "");
4655 ObjectOperation fetch
;
4656 fetch
.getxattr("parent", bt
, bt_r
);
4657 in
->mdcache
->mds
->objecter
->read(oid
, object_locator_t(pool
), fetch
, CEPH_NOSNAP
,
4659 if (in
->mdcache
->mds
->logger
) {
4660 in
->mdcache
->mds
->logger
->inc(l_mds_openino_backtrace_fetch
);
4661 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_backtrace_fetch
);
4666 ObjectOperation scrub_tag
;
4668 encode(tag
, tag_bl
);
4669 scrub_tag
.setxattr("scrub_tag", tag_bl
);
4671 in
->mdcache
->mds
->objecter
->mutate(oid
, object_locator_t(pool
), scrub_tag
, snapc
,
4672 ceph::real_clock::now(),
4674 if (in
->mdcache
->mds
->logger
)
4675 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_set_tag
);
4679 bool _start(int rval
) {
4680 ceph_assert(in
->can_auth_pin());
4683 if (in
->is_dirty()) {
4684 MDCache
*mdcache
= in
->mdcache
; // For the benefit of dout
4685 auto ino
= [this]() { return in
->ino(); }; // For the benefit of dout
4686 dout(20) << "validating a dirty CInode; results will be inconclusive"
4690 C_OnFinisher
*conf
= new C_OnFinisher(get_io_callback(BACKTRACE
),
4691 in
->mdcache
->mds
->finisher
);
4693 std::string_view tag
= in
->scrub_infop
->header
->get_tag();
4694 bool is_internal
= in
->scrub_infop
->header
->is_internal_tag();
4695 // Rather than using the usual CInode::fetch_backtrace,
4696 // use a special variant that optionally writes a tag in the same
4698 fetch_backtrace_and_tag(in
, tag
, is_internal
, conf
, &results
->backtrace
.ondisk_read_retval
, &bl
);
4702 bool _backtrace(int rval
) {
4703 // set up basic result reporting and make sure we got the data
4704 results
->performed_validation
= true; // at least, some of it!
4705 results
->backtrace
.checked
= true;
4707 const int64_t pool
= in
->get_backtrace_pool();
4708 inode_backtrace_t
& memory_backtrace
= results
->backtrace
.memory_value
;
4709 in
->build_backtrace(pool
, memory_backtrace
);
4710 bool equivalent
, divergent
;
4713 MDCache
*mdcache
= in
->mdcache
; // For the benefit of dout
4714 auto ino
= [this]() { return in
->ino(); }; // For the benefit of dout
4716 // Ignore rval because it's the result of a FAILOK operation
4717 // from fetch_backtrace_and_tag: the real result is in
4718 // backtrace.ondisk_read_retval
4719 dout(20) << "ondisk_read_retval: " << results
->backtrace
.ondisk_read_retval
<< dendl
;
4720 if (results
->backtrace
.ondisk_read_retval
!= 0) {
4721 results
->backtrace
.error_str
<< "failed to read off disk; see retval";
4722 // we probably have a new unwritten file!
4723 // so skip the backtrace scrub for this entry and say that all's well
4724 if (in
->is_mdsdir()){
4725 dout(20) << "forcing backtrace as passed since mdsdir actually doesn't have backtrace" << dendl
;
4726 results
->backtrace
.passed
= true;
4728 if (in
->is_dirty_parent()) {
4729 dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl
;
4730 results
->backtrace
.passed
= true;
4735 // extract the backtrace, and compare it to a newly-constructed one
4737 auto p
= bl
.cbegin();
4739 decode(results
->backtrace
.ondisk_value
, p
);
4740 dout(10) << "decoded " << bl
.length() << " bytes of backtrace successfully" << dendl
;
4741 } catch (buffer::error
&) {
4742 if (results
->backtrace
.ondisk_read_retval
== 0 && rval
!= 0) {
4743 // Cases where something has clearly gone wrong with the overall
4744 // fetch op, though we didn't get a nonzero rc from the getxattr
4745 // operation. e.g. object missing.
4746 results
->backtrace
.ondisk_read_retval
= rval
;
4748 results
->backtrace
.error_str
<< "failed to decode on-disk backtrace ("
4749 << bl
.length() << " bytes)!";
4750 // we probably have a new unwritten file!
4751 // so skip the backtrace scrub for this entry and say that all's well
4752 if (in
->is_dirty_parent()) {
4753 dout(20) << "decode failed; forcing backtrace as passed since "
4754 "inode is dirty parent" << dendl
;
4755 results
->backtrace
.passed
= true;
4761 memory_newer
= memory_backtrace
.compare(results
->backtrace
.ondisk_value
,
4762 &equivalent
, &divergent
);
4764 if (divergent
|| memory_newer
< 0) {
4765 // we're divergent, or on-disk version is newer
4766 results
->backtrace
.error_str
<< "On-disk backtrace is divergent or newer";
4767 /* if the backtraces are divergent and the link count is 0, then
4768 * most likely its a stray entry that's being purged and things are
4769 * well and there's no reason for alarm
4771 if (divergent
&& (in
->is_dirty_parent() || in
->get_inode()->nlink
== 0)) {
4772 results
->backtrace
.passed
= true;
4773 dout(20) << "divergent backtraces are acceptable when dn "
4774 "is being purged or has been renamed or moved to a "
4775 "different directory " << *in
<< dendl
;
4778 results
->backtrace
.passed
= true;
4782 if (!results
->backtrace
.passed
&& in
->scrub_infop
->header
->get_repair()) {
4784 in
->make_path_string(path
);
4785 in
->mdcache
->mds
->clog
->warn() << "bad backtrace on inode " << in
->ino()
4786 << "(" << path
<< "), rewriting it";
4787 in
->mark_dirty_parent(in
->mdcache
->mds
->mdlog
->get_current_segment(),
4789 // Flag that we repaired this BT so that it won't go into damagetable
4790 results
->backtrace
.repaired
= true;
4791 if (in
->mdcache
->mds
->logger
)
4792 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_backtrace_repaired
);
4795 // If the inode's number was free in the InoTable, fix that
4798 InoTable
*inotable
= mdcache
->mds
->inotable
;
4800 dout(10) << "scrub: inotable ino = " << in
->ino() << dendl
;
4801 dout(10) << "scrub: inotable free says "
4802 << inotable
->is_marked_free(in
->ino()) << dendl
;
4804 if (inotable
->is_marked_free(in
->ino())) {
4805 LogChannelRef clog
= in
->mdcache
->mds
->clog
;
4806 clog
->error() << "scrub: inode wrongly marked free: " << in
->ino();
4808 if (in
->scrub_infop
->header
->get_repair()) {
4809 bool repaired
= inotable
->repair(in
->ino());
4811 clog
->error() << "inode table repaired for inode: " << in
->ino();
4814 if (in
->mdcache
->mds
->logger
)
4815 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_inotable_repaired
);
4817 clog
->error() << "Cannot repair inotable while other operations"
4826 if (in
->mdcache
->mds
->logger
)
4827 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_dir_inodes
);
4828 return validate_directory_data();
4830 if (in
->mdcache
->mds
->logger
)
4831 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_file_inodes
);
4832 // TODO: validate on-disk inode for normal files
4837 bool validate_directory_data() {
4838 ceph_assert(in
->is_dir());
4840 if (in
->is_base()) {
4842 shadow_in
= new CInode(in
->mdcache
);
4843 in
->mdcache
->create_unlinked_system_inode(shadow_in
, in
->ino(), in
->get_inode()->mode
);
4844 in
->mdcache
->num_shadow_inodes
++;
4846 shadow_in
->fetch(get_internal_callback(INODE
));
4847 if (in
->mdcache
->mds
->logger
)
4848 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_dir_base_inodes
);
4851 // TODO: validate on-disk inode for non-base directories
4852 if (in
->mdcache
->mds
->logger
)
4853 in
->mdcache
->mds
->logger
->inc(l_mds_scrub_dirfrag_rstats
);
4854 results
->inode
.passed
= true;
4855 return check_dirfrag_rstats();
4859 bool _inode_disk(int rval
) {
4860 const auto& si
= shadow_in
->get_inode();
4861 const auto& i
= in
->get_inode();
4863 results
->inode
.checked
= true;
4864 results
->inode
.ondisk_read_retval
= rval
;
4865 results
->inode
.ondisk_value
= *si
;
4866 results
->inode
.memory_value
= *i
;
4868 if (si
->version
> i
->version
) {
4870 results
->inode
.error_str
<< "On-disk inode is newer than in-memory one; ";
4873 bool divergent
= false;
4874 int r
= i
->compare(*si
, &divergent
);
4875 results
->inode
.passed
= !divergent
&& r
>= 0;
4876 if (!results
->inode
.passed
) {
4877 results
->inode
.error_str
<<
4878 "On-disk inode is divergent or newer than in-memory one; ";
4883 return check_dirfrag_rstats();
4886 bool check_dirfrag_rstats() {
4887 if (in
->has_subtree_root_dirfrag()) {
4888 in
->mdcache
->rdlock_dirfrags_stats(in
, get_internal_callback(DIRFRAGS
));
4891 return immediate(DIRFRAGS
, 0);
  // Continuation stage for DIRFRAGS: recompute dirstat/rstat from the
  // cached dirfrags' accounted stats and compare the sums with the values
  // stored on the inode.  rval is the return code of the preceding
  // dirfrag-stats step.  May trigger repair_inode_stats() when the scrub
  // header requests repair.
  bool _dirfrags(int rval) {
    // basic reporting setup
    results->raw_stats.checked = true;
    results->raw_stats.ondisk_read_retval = rval;

    results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat;
    results->raw_stats.memory_value.rstat = in->get_inode()->rstat;
    // accumulate the freshly-computed sums directly into the result slots
    frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
    nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;

    // pre-set an error message; it only matters if a check below fails
    results->raw_stats.error_str << "Failed to read dirfrags off disk";

    // check each dirfrag...
    for (const auto &p : in->dirfrags) {
      CDir *dir = p.second;
      ceph_assert(dir->get_version() > 0);
      nest_info.add(dir->get_fnode()->accounted_rstat);
      dir_info.add(dir->get_fnode()->accounted_fragstat);

    nest_info.rsubdirs++; // it gets one to account for self
    if (const sr_t *srnode = in->get_projected_srnode(); srnode)
      nest_info.rsnaps += srnode->snaps.size();

    // ...and that their sum matches our inode settings
    if (!dir_info.same_sums(in->get_inode()->dirstat) ||
        !nest_info.same_sums(in->get_inode()->rstat)) {
      if (in->scrub_infop->header->get_repair()) {
        results->raw_stats.error_str
          << "freshly-calculated rstats don't match existing ones (will be fixed)";
        in->mdcache->repair_inode_stats(in);
        results->raw_stats.repaired = true;
        results->raw_stats.error_str
          << "freshly-calculated rstats don't match existing ones";
      if (in->is_dirty()) {
        // a dirty inode's stats are expected to be in flux, so don't
        // fail the scrub on it
        MDCache *mdcache = in->mdcache; // for dout()
        auto ino = [this]() { return in->ino(); }; // for dout()
        dout(20) << "raw stats most likely wont match since inode is dirty; "
                    "please rerun scrub when system is stable; "
                    "assuming passed for now;" << dendl;
        results->raw_stats.passed = true;
      results->raw_stats.passed = true;
      MDCache *mdcache = in->mdcache; // for dout()
      auto ino = [this]() { return in->ino(); }; // for dout()
      dout(20) << "raw stats check passed on " << *in << dendl;
  // Final continuation stage: fold the per-section verdicts into the
  // overall passed_validation flag, propagate any repair to the scrub
  // header, complete the caller-supplied finisher context with the
  // continuation's result, and drop our auth pin.
  void _done() override {
    // an unchecked section does not count against the overall result
    if ((!results->raw_stats.checked || results->raw_stats.passed) &&
        (!results->backtrace.checked || results->backtrace.passed) &&
        (!results->inode.checked || results->inode.passed))
      results->passed_validation = true;

    // Flag that we did some repair work so that our repair operation
    // can be flushed at end of scrub
    if (results->backtrace.repaired ||
        results->inode.repaired ||
        results->raw_stats.repaired)
      in->scrub_infop->header->set_repaired();

    fin->complete(get_rval());

    in->auth_unpin(this);
  dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
  // Drive the multi-stage validation via a continuation object; it
  // self-schedules through the backtrace/inode/dirfrag stages above.
  ValidationContinuation *vc = new ValidationContinuation(this,
// Dump the scrub validation results (per-section checked/passed verdicts
// plus an overall return code) to the given Formatter.
void CInode::validated_data::dump(Formatter *f) const
  f->open_object_section("results");

  f->dump_bool("performed_validation", performed_validation);
  f->dump_bool("passed_validation", passed_validation);

  f->open_object_section("backtrace");
  f->dump_bool("checked", backtrace.checked);
  f->dump_bool("passed", backtrace.passed);
  f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
  f->dump_stream("ondisk_value") << backtrace.ondisk_value;
  f->dump_stream("memoryvalue") << backtrace.memory_value;
  f->dump_string("error_str", backtrace.error_str.str());
  f->close_section(); // backtrace

  f->open_object_section("raw_stats");
  f->dump_bool("checked", raw_stats.checked);
  f->dump_bool("passed", raw_stats.passed);
  f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
  f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
  f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
  f->dump_stream("memory_value.dirstat") << raw_stats.memory_value.dirstat;
  f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
  f->dump_string("error_str", raw_stats.error_str.str());
  f->close_section(); // raw_stats

  // dump failure return code
  // surface a nonzero read return value as the overall rc; later
  // sections take precedence when several are nonzero
  if (backtrace.checked && backtrace.ondisk_read_retval)
    rc = backtrace.ondisk_read_retval;
  if (inode.checked && inode.ondisk_read_retval)
    rc = inode.ondisk_read_retval;
  if (raw_stats.checked && raw_stats.ondisk_read_retval)
    rc = raw_stats.ondisk_read_retval;
  f->dump_int("return_code", rc);

  f->close_section(); // results
5023 bool CInode::validated_data::all_damage_repaired() const
5026 (raw_stats
.checked
&& !raw_stats
.passed
&& !raw_stats
.repaired
)
5028 (backtrace
.checked
&& !backtrace
.passed
&& !backtrace
.repaired
)
5030 (inode
.checked
&& !inode
.passed
&& !inode
.repaired
);
// Dump this inode to the Formatter; `flags` selects which sections are
// emitted (path, inode store, cache object, locks, state bits, client
// capabilities, dirfrags).
void CInode::dump(Formatter *f, int flags) const
  if (flags & DUMP_PATH) {
    make_path_string(path, true);
    f->dump_string("path", path);

  if (flags & DUMP_INODE_STORE_BASE)
    InodeStoreBase::dump(f);

  if (flags & DUMP_MDS_CACHE_OBJECT)
    MDSCacheObject::dump(f);

  if (flags & DUMP_LOCKS) {
    // one subsection per lock type
    f->open_object_section("versionlock");
    versionlock.dump(f);

    f->open_object_section("authlock");

    f->open_object_section("linklock");

    f->open_object_section("dirfragtreelock");
    dirfragtreelock.dump(f);

    f->open_object_section("filelock");

    f->open_object_section("xattrlock");

    f->open_object_section("snaplock");

    f->open_object_section("nestlock");

    f->open_object_section("flocklock");

    f->open_object_section("policylock");

  if (flags & DUMP_STATE) {
    // emit one "state" entry per set state bit
    f->open_array_section("states");
    MDSCacheObject::dump_states(f);
    if (state_test(STATE_EXPORTING))
      f->dump_string("state", "exporting");
    if (state_test(STATE_OPENINGDIR))
      f->dump_string("state", "openingdir");
    if (state_test(STATE_FREEZING))
      f->dump_string("state", "freezing");
    if (state_test(STATE_FROZEN))
      f->dump_string("state", "frozen");
    if (state_test(STATE_AMBIGUOUSAUTH))
      f->dump_string("state", "ambiguousauth");
    if (state_test(STATE_EXPORTINGCAPS))
      f->dump_string("state", "exportingcaps");
    if (state_test(STATE_NEEDSRECOVER))
      f->dump_string("state", "needsrecover");
    if (state_test(STATE_PURGING))
      f->dump_string("state", "purging");
    if (state_test(STATE_DIRTYPARENT))
      f->dump_string("state", "dirtyparent");
    if (state_test(STATE_DIRTYRSTAT))
      f->dump_string("state", "dirtyrstat");
    if (state_test(STATE_STRAYPINNED))
      f->dump_string("state", "straypinned");
    if (state_test(STATE_FROZENAUTHPIN))
      f->dump_string("state", "frozenauthpin");
    if (state_test(STATE_DIRTYPOOL))
      f->dump_string("state", "dirtypool");
    if (state_test(STATE_ORPHAN))
      f->dump_string("state", "orphan");
    if (state_test(STATE_MISSINGOBJS))
      f->dump_string("state", "missingobjs");

  if (flags & DUMP_CAPS) {
    f->open_array_section("client_caps");
    for (const auto &p : client_caps) {
      auto &client = p.first;
      auto cap = &p.second;
      f->open_object_section("client_cap");
      f->dump_int("client_id", client.v);
      f->dump_string("pending", ccap_string(cap->pending()));
      f->dump_string("issued", ccap_string(cap->issued()));
      f->dump_string("wanted", ccap_string(cap->wanted()));
      f->dump_int("last_sent", cap->get_last_seq());

    f->dump_int("loner", loner_cap.v);
    f->dump_int("want_loner", want_loner_cap.v);

    f->open_array_section("mds_caps_wanted");
    for (const auto &p : mds_caps_wanted) {
      f->open_object_section("mds_cap_wanted");
      f->dump_int("rank", p.first);
      f->dump_string("cap", ccap_string(p.second));

  if (flags & DUMP_DIRFRAGS) {
    f->open_array_section("dirfrags");
    auto&& dfs = get_dirfrags();
    for(const auto &dir : dfs) {
      f->open_object_section("dir");
      dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
      dir->check_rstats();
5170 /****** Scrub Stuff *****/
5171 void CInode::scrub_info_create() const
5173 dout(25) << __func__
<< dendl
;
5174 ceph_assert(!scrub_infop
);
5176 // break out of const-land to set up implicit initial state
5177 CInode
*me
= const_cast<CInode
*>(this);
5178 const auto& pi
= me
->get_projected_inode();
5180 std::unique_ptr
<scrub_info_t
> si(new scrub_info_t());
5181 si
->last_scrub_stamp
= pi
->last_scrub_stamp
;
5182 si
->last_scrub_version
= pi
->last_scrub_version
;
5184 me
->scrub_infop
.swap(si
);
5187 void CInode::scrub_maybe_delete_info()
5190 !scrub_infop
->scrub_in_progress
&&
5191 !scrub_infop
->last_scrub_dirty
) {
5192 scrub_infop
.reset();
5196 void CInode::scrub_initialize(ScrubHeaderRef
& header
)
5198 dout(20) << __func__
<< " with scrub_version " << get_version() << dendl
;
5201 scrub_infop
->scrub_in_progress
= true;
5202 scrub_infop
->queued_frags
.clear();
5203 scrub_infop
->header
= header
;
5204 header
->inc_num_pending();
5205 // right now we don't handle remote inodes
5208 void CInode::scrub_aborted() {
5209 dout(20) << __func__
<< dendl
;
5210 ceph_assert(scrub_is_in_progress());
5212 scrub_infop
->scrub_in_progress
= false;
5213 scrub_infop
->header
->dec_num_pending();
5214 scrub_maybe_delete_info();
5217 void CInode::scrub_finished() {
5218 dout(20) << __func__
<< dendl
;
5219 ceph_assert(scrub_is_in_progress());
5221 scrub_infop
->last_scrub_version
= get_version();
5222 scrub_infop
->last_scrub_stamp
= ceph_clock_now();
5223 scrub_infop
->last_scrub_dirty
= true;
5224 scrub_infop
->scrub_in_progress
= false;
5225 scrub_infop
->header
->dec_num_pending();
5228 int64_t CInode::get_backtrace_pool() const
5231 return mdcache
->mds
->get_metadata_pool();
5233 // Files are required to have an explicit layout that specifies
5235 ceph_assert(get_inode()->layout
.pool_id
!= -1);
5236 return get_inode()->layout
.pool_id
;
// Resolve the given export-pin policy value to a concrete target rank
// and, if any of our auth dirfrags needs work for it (migration or aux
// subtree bookkeeping), enqueue this inode on the cache-wide
// export_pin_queue.  STATE_QUEUEDEXPORTPIN guards against re-queueing.
void CInode::queue_export_pin(mds_rank_t export_pin)
  if (state_test(CInode::STATE_QUEUEDEXPORTPIN))

  // translate the policy value into a concrete target rank
  if (export_pin >= 0)
    target = export_pin;
  else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
    target = mdcache->hash_into_rank_bucket(ino());
    target = MDS_RANK_NONE;

  unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
  for (auto& p : dirfrags) {
    CDir *dir = p.second;
    if (!dir->is_auth())

    if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
      if (dir->get_frag().bits() < min_frag_bits) {
      // distributed pins hash each dirfrag to its own rank bucket
      target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag());

    if (target != MDS_RANK_NONE) {
      if (dir->is_subtree_root()) {
        // set auxsubtree bit or export it
        if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
            target != dir->get_dir_auth().first)
        // create aux subtree or export it
      // clear aux subtrees ?
      queue = dir->state_test(CDir::STATE_AUXSUBTREE);

  state_set(CInode::STATE_QUEUEDEXPORTPIN);
  mdcache->export_pin_queue.insert(this);
5293 void CInode::maybe_export_pin(bool update
)
5295 if (!g_conf()->mds_bal_export_pin
)
5297 if (!is_dir() || !is_normal())
5300 dout(15) << __func__
<< " update=" << update
<< " " << *this << dendl
;
5302 mds_rank_t export_pin
= get_export_pin(false);
5303 if (export_pin
== MDS_RANK_NONE
&& !update
)
5306 check_pin_policy(export_pin
);
5307 queue_export_pin(export_pin
);
// Record that this inode carries the requested ephemeral pin flavor(s):
// dist => distributed ephemeral pin, rand => random ephemeral pin.  A
// newly ephemerally-pinned inode is registered in the cache-wide
// export_ephemeral_pins set.
void CInode::set_ephemeral_pin(bool dist, bool rand)
    state |= STATE_DISTEPHEMERALPIN;
    state |= STATE_RANDEPHEMERALPIN;

  // only act when at least one requested flag is not already set
  if (state_test(state) != state) {
    dout(10) << "set ephemeral (" << (dist ? "dist" : "")
             << (rand ? " rand" : "") << ") pin on " << *this << dendl;
    if (!is_ephemerally_pinned()) {
      // first ephemeral pin on this inode: start tracking it globally
      auto p = mdcache->export_ephemeral_pins.insert(this);
      ceph_assert(p.second);
// Clear the requested ephemeral pin flag(s) (dist and/or rand).  Once no
// ephemeral pin of either flavor remains, the inode is removed from the
// cache-wide export_ephemeral_pins tracking set.
void CInode::clear_ephemeral_pin(bool dist, bool rand)
    state |= STATE_DISTEPHEMERALPIN;
    state |= STATE_RANDEPHEMERALPIN;

  // only log/untrack if one of the requested flags was actually set
  if (state_test(state)) {
    dout(10) << "clear ephemeral (" << (dist ? "dist" : "")
             << (rand ? " rand" : "") << ") pin on " << *this << dendl;
    if (!is_ephemerally_pinned()) {
      // last ephemeral pin gone: drop from the global tracking set
      auto count = mdcache->export_ephemeral_pins.erase(this);
      ceph_assert(count == 1);
// Decide whether to apply a random ephemeral export pin to this
// directory.  threshold is the pin probability; callers pass a negative
// value to have it looked up via get_ephemeral_rand().  Ineligible
// inodes (config off, non-dir, unlinked) get any random pin cleared;
// already-pinned ones are simply requeued.
void CInode::maybe_ephemeral_rand(double threshold)
  if (!mdcache->get_export_ephemeral_random_config()) {
    dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
    clear_ephemeral_pin(false, true);
  } else if (!is_dir() || !is_normal()) {
    dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl;
    clear_ephemeral_pin(false, true);
  } else if (get_inode()->nlink == 0) {
    dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl;
    clear_ephemeral_pin(false, true);
  } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
    dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
    queue_export_pin(MDS_RANK_EPHEMERAL_RAND);

  /* not precomputed? */
  if (threshold < 0.0) {
    threshold = get_ephemeral_rand();
  if (threshold <= 0.0) {

  // roll the dice: pin iff a uniform [0,1] sample falls at/under threshold
  double n = ceph::util::generate_random_number(0.0, 1.0);

  dout(15) << __func__ << " rand " << n << " <?= " << threshold
           << " " << *this << dendl;

  if (n <= threshold) {
    dout(10) << __func__ << " randomly export pinning " << *this << dendl;
    set_ephemeral_pin(false, true);
    queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
5389 void CInode::setxattr_ephemeral_rand(double probability
)
5391 ceph_assert(is_dir());
5392 _get_projected_inode()->export_ephemeral_random_pin
= probability
;
5395 void CInode::setxattr_ephemeral_dist(bool val
)
5397 ceph_assert(is_dir());
5398 _get_projected_inode()->export_ephemeral_distributed_pin
= val
;
5401 void CInode::set_export_pin(mds_rank_t rank
)
5403 ceph_assert(is_dir());
5404 _get_projected_inode()->export_pin
= rank
;
5405 maybe_export_pin(true);
// Compute the effective export pin for this inode, walking up the parent
// chain because a pin set on an ancestor applies to its whole subtree.
// Returns a concrete rank, one of the MDS_RANK_EPHEMERAL_* sentinels, or
// MDS_RANK_NONE when nothing applies.
mds_rank_t CInode::get_export_pin(bool inherit) const
  if (!g_conf()->mds_bal_export_pin)
    return MDS_RANK_NONE;

  /* An inode that is export pinned may not necessarily be a subtree root, we
   * need to traverse the parents. A base or system inode cannot be pinned.
   * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
   * have a parent yet.
  mds_rank_t r_target = MDS_RANK_NONE;
  const CInode *in = this;
  const CDir *dir = nullptr;

  if (in->is_system())
  const CDentry *pdn = in->get_parent_dn();
  if (in->get_inode()->nlink == 0) {
    // ignore export pin for unlinked directory

  // an explicit pin on the closest ancestor wins outright
  if (in->get_inode()->export_pin >= 0) {
    return in->get_inode()->export_pin;
  } else if (in->get_inode()->export_ephemeral_distributed_pin &&
             mdcache->get_export_ephemeral_distributed_config()) {
    return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
    return MDS_RANK_EPHEMERAL_DIST;
  } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) {
  } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() &&
             mdcache->get_export_ephemeral_random_config()) {
    /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
    return MDS_RANK_EPHEMERAL_RAND;
    r_target = MDS_RANK_EPHEMERAL_RAND;
    r_target = mdcache->hash_into_rank_bucket(in->ino());

  // climb to the parent directory and keep looking
  dir = pdn->get_dir();

  return MDS_RANK_NONE;
5460 void CInode::check_pin_policy(mds_rank_t export_pin
)
5462 if (export_pin
== MDS_RANK_EPHEMERAL_DIST
) {
5463 set_ephemeral_pin(true, false);
5464 clear_ephemeral_pin(false, true);
5465 } else if (export_pin
== MDS_RANK_EPHEMERAL_RAND
) {
5466 set_ephemeral_pin(false, true);
5467 clear_ephemeral_pin(true, false);
5468 } else if (is_ephemerally_pinned()) {
5469 // export_pin >= 0 || export_pin == MDS_RANK_NONE
5470 clear_ephemeral_pin(true, true);
5471 if (export_pin
!= get_inode()->export_pin
) // inherited export_pin
5472 queue_export_pin(MDS_RANK_NONE
);
// Compute the random-ephemeral-pin probability that applies to this
// inode, walking up the parent chain and capping the result at the
// configured export_ephemeral_random_max.
double CInode::get_ephemeral_rand() const
  /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
   * have a parent yet.
  const CInode *in = this;
  double max = mdcache->export_ephemeral_random_max;

  if (in->is_system())
  const CDentry *pdn = in->get_parent_dn();

  // ignore export pin for unlinked directory
  if (in->get_inode()->nlink == 0)

  // closest ancestor with a probability set wins, capped at max
  if (in->get_inode()->export_ephemeral_random_pin > 0.0)
    return std::min(in->get_inode()->export_ephemeral_random_pin, max);

  /* An export_pin overrides only if no closer parent (incl. this one) has a
  if (in->get_inode()->export_pin >= 0 ||
      in->get_inode()->export_ephemeral_distributed_pin)

  // climb to the parent directory's inode and keep looking
  in = pdn->get_dir()->inode;
5508 void CInode::get_nested_dirfrags(std::vector
<CDir
*>& v
) const
5510 for (const auto &p
: dirfrags
) {
5511 const auto& dir
= p
.second
;
5512 if (!dir
->is_subtree_root())
5517 void CInode::get_subtree_dirfrags(std::vector
<CDir
*>& v
) const
5519 for (const auto &p
: dirfrags
) {
5520 const auto& dir
= p
.second
;
5521 if (dir
->is_subtree_root())
5526 MEMPOOL_DEFINE_OBJECT_FACTORY(CInode
, co_inode
, mds_co
);