1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/int_types.h"
16 #include "common/errno.h"
30 #include "events/EUpdate.h"
32 #include "osdc/Objecter.h"
36 #include "LogSegment.h"
38 #include "common/Clock.h"
40 #include "common/config.h"
41 #include "global/global_context.h"
42 #include "include/ceph_assert.h"
44 #include "mds/MDSContinuation.h"
45 #include "mds/InoTable.h"
46 #include "cephfs_features.h"
47 #include "osdc/Objecter.h"
49 #define dout_context g_ceph_context
50 #define dout_subsys ceph_subsys_mds
52 #define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "
56 void CInodeCommitOperation::update(ObjectOperation
&op
, inode_backtrace_t
&bt
) {
59 op
.priority
= priority
;
63 encode(bt
, parent_bl
);
64 op
.setxattr("parent", parent_bl
);
66 // for the old pool there is no need to update the layout and symlink
67 if (!update_layout_symlink
)
71 encode(_layout
, layout_bl
, _features
);
72 op
.setxattr("layout", layout_bl
);
74 if (!_symlink
.empty()) {
75 bufferlist symlink_bl
;
76 encode(_symlink
, symlink_bl
);
77 op
.setxattr("symlink", symlink_bl
);
81 class CInodeIOContext
: public MDSIOContextBase
85 MDSRank
*get_mds() override
{return in
->mdcache
->mds
;}
87 explicit CInodeIOContext(CInode
*in_
) : in(in_
) {
88 ceph_assert(in
!= NULL
);
// Sentinel pointer marking a projected_inode slot whose snaprealm node has
// not been set yet (distinct from a legitimate null srnode).
92 sr_t
* const CInode::projected_inode::UNDEF_SRNODE
= (sr_t
*)(unsigned long)-1;
// Shared per-class lock-type descriptors; every CInode's lock members
// reference one of these static LockType instances.
94 LockType
CInode::versionlock_type(CEPH_LOCK_IVERSION
);
95 LockType
CInode::authlock_type(CEPH_LOCK_IAUTH
);
96 LockType
CInode::linklock_type(CEPH_LOCK_ILINK
);
97 LockType
CInode::dirfragtreelock_type(CEPH_LOCK_IDFT
);
98 LockType
CInode::filelock_type(CEPH_LOCK_IFILE
);
99 LockType
CInode::xattrlock_type(CEPH_LOCK_IXATTR
);
100 LockType
CInode::snaplock_type(CEPH_LOCK_ISNAP
);
101 LockType
CInode::nestlock_type(CEPH_LOCK_INEST
);
102 LockType
CInode::flocklock_type(CEPH_LOCK_IFLOCK
);
103 LockType
CInode::policylock_type(CEPH_LOCK_IPOLICY
);
105 std::string_view
CInode::pin_name(int p
) const
108 case PIN_DIRFRAG
: return "dirfrag";
109 case PIN_CAPS
: return "caps";
110 case PIN_IMPORTING
: return "importing";
111 case PIN_OPENINGDIR
: return "openingdir";
112 case PIN_REMOTEPARENT
: return "remoteparent";
113 case PIN_BATCHOPENJOURNAL
: return "batchopenjournal";
114 case PIN_SCATTERED
: return "scattered";
115 case PIN_STICKYDIRS
: return "stickydirs";
116 //case PIN_PURGING: return "purging";
117 case PIN_FREEZING
: return "freezing";
118 case PIN_FROZEN
: return "frozen";
119 case PIN_IMPORTINGCAPS
: return "importingcaps";
120 case PIN_EXPORTINGCAPS
: return "exportingcaps";
121 case PIN_PASTSNAPPARENT
: return "pastsnapparent";
122 case PIN_OPENINGSNAPPARENTS
: return "openingsnapparents";
123 case PIN_TRUNCATING
: return "truncating";
124 case PIN_STRAY
: return "stray";
125 case PIN_NEEDSNAPFLUSH
: return "needsnapflush";
126 case PIN_DIRTYRSTAT
: return "dirtyrstat";
127 case PIN_DIRTYPARENT
: return "dirtyparent";
128 case PIN_DIRWAITER
: return "dirwaiter";
129 default: return generic_pin_name(p
);
133 //int cinode_pins[CINODE_NUM_PINS]; // counts
134 ostream
& CInode::print_db_line_prefix(ostream
& out
)
136 return out
<< ceph_clock_now() << " mds." << mdcache
->mds
->get_nodeid() << ".cache.ino(" << ino() << ") ";
140 * write caps and lock ids
142 struct cinode_lock_info_t cinode_lock_info
[] = {
143 { CEPH_LOCK_IFILE
, CEPH_CAP_ANY_FILE_WR
},
144 { CEPH_LOCK_IAUTH
, CEPH_CAP_AUTH_EXCL
},
145 { CEPH_LOCK_ILINK
, CEPH_CAP_LINK_EXCL
},
146 { CEPH_LOCK_IXATTR
, CEPH_CAP_XATTR_EXCL
},
148 int num_cinode_locks
= sizeof(cinode_lock_info
) / sizeof(cinode_lock_info
[0]);
150 ostream
& operator<<(ostream
& out
, const CInode
& in
)
153 in
.make_path_string(path
, true);
155 out
<< "[inode " << in
.ino();
157 << (in
.is_multiversion() ? "...":"")
158 << in
.first
<< "," << in
.last
<< "]";
159 out
<< " " << path
<< (in
.is_dir() ? "/":"");
163 if (in
.is_replicated())
164 out
<< in
.get_replicas();
166 mds_authority_t a
= in
.authority();
167 out
<< " rep@" << a
.first
;
168 if (a
.second
!= CDIR_AUTH_UNKNOWN
)
169 out
<< "," << a
.second
;
170 out
<< "." << in
.get_replica_nonce();
174 out
<< " symlink='" << in
.symlink
<< "'";
175 if (in
.is_dir() && !in
.dirfragtree
.empty())
176 out
<< " " << in
.dirfragtree
;
178 out
<< " v" << in
.get_version();
179 if (in
.get_projected_version() > in
.get_version())
180 out
<< " pv" << in
.get_projected_version();
182 if (in
.get_num_auth_pins()) {
183 out
<< " ap=" << in
.get_num_auth_pins();
184 #ifdef MDS_AUTHPIN_SET
185 in
.print_authpin_set(out
);
190 out
<< " snaprealm=" << in
.snaprealm
;
192 if (in
.state_test(CInode::STATE_AMBIGUOUSAUTH
)) out
<< " AMBIGAUTH";
193 if (in
.state_test(CInode::STATE_NEEDSRECOVER
)) out
<< " NEEDSRECOVER";
194 if (in
.state_test(CInode::STATE_RECOVERING
)) out
<< " RECOVERING";
195 if (in
.state_test(CInode::STATE_DIRTYPARENT
)) out
<< " DIRTYPARENT";
196 if (in
.state_test(CInode::STATE_MISSINGOBJS
)) out
<< " MISSINGOBJS";
197 if (in
.is_ephemeral_dist()) out
<< " DISTEPHEMERALPIN";
198 if (in
.is_ephemeral_rand()) out
<< " RANDEPHEMERALPIN";
199 if (in
.is_freezing_inode()) out
<< " FREEZING=" << in
.auth_pin_freeze_allowance
;
200 if (in
.is_frozen_inode()) out
<< " FROZEN";
201 if (in
.is_frozen_auth_pin()) out
<< " FROZEN_AUTHPIN";
203 const auto& pi
= in
.get_projected_inode();
204 if (pi
->is_truncating())
205 out
<< " truncating(" << pi
->truncate_from
<< " to " << pi
->truncate_size
<< ")";
208 out
<< " " << in
.get_inode()->dirstat
;
209 if (g_conf()->mds_debug_scatterstat
&& in
.is_projected()) {
210 out
<< "->" << pi
->dirstat
;
213 out
<< " s=" << in
.get_inode()->size
;
214 if (in
.get_inode()->nlink
!= 1)
215 out
<< " nl=" << in
.get_inode()->nlink
;
219 out
<< " " << in
.get_inode()->rstat
;
220 if (!(in
.get_inode()->rstat
== in
.get_inode()->accounted_rstat
))
221 out
<< "/" << in
.get_inode()->accounted_rstat
;
222 if (g_conf()->mds_debug_scatterstat
&& in
.is_projected()) {
223 out
<< "->" << pi
->rstat
;
224 if (!(pi
->rstat
== pi
->accounted_rstat
))
225 out
<< "/" << pi
->accounted_rstat
;
228 if (in
.is_any_old_inodes()) {
229 out
<< " old_inodes=" << in
.get_old_inodes()->size();
232 if (!in
.client_need_snapflush
.empty())
233 out
<< " need_snapflush=" << in
.client_need_snapflush
;
236 if (!in
.authlock
.is_sync_and_unlocked())
237 out
<< " " << in
.authlock
;
238 if (!in
.linklock
.is_sync_and_unlocked())
239 out
<< " " << in
.linklock
;
240 if (in
.get_inode()->is_dir()) {
241 if (!in
.dirfragtreelock
.is_sync_and_unlocked())
242 out
<< " " << in
.dirfragtreelock
;
243 if (!in
.snaplock
.is_sync_and_unlocked())
244 out
<< " " << in
.snaplock
;
245 if (!in
.nestlock
.is_sync_and_unlocked())
246 out
<< " " << in
.nestlock
;
247 if (!in
.policylock
.is_sync_and_unlocked())
248 out
<< " " << in
.policylock
;
250 if (!in
.flocklock
.is_sync_and_unlocked())
251 out
<< " " << in
.flocklock
;
253 if (!in
.filelock
.is_sync_and_unlocked())
254 out
<< " " << in
.filelock
;
255 if (!in
.xattrlock
.is_sync_and_unlocked())
256 out
<< " " << in
.xattrlock
;
257 if (!in
.versionlock
.is_sync_and_unlocked())
258 out
<< " " << in
.versionlock
;
260 // hack: spit out crap on which clients have caps
261 if (in
.get_inode()->client_ranges
.size())
262 out
<< " cr=" << in
.get_inode()->client_ranges
;
264 if (!in
.get_client_caps().empty()) {
267 for (const auto &p
: in
.get_client_caps()) {
268 if (!first
) out
<< ",";
269 out
<< p
.first
<< "="
270 << ccap_string(p
.second
.pending());
271 if (p
.second
.issued() != p
.second
.pending())
272 out
<< "/" << ccap_string(p
.second
.issued());
273 out
<< "/" << ccap_string(p
.second
.wanted())
274 << "@" << p
.second
.get_last_seq();
278 if (in
.get_loner() >= 0 || in
.get_wanted_loner() >= 0) {
279 out
<< ",l=" << in
.get_loner();
280 if (in
.get_loner() != in
.get_wanted_loner())
281 out
<< "(" << in
.get_wanted_loner() << ")";
284 if (!in
.get_mds_caps_wanted().empty()) {
287 for (const auto &p
: in
.get_mds_caps_wanted()) {
290 out
<< p
.first
<< '=' << ccap_string(p
.second
);
296 if (in
.get_num_ref()) {
298 in
.print_pin_set(out
);
301 if (in
.get_inode()->export_pin
!= MDS_RANK_NONE
) {
302 out
<< " export_pin=" << in
.get_inode()->export_pin
;
304 if (in
.state_test(CInode::STATE_DISTEPHEMERALPIN
)) {
307 if (in
.state_test(CInode::STATE_RANDEPHEMERALPIN
)) {
316 CInode::CInode(MDCache
*c
, bool auth
, snapid_t f
, snapid_t l
) :
317 mdcache(c
), first(f
), last(l
),
320 item_open_file(this),
321 item_dirty_parent(this),
322 item_dirty_dirfrag_dir(this),
323 item_dirty_dirfrag_nest(this),
324 item_dirty_dirfrag_dirfragtree(this),
326 versionlock(this, &versionlock_type
),
327 authlock(this, &authlock_type
),
328 linklock(this, &linklock_type
),
329 dirfragtreelock(this, &dirfragtreelock_type
),
330 filelock(this, &filelock_type
),
331 xattrlock(this, &xattrlock_type
),
332 snaplock(this, &snaplock_type
),
333 nestlock(this, &nestlock_type
),
334 flocklock(this, &flocklock_type
),
335 policylock(this, &policylock_type
)
338 state_set(STATE_AUTH
);
341 void CInode::print(ostream
& out
)
346 void CInode::add_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
)
348 dout(10) << __func__
<< " client." << client
<< " snapid " << snapid
<< " on " << snapin
<< dendl
;
350 if (client_need_snapflush
.empty()) {
351 get(CInode::PIN_NEEDSNAPFLUSH
);
353 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
354 // long periods waiting for clients to flush their snaps.
355 auth_pin(this); // pin head get_inode()->..
358 auto &clients
= client_need_snapflush
[snapid
];
360 snapin
->auth_pin(this); // ...and pin snapped/old inode!
362 clients
.insert(client
);
365 void CInode::remove_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
)
367 dout(10) << __func__
<< " client." << client
<< " snapid " << snapid
<< " on " << snapin
<< dendl
;
368 auto it
= client_need_snapflush
.find(snapid
);
369 if (it
== client_need_snapflush
.end()) {
370 dout(10) << " snapid not found" << dendl
;
373 size_t n
= it
->second
.erase(client
);
375 dout(10) << " client not found" << dendl
;
378 if (it
->second
.empty()) {
379 client_need_snapflush
.erase(it
);
380 snapin
->auth_unpin(this);
382 if (client_need_snapflush
.empty()) {
383 put(CInode::PIN_NEEDSNAPFLUSH
);
// Rebalance the pending-snapflush table when [cowin] is cowed off of [in]:
// snapids that now fall within cowin's [first,last] range are pinned against
// cowin instead of the original inode.  Returns {cowin_need_flush,
// orig_need_flush}, telling the caller which of the two inodes still has
// pending snap flushes.
389 pair
<bool,bool> CInode::split_need_snapflush(CInode
*cowin
, CInode
*in
)
391 dout(10) << __func__
<< " [" << cowin
->first
<< "," << cowin
->last
<< "] for " << *cowin
<< dendl
;
392 bool cowin_need_flush
= false;
393 bool orig_need_flush
= false;
// start at the first snapid that could belong to cowin, and walk every
// snapid strictly below in->first (those no longer belong to `in`)
394 auto it
= client_need_snapflush
.lower_bound(cowin
->first
);
395 while (it
!= client_need_snapflush
.end() && it
->first
< in
->first
) {
396 ceph_assert(!it
->second
.empty());
// snapid is covered by cowin's range: pin cowin for the pending flush
397 if (cowin
->last
>= it
->first
) {
398 cowin
->auth_pin(this);
399 cowin_need_flush
= true;
// NOTE(review): lines between the branch above and the erase below are
// elided in this view — confirm the exact branch/move structure against
// the full file before relying on it.
402 it
= client_need_snapflush
.erase(it
);
// the original inode drops one pending-flush auth pin either way
404 in
->auth_unpin(this);
// any remaining snapid still within in's range keeps the original flushing
407 if (it
!= client_need_snapflush
.end() && it
->first
<= in
->last
)
408 orig_need_flush
= true;
410 return make_pair(cowin_need_flush
, orig_need_flush
);
413 void CInode::mark_dirty_rstat()
415 if (!state_test(STATE_DIRTYRSTAT
)) {
416 dout(10) << __func__
<< dendl
;
417 state_set(STATE_DIRTYRSTAT
);
419 CDentry
*pdn
= get_projected_parent_dn();
420 if (pdn
->is_auth()) {
421 CDir
*pdir
= pdn
->dir
;
422 pdir
->dirty_rstat_inodes
.push_back(&dirty_rstat_item
);
423 mdcache
->mds
->locker
->mark_updated_scatterlock(&pdir
->inode
->nestlock
);
425 // under cross-MDS rename.
426 // DIRTYRSTAT flag will get cleared when rename finishes
427 ceph_assert(state_test(STATE_AMBIGUOUSAUTH
));
431 void CInode::clear_dirty_rstat()
433 if (state_test(STATE_DIRTYRSTAT
)) {
434 dout(10) << __func__
<< dendl
;
435 state_clear(STATE_DIRTYRSTAT
);
437 dirty_rstat_item
.remove_myself();
441 CInode::projected_inode
CInode::project_inode(const MutationRef
& mut
,
442 bool xattr
, bool snap
)
444 if (mut
&& mut
->is_projected(this)) {
445 ceph_assert(!xattr
&& !snap
);
446 auto _inode
= std::const_pointer_cast
<mempool_inode
>(projected_nodes
.back().inode
);
447 return projected_inode(std::move(_inode
), xattr_map_ptr());
450 auto pi
= allocate_inode(*get_projected_inode());
452 if (scrub_infop
&& scrub_infop
->last_scrub_dirty
) {
453 pi
->last_scrub_stamp
= scrub_infop
->last_scrub_stamp
;
454 pi
->last_scrub_version
= scrub_infop
->last_scrub_version
;
455 scrub_infop
->last_scrub_dirty
= false;
456 scrub_maybe_delete_info();
459 const auto& ox
= get_projected_xattrs();
462 px
= allocate_xattr_map();
467 sr_t
* ps
= projected_inode::UNDEF_SRNODE
;
469 ps
= prepare_new_srnode(0);
470 ++num_projected_srnodes
;
473 projected_nodes
.emplace_back(pi
, xattr
? px
: ox
, ps
);
475 mut
->add_projected_node(this);
476 dout(15) << __func__
<< " " << pi
->ino
<< dendl
;
477 return projected_inode(std::move(pi
), std::move(px
), ps
);
480 void CInode::pop_and_dirty_projected_inode(LogSegment
*ls
, const MutationRef
& mut
)
482 ceph_assert(!projected_nodes
.empty());
483 auto front
= std::move(projected_nodes
.front());
484 dout(15) << __func__
<< " v" << front
.inode
->version
<< dendl
;
486 projected_nodes
.pop_front();
488 mut
->remove_projected_node(this);
490 bool pool_updated
= get_inode()->layout
.pool_id
!= front
.inode
->layout
.pool_id
;
491 bool pin_updated
= (get_inode()->export_pin
!= front
.inode
->export_pin
) ||
492 (get_inode()->export_ephemeral_distributed_pin
!=
493 front
.inode
->export_ephemeral_distributed_pin
);
495 reset_inode(std::move(front
.inode
));
496 if (front
.xattrs
!= get_xattrs())
497 reset_xattrs(std::move(front
.xattrs
));
499 if (front
.snapnode
!= projected_inode::UNDEF_SRNODE
) {
500 --num_projected_srnodes
;
501 pop_projected_snaprealm(front
.snapnode
, false);
505 if (get_inode()->is_backtrace_updated())
506 mark_dirty_parent(ls
, pool_updated
);
509 maybe_export_pin(true);
512 sr_t
*CInode::prepare_new_srnode(snapid_t snapid
)
514 const sr_t
*cur_srnode
= get_projected_srnode();
518 new_srnode
= new sr_t(*cur_srnode
);
521 snapid
= mdcache
->get_global_snaprealm()->get_newest_seq();
522 new_srnode
= new sr_t();
523 new_srnode
->seq
= snapid
;
524 new_srnode
->created
= snapid
;
525 new_srnode
->current_parent_since
= get_oldest_snap();
530 const sr_t
*CInode::get_projected_srnode() const {
531 if (num_projected_srnodes
> 0) {
532 for (auto it
= projected_nodes
.rbegin(); it
!= projected_nodes
.rend(); ++it
)
533 if (it
->snapnode
!= projected_inode::UNDEF_SRNODE
)
537 return &snaprealm
->srnode
;
542 void CInode::project_snaprealm(sr_t
*new_srnode
)
544 dout(10) << __func__
<< " " << new_srnode
<< dendl
;
545 ceph_assert(projected_nodes
.back().snapnode
== projected_inode::UNDEF_SRNODE
);
546 projected_nodes
.back().snapnode
= new_srnode
;
547 ++num_projected_srnodes
;
550 void CInode::mark_snaprealm_global(sr_t
*new_srnode
)
552 ceph_assert(!is_dir());
553 // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
554 new_srnode
->last_destroyed
= new_srnode
->current_parent_since
;
555 new_srnode
->current_parent_since
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
556 new_srnode
->mark_parent_global();
559 void CInode::clear_snaprealm_global(sr_t
*new_srnode
)
561 // restore 'current_parent_since'
562 new_srnode
->current_parent_since
= new_srnode
->last_destroyed
;
563 new_srnode
->last_destroyed
= 0;
564 new_srnode
->seq
= mdcache
->get_global_snaprealm()->get_newest_seq();
565 new_srnode
->clear_parent_global();
568 bool CInode::is_projected_snaprealm_global() const
570 const sr_t
*srnode
= get_projected_srnode();
571 if (srnode
&& srnode
->is_parent_global())
576 void CInode::project_snaprealm_past_parent(SnapRealm
*newparent
)
578 sr_t
*new_snap
= project_snaprealm();
579 record_snaprealm_past_parent(new_snap
, newparent
);
583 /* if newparent != parent, add parent to past_parents
584 if parent DNE, we need to find what the parent actually is and fill that in */
585 void CInode::record_snaprealm_past_parent(sr_t
*new_snap
, SnapRealm
*newparent
)
587 ceph_assert(!new_snap
->is_parent_global());
588 SnapRealm
*oldparent
;
590 oldparent
= find_snaprealm();
592 oldparent
= snaprealm
->parent
;
595 if (newparent
!= oldparent
) {
596 snapid_t oldparentseq
= oldparent
->get_newest_seq();
597 if (oldparentseq
+ 1 > new_snap
->current_parent_since
) {
598 // copy old parent's snaps
599 const set
<snapid_t
>& snaps
= oldparent
->get_snaps();
600 auto p
= snaps
.lower_bound(new_snap
->current_parent_since
);
601 if (p
!= snaps
.end())
602 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
603 if (oldparentseq
> new_snap
->seq
)
604 new_snap
->seq
= oldparentseq
;
606 new_snap
->current_parent_since
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
610 void CInode::record_snaprealm_parent_dentry(sr_t
*new_snap
, SnapRealm
*oldparent
,
611 CDentry
*dn
, bool primary_dn
)
613 ceph_assert(new_snap
->is_parent_global());
616 oldparent
= dn
->get_dir()->inode
->find_snaprealm();
617 auto& snaps
= oldparent
->get_snaps();
620 auto p
= snaps
.lower_bound(dn
->first
);
621 if (p
!= snaps
.end())
622 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
624 // 'last_destroyed' is used as 'current_parent_since'
625 auto p
= snaps
.lower_bound(new_snap
->last_destroyed
);
626 if (p
!= snaps
.end())
627 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
628 new_snap
->last_destroyed
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
632 void CInode::early_pop_projected_snaprealm()
634 ceph_assert(!projected_nodes
.empty());
635 if (projected_nodes
.front().snapnode
!= projected_inode::UNDEF_SRNODE
) {
636 pop_projected_snaprealm(projected_nodes
.front().snapnode
, true);
637 projected_nodes
.front().snapnode
= projected_inode::UNDEF_SRNODE
;
638 --num_projected_srnodes
;
642 void CInode::pop_projected_snaprealm(sr_t
*next_snaprealm
, bool early
)
644 if (next_snaprealm
) {
645 dout(10) << __func__
<< (early
? " (early) " : " ")
646 << next_snaprealm
<< " seq " << next_snaprealm
->seq
<< dendl
;
650 auto old_flags
= snaprealm
->srnode
.flags
;
651 snaprealm
->srnode
= *next_snaprealm
;
652 delete next_snaprealm
;
654 if ((snaprealm
->srnode
.flags
^ old_flags
) & sr_t::PARENT_GLOBAL
) {
655 snaprealm
->adjust_parent();
658 if (snaprealm
->parent
)
659 dout(10) << " realm " << *snaprealm
<< " parent " << *snaprealm
->parent
<< dendl
;
661 dout(10) << __func__
<< (early
? " (early) null" : " null") << dendl
;
662 ceph_assert(snaprealm
);
663 snaprealm
->merge_to(NULL
);
668 // ====== CInode =======
672 InodeStoreBase::inode_const_ptr
InodeStoreBase::empty_inode
= InodeStoreBase::allocate_inode();
674 __u32
InodeStoreBase::hash_dentry_name(std::string_view dn
)
676 int which
= inode
->dir_layout
.dl_dir_hash
;
678 which
= CEPH_STR_HASH_LINUX
;
679 ceph_assert(ceph_str_hash_valid(which
));
680 return ceph_str_hash(which
, dn
.data(), dn
.length());
683 frag_t
InodeStoreBase::pick_dirfrag(std::string_view dn
)
685 if (dirfragtree
.empty())
686 return frag_t(); // avoid the string hash if we can.
688 __u32 h
= hash_dentry_name(dn
);
689 return dirfragtree
[h
];
692 std::pair
<bool, std::vector
<CDir
*>> CInode::get_dirfrags_under(frag_t fg
)
694 std::pair
<bool, std::vector
<CDir
*>> result
;
695 auto& all
= result
.first
;
696 auto& dirs
= result
.second
;
699 if (auto it
= dirfrags
.find(fg
); it
!= dirfrags
.end()){
701 dirs
.push_back(it
->second
);
706 for(auto &[_fg
, _dir
] : dirfrags
){
707 // frag_t.bits() can indicate the depth of the partition in the directory tree
709 // 01* : bit = 2, on the second floor
712 // 00* 01* 10* 11* -- > level 2, bit = 2
713 // so fragA.bits > fragB.bits means fragA is deeper than fragB
715 if (fg
.bits() >= _fg
.bits()) {
716 if (_fg
.contains(fg
)) {
721 if (fg
.contains(_fg
)) {
722 dirs
.push_back(_dir
);
723 // we can calculate how many sub slices a slice can be divided into
724 // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*)
725 // or 2^2 frags belonging to the second layer(00* 01* 10* 11*)
726 // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level
727 total
+= 1 << (24 - _fg
.bits());
732 // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache
733 all
= ((1<<(24-fg
.bits())) == total
);
737 void CInode::verify_dirfrags()
740 for (const auto &p
: dirfrags
) {
741 if (!dirfragtree
.is_leaf(p
.first
)) {
742 dout(0) << "have open dirfrag " << p
.first
<< " but not leaf in " << dirfragtree
743 << ": " << *p
.second
<< dendl
;
750 void CInode::force_dirfrags()
753 for (auto &p
: dirfrags
) {
754 if (!dirfragtree
.is_leaf(p
.first
)) {
755 dout(0) << "have open dirfrag " << p
.first
<< " but not leaf in " << dirfragtree
756 << ": " << *p
.second
<< dendl
;
763 dirfragtree
.get_leaves(leaves
);
764 for (const auto& leaf
: leaves
) {
765 mdcache
->get_force_dirfrag(dirfrag_t(ino(), leaf
), true);
772 CDir
*CInode::get_approx_dirfrag(frag_t fg
)
774 CDir
*dir
= get_dirfrag(fg
);
778 auto&& p
= get_dirfrags_under(fg
);
779 if (!p
.second
.empty())
780 return p
.second
.front();
783 while (fg
.bits() > 0) {
785 dir
= get_dirfrag(fg
);
791 CDir
*CInode::get_or_open_dirfrag(MDCache
*mdcache
, frag_t fg
)
793 ceph_assert(is_dir());
796 CDir
*dir
= get_dirfrag(fg
);
799 ceph_assert(is_auth() || mdcache
->mds
->is_any_replay());
800 dir
= new CDir(this, fg
, mdcache
, is_auth());
806 CDir
*CInode::add_dirfrag(CDir
*dir
)
808 auto em
= dirfrags
.emplace(std::piecewise_construct
, std::forward_as_tuple(dir
->dirfrag().frag
), std::forward_as_tuple(dir
));
809 ceph_assert(em
.second
);
811 if (stickydir_ref
> 0) {
812 dir
->state_set(CDir::STATE_STICKY
);
813 dir
->get(CDir::PIN_STICKY
);
821 void CInode::close_dirfrag(frag_t fg
)
823 dout(14) << __func__
<< " " << fg
<< dendl
;
824 ceph_assert(dirfrags
.count(fg
));
826 CDir
*dir
= dirfrags
[fg
];
827 dir
->remove_null_dentries();
833 if (stickydir_ref
> 0) {
834 dir
->state_clear(CDir::STATE_STICKY
);
835 dir
->put(CDir::PIN_STICKY
);
838 if (dir
->is_subtree_root())
841 // dump any remaining dentries, for debugging purposes
842 for (const auto &p
: dir
->items
)
843 dout(14) << __func__
<< " LEFTOVER dn " << *p
.second
<< dendl
;
845 ceph_assert(dir
->get_num_ref() == 0);
850 void CInode::close_dirfrags()
852 while (!dirfrags
.empty())
853 close_dirfrag(dirfrags
.begin()->first
);
856 bool CInode::has_subtree_root_dirfrag(int auth
)
858 if (num_subtree_roots
> 0) {
861 for (const auto &p
: dirfrags
) {
862 if (p
.second
->is_subtree_root() &&
863 p
.second
->dir_auth
.first
== auth
)
870 bool CInode::has_subtree_or_exporting_dirfrag()
872 if (num_subtree_roots
> 0 || num_exporting_dirs
> 0)
877 void CInode::get_stickydirs()
879 if (stickydir_ref
== 0) {
881 for (const auto &p
: dirfrags
) {
882 p
.second
->state_set(CDir::STATE_STICKY
);
883 p
.second
->get(CDir::PIN_STICKY
);
889 void CInode::put_stickydirs()
891 ceph_assert(stickydir_ref
> 0);
893 if (stickydir_ref
== 0) {
895 for (const auto &p
: dirfrags
) {
896 p
.second
->state_clear(CDir::STATE_STICKY
);
897 p
.second
->put(CDir::PIN_STICKY
);
// Reference-count hooks invoked by MDSCacheObject when the pin count makes
// its first/last transition: pin or unpin the primary parent dentry so it
// stays cached while this inode is referenced.
908 void CInode::first_get()
912 parent
->get(CDentry::PIN_INODEPIN
);
915 void CInode::last_put()
919 parent
->put(CDentry::PIN_INODEPIN
);
// NOTE(review): intervening lines are elided in this view; the check below
// (re-evaluating stray candidacy when only dirty/dirty-parent refs remain)
// appears to belong to a separate hook — confirm against the full file.
924 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
925 mdcache
->maybe_eval_stray(this, true);
928 void CInode::add_remote_parent(CDentry
*p
)
930 if (remote_parents
.empty())
931 get(PIN_REMOTEPARENT
);
932 remote_parents
.insert(p
);
934 void CInode::remove_remote_parent(CDentry
*p
)
936 remote_parents
.erase(p
);
937 if (remote_parents
.empty())
938 put(PIN_REMOTEPARENT
);
944 CDir
*CInode::get_parent_dir()
950 CDir
*CInode::get_projected_parent_dir()
952 CDentry
*p
= get_projected_parent_dn();
957 CInode
*CInode::get_parent_inode()
960 return parent
->dir
->inode
;
964 bool CInode::is_ancestor_of(const CInode
*other
) const
969 const CDentry
*pdn
= other
->get_oldest_parent_dn();
971 ceph_assert(other
->is_base());
974 other
= pdn
->get_dir()->get_inode();
979 bool CInode::is_projected_ancestor_of(const CInode
*other
) const
984 const CDentry
*pdn
= other
->get_projected_parent_dn();
986 ceph_assert(other
->is_base());
989 other
= pdn
->get_dir()->get_inode();
995 * Because a non-directory inode may have multiple links, the use_parent
996 * argument allows selecting which parent to use for path construction. This
997 * argument is only meaningful for the final component (i.e. the first of the
998 * nested calls) because directories cannot have multiple hard links. If
999 * use_parent is NULL and projected is true, the primary parent's projected
1000 * inode is used all the way up the path chain. Otherwise the primary parent
1001 * stable inode is used.
1003 void CInode::make_path_string(string
& s
, bool projected
, const CDentry
*use_parent
) const
1006 use_parent
= projected
? get_projected_parent_dn() : parent
;
1010 use_parent
->make_path_string(s
, projected
);
1011 } else if (is_root()) {
1013 } else if (is_mdsdir()) {
1015 uint64_t eino(ino());
1016 eino
-= MDS_INO_MDSDIR_OFFSET
;
1017 snprintf(t
, sizeof(t
), "~mds%" PRId64
, eino
);
1021 uint64_t eino(ino());
1022 snprintf(n
, sizeof(n
), "#%" PRIx64
, eino
);
1027 void CInode::make_path(filepath
& fp
, bool projected
) const
1029 const CDentry
*use_parent
= projected
? get_projected_parent_dn() : parent
;
1031 ceph_assert(!is_base());
1032 use_parent
->make_path(fp
, projected
);
1034 fp
= filepath(ino());
1038 void CInode::name_stray_dentry(string
& dname
)
1041 snprintf(s
, sizeof(s
), "%llx", (unsigned long long)ino().val
);
1045 version_t
CInode::pre_dirty()
1048 CDentry
* _cdentry
= get_projected_parent_dn();
1050 pv
= _cdentry
->pre_dirty(get_projected_version());
1051 dout(10) << "pre_dirty " << pv
<< " (current v " << get_inode()->version
<< ")" << dendl
;
1053 ceph_assert(is_base());
1054 pv
= get_projected_version() + 1;
1056 // force update backtrace for old format inode (see mempool_inode::decode)
1057 if (get_inode()->backtrace_version
== 0 && !projected_nodes
.empty()) {
1058 auto pi
= _get_projected_inode();
1059 if (pi
->backtrace_version
== 0)
1060 pi
->update_backtrace(pv
);
1065 void CInode::_mark_dirty(LogSegment
*ls
)
1067 if (!state_test(STATE_DIRTY
)) {
1068 state_set(STATE_DIRTY
);
1073 // move myself to this segment's dirty list
1075 ls
->dirty_inodes
.push_back(&item_dirty
);
1078 void CInode::mark_dirty(LogSegment
*ls
) {
1080 dout(10) << __func__
<< " " << *this << dendl
;
1083 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1084 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1088 // only auth can get dirty. "dirty" async data in replicas is relative to
1089 // filelock state, not the dirty flag.
1090 ceph_assert(is_auth());
1092 // touch my private version
1097 parent
->mark_dirty(get_version(), ls
);
1101 void CInode::mark_clean()
1103 dout(10) << __func__
<< " " << *this << dendl
;
1104 if (state_test(STATE_DIRTY
)) {
1105 state_clear(STATE_DIRTY
);
1108 // remove myself from ls dirty list
1109 item_dirty
.remove_myself();
1115 // per-inode storage
1116 // (currently for root inode only)
1118 struct C_IO_Inode_Stored
: public CInodeIOContext
{
1121 C_IO_Inode_Stored(CInode
*i
, version_t v
, Context
*f
) : CInodeIOContext(i
), version(v
), fin(f
) {}
1122 void finish(int r
) override
{
1123 in
->_stored(r
, version
, fin
);
1125 void print(ostream
& out
) const override
{
1126 out
<< "inode_store(" << in
->ino() << ")";
1130 object_t
InodeStoreBase::get_object_name(inodeno_t ino
, frag_t fg
, std::string_view suffix
)
1133 snprintf(n
, sizeof(n
), "%llx.%08llx", (long long unsigned)ino
, (long long unsigned)fg
);
1134 ceph_assert(strlen(n
) + suffix
.size() < sizeof n
);
1135 strncat(n
, suffix
.data(), suffix
.size());
1139 void CInode::store(MDSContext
*fin
)
1141 dout(10) << __func__
<< " " << get_version() << dendl
;
1142 ceph_assert(is_base());
1145 purge_stale_snap_data(snaprealm
->get_snaps());
1149 string magic
= CEPH_FS_ONDISK_MAGIC
;
1152 encode_store(bl
, mdcache
->mds
->mdsmap
->get_up_features());
1159 object_t oid
= CInode::get_object_name(ino(), frag_t(), ".inode");
1160 object_locator_t
oloc(mdcache
->mds
->get_metadata_pool());
1163 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin
),
1164 mdcache
->mds
->finisher
);
1165 mdcache
->mds
->objecter
->mutate(oid
, oloc
, m
, snapc
,
1166 ceph::real_clock::now(), 0,
1170 void CInode::_stored(int r
, version_t v
, Context
*fin
)
1173 dout(1) << "store error " << r
<< " v " << v
<< " on " << *this << dendl
;
1174 mdcache
->mds
->clog
->error() << "failed to store inode " << ino()
1175 << " object: " << cpp_strerror(r
);
1176 mdcache
->mds
->handle_write_error(r
);
1181 dout(10) << __func__
<< " " << v
<< " on " << *this << dendl
;
1182 if (v
== get_projected_version())
1188 void CInode::flush(MDSContext
*fin
)
1190 dout(10) << __func__
<< " " << *this << dendl
;
1191 ceph_assert(is_auth() && can_auth_pin());
1193 MDSGatherBuilder
gather(g_ceph_context
);
1195 if (is_dirty_parent()) {
1196 store_backtrace(gather
.new_sub());
1200 store(gather
.new_sub());
1202 parent
->dir
->commit(0, gather
.new_sub());
1206 if (gather
.has_subs()) {
1207 gather
.set_finisher(fin
);
1214 struct C_IO_Inode_Fetched
: public CInodeIOContext
{
1217 C_IO_Inode_Fetched(CInode
*i
, Context
*f
) : CInodeIOContext(i
), fin(f
) {}
1218 void finish(int r
) override
{
1219 // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT
1220 in
->_fetched(bl
, bl2
, fin
);
1222 void print(ostream
& out
) const override
{
1223 out
<< "inode_fetch(" << in
->ino() << ")";
1227 void CInode::fetch(MDSContext
*fin
)
1229 dout(10) << __func__
<< dendl
;
1231 C_IO_Inode_Fetched
*c
= new C_IO_Inode_Fetched(this, fin
);
1232 C_GatherBuilder
gather(g_ceph_context
, new C_OnFinisher(c
, mdcache
->mds
->finisher
));
1234 object_t oid
= CInode::get_object_name(ino(), frag_t(), "");
1235 object_locator_t
oloc(mdcache
->mds
->get_metadata_pool());
1237 // Old on-disk format: inode stored in xattr of a dirfrag
1239 rd
.getxattr("inode", &c
->bl
, NULL
);
1240 mdcache
->mds
->objecter
->read(oid
, oloc
, rd
, CEPH_NOSNAP
, (bufferlist
*)NULL
, 0, gather
.new_sub());
1242 // Current on-disk format: inode stored in a .inode object
1243 object_t oid2
= CInode::get_object_name(ino(), frag_t(), ".inode");
1244 mdcache
->mds
->objecter
->read(oid2
, oloc
, 0, 0, CEPH_NOSNAP
, &c
->bl2
, 0, gather
.new_sub());
1249 void CInode::_fetched(bufferlist
& bl
, bufferlist
& bl2
, Context
*fin
)
1251 dout(10) << __func__
<< " got " << bl
.length() << " and " << bl2
.length() << dendl
;
1252 bufferlist::const_iterator p
;
1255 } else if (bl
.length()) {
1258 derr
<< "No data while reading inode " << ino() << dendl
;
1259 fin
->complete(-CEPHFS_ENOENT
);
1268 dout(10) << " magic is '" << magic
<< "' (expecting '"
1269 << CEPH_FS_ONDISK_MAGIC
<< "')" << dendl
;
1270 if (magic
!= CEPH_FS_ONDISK_MAGIC
) {
1271 dout(0) << "on disk magic '" << magic
<< "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1273 fin
->complete(-CEPHFS_EINVAL
);
1276 dout(10) << "_fetched " << *this << dendl
;
1279 } catch (buffer::error
&err
) {
1280 derr
<< "Corrupt inode " << ino() << ": " << err
.what() << dendl
;
1281 fin
->complete(-CEPHFS_EINVAL
);
1286 void CInode::build_backtrace(int64_t pool
, inode_backtrace_t
& bt
)
1289 bt
.ancestors
.clear();
1293 CDentry
*pdn
= get_parent_dn();
1295 CInode
*diri
= pdn
->get_dir()->get_inode();
1296 bt
.ancestors
.push_back(inode_backpointer_t(diri
->ino(), pdn
->get_name(), in
->get_inode()->version
));
1298 pdn
= in
->get_parent_dn();
1300 bt
.old_pools
.reserve(get_inode()->old_pools
.size());
1301 for (auto &p
: get_inode()->old_pools
) {
1302 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
1304 bt
.old_pools
.push_back(p
);
1308 struct C_IO_Inode_StoredBacktrace
: public CInodeIOContext
{
1311 C_IO_Inode_StoredBacktrace(CInode
*i
, version_t v
, Context
*f
) : CInodeIOContext(i
), version(v
), fin(f
) {}
1312 void finish(int r
) override
{
1313 in
->_stored_backtrace(r
, version
, fin
);
1315 void print(ostream
& out
) const override
{
1316 out
<< "backtrace_store(" << in
->ino() << ")";
1321 void CInode::_commit_ops(int r
, C_GatherBuilder
&gather_bld
,
1322 std::vector
<CInodeCommitOperation
> &ops_vec
,
1323 inode_backtrace_t
&bt
)
1325 dout(10) << __func__
<< dendl
;
1328 mdcache
->mds
->handle_write_error_with_lock(r
);
1333 object_t oid
= get_object_name(ino(), frag_t(), "");
1335 for (auto &op
: ops_vec
) {
1336 ObjectOperation obj_op
;
1337 object_locator_t
oloc(op
.get_pool());
1338 op
.update(obj_op
, bt
);
1339 mdcache
->mds
->objecter
->mutate(oid
, oloc
, obj_op
, snapc
,
1340 ceph::real_clock::now(),
1341 0, gather_bld
.new_sub());
1345 void CInode::_store_backtrace(std::vector
<CInodeCommitOperation
> &ops_vec
,
1346 inode_backtrace_t
&bt
, int op_prio
)
1348 dout(10) << __func__
<< " on " << *this << dendl
;
1349 ceph_assert(is_dirty_parent());
1352 op_prio
= CEPH_MSG_PRIO_DEFAULT
;
1356 const int64_t pool
= get_backtrace_pool();
1357 build_backtrace(pool
, bt
);
1359 std::string_view slink
= "";
1360 if (is_symlink() && mdcache
->get_symlink_recovery()) {
1364 ops_vec
.emplace_back(op_prio
, pool
, get_inode()->layout
,
1365 mdcache
->mds
->mdsmap
->get_up_features(), slink
);
1367 if (!state_test(STATE_DIRTYPOOL
) || get_inode()->old_pools
.empty()) {
1368 dout(20) << __func__
<< ": no dirtypool or no old pools" << dendl
;
1372 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1373 // such that anyone reading them will see the new pool ID in
1374 // inode_backtrace_t::pool and go read everything else from there.
1375 for (const auto &p
: get_inode()->old_pools
) {
1379 dout(20) << __func__
<< ": updating old pool " << p
<< dendl
;
1381 ops_vec
.emplace_back(op_prio
, p
);
1385 void CInode::store_backtrace(MDSContext
*fin
, int op_prio
)
1387 std::vector
<CInodeCommitOperation
> ops_vec
;
1388 inode_backtrace_t bt
;
1389 auto version
= get_inode()->backtrace_version
;
1391 _store_backtrace(ops_vec
, bt
, op_prio
);
1393 C_GatherBuilder
gather(g_ceph_context
,
1395 new C_IO_Inode_StoredBacktrace(this, version
, fin
),
1396 mdcache
->mds
->finisher
));
1397 _commit_ops(0, gather
, ops_vec
, bt
);
1398 ceph_assert(gather
.has_subs());
1402 void CInode::store_backtrace(CInodeCommitOperations
&op
, int op_prio
)
1404 op
.version
= get_inode()->backtrace_version
;
1407 _store_backtrace(op
.ops_vec
, op
.bt
, op_prio
);
1410 void CInode::_stored_backtrace(int r
, version_t v
, Context
*fin
)
1412 if (r
== -CEPHFS_ENOENT
) {
1413 const int64_t pool
= get_backtrace_pool();
1414 bool exists
= mdcache
->mds
->objecter
->with_osdmap(
1415 [pool
](const OSDMap
&osd_map
) {
1416 return osd_map
.have_pg_pool(pool
);
1419 // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it
1420 // out from under us), so the backtrace can never be written, so pretend
1421 // to succeed so that the user can proceed to e.g. delete the file.
1423 dout(4) << __func__
<< " got CEPHFS_ENOENT: a data pool was deleted "
1424 "beneath us!" << dendl
;
1430 dout(1) << "store backtrace error " << r
<< " v " << v
<< dendl
;
1431 mdcache
->mds
->clog
->error() << "failed to store backtrace on ino "
1432 << ino() << " object"
1433 << ", pool " << get_backtrace_pool()
1435 mdcache
->mds
->handle_write_error(r
);
1441 dout(10) << __func__
<< " v " << v
<< dendl
;
1444 if (v
== get_inode()->backtrace_version
)
1445 clear_dirty_parent();
1450 void CInode::fetch_backtrace(Context
*fin
, bufferlist
*backtrace
)
1452 mdcache
->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace
, fin
);
1455 void CInode::mark_dirty_parent(LogSegment
*ls
, bool dirty_pool
)
1457 if (!state_test(STATE_DIRTYPARENT
)) {
1458 dout(10) << __func__
<< dendl
;
1459 state_set(STATE_DIRTYPARENT
);
1460 get(PIN_DIRTYPARENT
);
1464 state_set(STATE_DIRTYPOOL
);
1466 ls
->dirty_parent_inodes
.push_back(&item_dirty_parent
);
1469 void CInode::clear_dirty_parent()
1471 if (state_test(STATE_DIRTYPARENT
)) {
1472 dout(10) << __func__
<< dendl
;
1473 state_clear(STATE_DIRTYPARENT
);
1474 state_clear(STATE_DIRTYPOOL
);
1475 put(PIN_DIRTYPARENT
);
1476 item_dirty_parent
.remove_myself();
1480 void CInode::verify_diri_backtrace(bufferlist
&bl
, int err
)
1482 if (is_base() || is_dirty_parent() || !is_auth())
1485 dout(10) << __func__
<< dendl
;
1488 inode_backtrace_t backtrace
;
1490 decode(backtrace
, bl
);
1491 CDentry
*pdn
= get_parent_dn();
1492 if (backtrace
.ancestors
.empty() ||
1493 backtrace
.ancestors
[0].dname
!= pdn
->get_name() ||
1494 backtrace
.ancestors
[0].dirino
!= pdn
->get_dir()->ino())
1495 err
= -CEPHFS_EINVAL
;
1499 MDSRank
*mds
= mdcache
->mds
;
1500 mds
->clog
->error() << "bad backtrace on directory inode " << ino();
1501 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace
> 1));
1503 mark_dirty_parent(mds
->mdlog
->get_current_segment(), false);
1504 mds
->mdlog
->flush();
1508 // ------------------
1512 void InodeStoreBase::encode_xattrs(bufferlist
&bl
) const {
1515 encode(*xattrs
, bl
);
1517 encode((__u32
)0, bl
);
1520 void InodeStoreBase::decode_xattrs(bufferlist::const_iterator
&p
) {
1522 mempool_xattr_map tmp
;
1523 decode_noshare(tmp
, p
);
1525 reset_xattrs(xattr_map_ptr());
1527 reset_xattrs(allocate_xattr_map(std::move(tmp
)));
1531 void InodeStoreBase::encode_old_inodes(bufferlist
&bl
, uint64_t features
) const {
1534 encode(*old_inodes
, bl
, features
);
1536 encode((__u32
)0, bl
);
1539 void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator
&p
) {
1541 mempool_old_inode_map tmp
;
1544 reset_old_inodes(old_inode_map_ptr());
1546 reset_old_inodes(allocate_old_inode_map(std::move(tmp
)));
1550 void InodeStoreBase::encode_bare(bufferlist
&bl
, uint64_t features
,
1551 const bufferlist
*snap_blob
) const
1554 encode(*inode
, bl
, features
);
1555 if (inode
->is_symlink())
1556 encode(symlink
, bl
);
1557 encode(dirfragtree
, bl
);
1561 encode(*snap_blob
, bl
);
1563 encode(bufferlist(), bl
);
1564 encode_old_inodes(bl
, features
);
1565 encode(oldest_snap
, bl
);
1566 encode(damage_flags
, bl
);
1569 void InodeStoreBase::encode(bufferlist
&bl
, uint64_t features
,
1570 const bufferlist
*snap_blob
) const
1572 ENCODE_START(6, 4, bl
);
1573 encode_bare(bl
, features
, snap_blob
);
1577 void CInode::encode_store(bufferlist
& bl
, uint64_t features
)
1579 bufferlist snap_blob
;
1580 encode_snap_blob(snap_blob
);
1581 InodeStoreBase::encode(bl
, mdcache
->mds
->mdsmap
->get_up_features(),
1585 void InodeStoreBase::decode_bare(bufferlist::const_iterator
&bl
,
1586 bufferlist
& snap_blob
, __u8 struct_v
)
1590 auto _inode
= allocate_inode();
1591 decode(*_inode
, bl
);
1593 if (_inode
->is_symlink()) {
1596 symlink
= std::string_view(tmp
);
1598 decode(dirfragtree
, bl
);
1600 decode(snap_blob
, bl
);
1602 decode_old_inodes(bl
);
1603 if (struct_v
== 2 && _inode
->is_dir()) {
1604 bool default_layout_exists
;
1605 decode(default_layout_exists
, bl
);
1606 if (default_layout_exists
) {
1607 decode(struct_v
, bl
); // this was a default_file_layout
1608 decode(_inode
->layout
, bl
); // but we only care about the layout portion
1612 if (struct_v
>= 5) {
1613 // InodeStore is embedded in dentries without proper versioning, so
1614 // we consume up to the end of the buffer
1616 decode(oldest_snap
, bl
);
1620 decode(damage_flags
, bl
);
1624 reset_inode(std::move(_inode
));
1628 void InodeStoreBase::decode(bufferlist::const_iterator
&bl
, bufferlist
& snap_blob
)
1630 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl
);
1631 decode_bare(bl
, snap_blob
, struct_v
);
1635 void CInode::decode_store(bufferlist::const_iterator
& bl
)
1637 bufferlist snap_blob
;
1638 InodeStoreBase::decode(bl
, snap_blob
);
1639 decode_snap_blob(snap_blob
);
1642 // ------------------
1645 SimpleLock
* CInode::get_lock(int type
)
1648 case CEPH_LOCK_IVERSION
: return &versionlock
;
1649 case CEPH_LOCK_IFILE
: return &filelock
;
1650 case CEPH_LOCK_IAUTH
: return &authlock
;
1651 case CEPH_LOCK_ILINK
: return &linklock
;
1652 case CEPH_LOCK_IDFT
: return &dirfragtreelock
;
1653 case CEPH_LOCK_IXATTR
: return &xattrlock
;
1654 case CEPH_LOCK_ISNAP
: return &snaplock
;
1655 case CEPH_LOCK_INEST
: return &nestlock
;
1656 case CEPH_LOCK_IFLOCK
: return &flocklock
;
1657 case CEPH_LOCK_IPOLICY
: return &policylock
;
1662 void CInode::set_object_info(MDSCacheObjectInfo
&info
)
1668 void CInode::encode_lock_iauth(bufferlist
& bl
)
1670 ENCODE_START(1, 1, bl
);
1671 encode(get_inode()->version
, bl
);
1672 encode(get_inode()->ctime
, bl
);
1673 encode(get_inode()->mode
, bl
);
1674 encode(get_inode()->uid
, bl
);
1675 encode(get_inode()->gid
, bl
);
1679 void CInode::decode_lock_iauth(bufferlist::const_iterator
& p
)
1681 ceph_assert(!is_auth());
1682 auto _inode
= allocate_inode(*get_inode());
1684 decode(_inode
->version
, p
);
1687 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
1688 decode(_inode
->mode
, p
);
1689 decode(_inode
->uid
, p
);
1690 decode(_inode
->gid
, p
);
1692 reset_inode(std::move(_inode
));
1695 void CInode::encode_lock_ilink(bufferlist
& bl
)
1697 ENCODE_START(1, 1, bl
);
1698 encode(get_inode()->version
, bl
);
1699 encode(get_inode()->ctime
, bl
);
1700 encode(get_inode()->nlink
, bl
);
1704 void CInode::decode_lock_ilink(bufferlist::const_iterator
& p
)
1706 ceph_assert(!is_auth());
1707 auto _inode
= allocate_inode(*get_inode());
1709 decode(_inode
->version
, p
);
1712 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
1713 decode(_inode
->nlink
, p
);
1715 reset_inode(std::move(_inode
));
1718 void CInode::encode_lock_idft(bufferlist
& bl
)
1720 ENCODE_START(1, 1, bl
);
1722 encode(get_inode()->version
, bl
);
1724 // treat flushing as dirty when rejoining cache
1725 bool dirty
= dirfragtreelock
.is_dirty_or_flushing();
1729 // encode the raw tree
1730 encode(dirfragtree
, bl
);
1732 // also specify which frags are mine
1733 set
<frag_t
> myfrags
;
1734 auto&& dfls
= get_dirfrags();
1735 for (const auto& dir
: dfls
) {
1736 if (dir
->is_auth()) {
1737 frag_t fg
= dir
->get_frag();
1741 encode(myfrags
, bl
);
1746 void CInode::decode_lock_idft(bufferlist::const_iterator
& p
)
1753 decode(replica_dirty
, p
);
1754 if (replica_dirty
) {
1755 dout(10) << __func__
<< " setting dftlock dirty flag" << dendl
;
1756 dirfragtreelock
.mark_dirty(); // ok bc we're auth and caller will handle
1759 _inode
= allocate_inode(*get_inode());
1760 decode(_inode
->version
, p
);
1765 set
<frag_t
> authfrags
;
1766 decode(authfrags
, p
);
1768 // auth. believe replica's auth frags only.
1769 for (auto fg
: authfrags
) {
1770 if (!dirfragtree
.is_leaf(fg
)) {
1771 dout(10) << " forcing frag " << fg
<< " to leaf (split|merge)" << dendl
;
1772 dirfragtree
.force_to_leaf(g_ceph_context
, fg
);
1773 dirfragtreelock
.mark_dirty(); // ok bc we're auth and caller will handle
1777 // replica. take the new tree, BUT make sure any open
1778 // dirfrags remain leaves (they may have split _after_ this
1779 // dft was scattered, or we may still be be waiting on the
1780 // notify from the auth)
1781 dirfragtree
.swap(temp
);
1782 for (const auto &p
: dirfrags
) {
1783 if (!dirfragtree
.is_leaf(p
.first
)) {
1784 dout(10) << " forcing open dirfrag " << p
.first
<< " to leaf (racing with split|merge)" << dendl
;
1785 dirfragtree
.force_to_leaf(g_ceph_context
, p
.first
);
1787 if (p
.second
->is_auth())
1788 p
.second
->state_clear(CDir::STATE_DIRTYDFT
);
1791 if (g_conf()->mds_debug_frag
)
1797 reset_inode(std::move(_inode
));
1800 void CInode::encode_lock_ifile(bufferlist
& bl
)
1802 ENCODE_START(1, 1, bl
);
1804 encode(get_inode()->version
, bl
);
1805 encode(get_inode()->ctime
, bl
);
1806 encode(get_inode()->mtime
, bl
);
1807 encode(get_inode()->atime
, bl
);
1808 encode(get_inode()->time_warp_seq
, bl
);
1810 encode(get_inode()->layout
, bl
, mdcache
->mds
->mdsmap
->get_up_features());
1811 encode(get_inode()->size
, bl
);
1812 encode(get_inode()->truncate_seq
, bl
);
1813 encode(get_inode()->truncate_size
, bl
);
1814 encode(get_inode()->client_ranges
, bl
);
1815 encode(get_inode()->inline_data
, bl
);
1818 // treat flushing as dirty when rejoining cache
1819 bool dirty
= filelock
.is_dirty_or_flushing();
1822 dout(15) << __func__
<< " inode.dirstat is " << get_inode()->dirstat
<< dendl
;
1823 encode(get_inode()->dirstat
, bl
); // only meaningful if i am auth.
1826 for (const auto &p
: dirfrags
) {
1827 frag_t fg
= p
.first
;
1828 CDir
*dir
= p
.second
;
1829 if (is_auth() || dir
->is_auth()) {
1830 const auto& pf
= dir
->get_projected_fnode();
1831 dout(15) << fg
<< " " << *dir
<< dendl
;
1832 dout(20) << fg
<< " fragstat " << pf
->fragstat
<< dendl
;
1833 dout(20) << fg
<< " accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
1835 encode(dir
->first
, tmp
);
1836 encode(pf
->fragstat
, tmp
);
1837 encode(pf
->accounted_fragstat
, tmp
);
1842 bl
.claim_append(tmp
);
1846 void CInode::decode_lock_ifile(bufferlist::const_iterator
& p
)
1852 _inode
= allocate_inode(*get_inode());
1854 decode(_inode
->version
, p
);
1857 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
1858 decode(_inode
->mtime
, p
);
1859 decode(_inode
->atime
, p
);
1860 decode(_inode
->time_warp_seq
, p
);
1862 decode(_inode
->layout
, p
);
1863 decode(_inode
->size
, p
);
1864 decode(_inode
->truncate_seq
, p
);
1865 decode(_inode
->truncate_size
, p
);
1866 decode(_inode
->client_ranges
, p
);
1867 decode(_inode
->inline_data
, p
);
1871 decode(replica_dirty
, p
);
1872 if (replica_dirty
) {
1873 dout(10) << __func__
<< " setting filelock dirty flag" << dendl
;
1874 filelock
.mark_dirty(); // ok bc we're auth and caller will handle
1878 frag_info_t dirstat
;
1881 dout(10) << " taking inode dirstat " << dirstat
<< " for " << *this << dendl
;
1882 _inode
->dirstat
= dirstat
; // take inode summation if replica
1886 dout(10) << " ...got " << n
<< " fragstats on " << *this << dendl
;
1890 frag_info_t fragstat
;
1891 frag_info_t accounted_fragstat
;
1894 decode(fragstat
, p
);
1895 decode(accounted_fragstat
, p
);
1896 dout(10) << fg
<< " [" << fgfirst
<< ",head] " << dendl
;
1897 dout(10) << fg
<< " fragstat " << fragstat
<< dendl
;
1898 dout(20) << fg
<< " accounted_fragstat " << accounted_fragstat
<< dendl
;
1900 CDir
*dir
= get_dirfrag(fg
);
1902 ceph_assert(dir
); // i am auth; i had better have this dir open
1903 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1904 << " on " << *dir
<< dendl
;
1905 dir
->first
= fgfirst
;
1906 auto _fnode
= CDir::allocate_fnode(*dir
->get_fnode());
1907 _fnode
->fragstat
= fragstat
;
1908 _fnode
->accounted_fragstat
= accounted_fragstat
;
1909 dir
->reset_fnode(std::move(_fnode
));
1910 if (!(fragstat
== accounted_fragstat
)) {
1911 dout(10) << fg
<< " setting filelock updated flag" << dendl
;
1912 filelock
.mark_dirty(); // ok bc we're auth and caller will handle
1915 if (dir
&& dir
->is_auth()) {
1916 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1917 << " on " << *dir
<< dendl
;
1918 dir
->first
= fgfirst
;
1919 const auto& pf
= dir
->get_projected_fnode();
1920 finish_scatter_update(&filelock
, dir
,
1921 _inode
->dirstat
.version
, pf
->accounted_fragstat
.version
);
1928 reset_inode(std::move(_inode
));
1931 void CInode::encode_lock_inest(bufferlist
& bl
)
1933 ENCODE_START(1, 1, bl
);
1935 encode(get_inode()->version
, bl
);
1937 // treat flushing as dirty when rejoining cache
1938 bool dirty
= nestlock
.is_dirty_or_flushing();
1941 dout(15) << __func__
<< " inode.rstat is " << get_inode()->rstat
<< dendl
;
1942 encode(get_inode()->rstat
, bl
); // only meaningful if i am auth.
1945 for (const auto &p
: dirfrags
) {
1946 frag_t fg
= p
.first
;
1947 CDir
*dir
= p
.second
;
1948 if (is_auth() || dir
->is_auth()) {
1949 const auto& pf
= dir
->get_projected_fnode();
1950 dout(10) << __func__
<< " " << fg
<< " dir " << *dir
<< dendl
;
1951 dout(10) << __func__
<< " " << fg
<< " rstat " << pf
->rstat
<< dendl
;
1952 dout(10) << __func__
<< " " << fg
<< " accounted_rstat " << pf
->rstat
<< dendl
;
1953 dout(10) << __func__
<< " " << fg
<< " dirty_old_rstat " << dir
->dirty_old_rstat
<< dendl
;
1955 encode(dir
->first
, tmp
);
1956 encode(pf
->rstat
, tmp
);
1957 encode(pf
->accounted_rstat
, tmp
);
1958 encode(dir
->dirty_old_rstat
, tmp
);
1963 bl
.claim_append(tmp
);
1967 void CInode::decode_lock_inest(bufferlist::const_iterator
& p
)
1974 decode(replica_dirty
, p
);
1975 if (replica_dirty
) {
1976 dout(10) << __func__
<< " setting nestlock dirty flag" << dendl
;
1977 nestlock
.mark_dirty(); // ok bc we're auth and caller will handle
1980 _inode
= allocate_inode(*get_inode());
1981 decode(_inode
->version
, p
);
1986 dout(10) << __func__
<< " taking inode rstat " << rstat
<< " for " << *this << dendl
;
1987 _inode
->rstat
= rstat
; // take inode summation if replica
1995 nest_info_t accounted_rstat
;
1996 decltype(CDir::dirty_old_rstat
) dirty_old_rstat
;
2000 decode(accounted_rstat
, p
);
2001 decode(dirty_old_rstat
, p
);
2002 dout(10) << __func__
<< " " << fg
<< " [" << fgfirst
<< ",head]" << dendl
;
2003 dout(10) << __func__
<< " " << fg
<< " rstat " << rstat
<< dendl
;
2004 dout(10) << __func__
<< " " << fg
<< " accounted_rstat " << accounted_rstat
<< dendl
;
2005 dout(10) << __func__
<< " " << fg
<< " dirty_old_rstat " << dirty_old_rstat
<< dendl
;
2006 CDir
*dir
= get_dirfrag(fg
);
2008 ceph_assert(dir
); // i am auth; i had better have this dir open
2009 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
2010 << " on " << *dir
<< dendl
;
2011 dir
->first
= fgfirst
;
2012 auto _fnode
= CDir::allocate_fnode(*dir
->get_fnode());
2013 _fnode
->rstat
= rstat
;
2014 _fnode
->accounted_rstat
= accounted_rstat
;
2015 dir
->reset_fnode(std::move(_fnode
));
2016 dir
->dirty_old_rstat
.swap(dirty_old_rstat
);
2017 if (!(rstat
== accounted_rstat
) || !dir
->dirty_old_rstat
.empty()) {
2018 dout(10) << fg
<< " setting nestlock updated flag" << dendl
;
2019 nestlock
.mark_dirty(); // ok bc we're auth and caller will handle
2022 if (dir
&& dir
->is_auth()) {
2023 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
2024 << " on " << *dir
<< dendl
;
2025 dir
->first
= fgfirst
;
2026 const auto& pf
= dir
->get_projected_fnode();
2027 finish_scatter_update(&nestlock
, dir
,
2028 _inode
->rstat
.version
, pf
->accounted_rstat
.version
);
2035 reset_inode(std::move(_inode
));
2038 void CInode::encode_lock_ixattr(bufferlist
& bl
)
2040 ENCODE_START(2, 1, bl
);
2041 encode(get_inode()->version
, bl
);
2042 encode(get_inode()->ctime
, bl
);
2044 encode(get_inode()->xattr_version
, bl
);
2048 void CInode::decode_lock_ixattr(bufferlist::const_iterator
& p
)
2050 ceph_assert(!is_auth());
2051 auto _inode
= allocate_inode(*get_inode());
2053 decode(_inode
->version
, p
);
2056 if (_inode
->ctime
< tm
)
2059 if (struct_v
>= 2) {
2060 decode(_inode
->xattr_version
, p
);
2063 reset_inode(std::move(_inode
));
2066 void CInode::encode_lock_isnap(bufferlist
& bl
)
2068 ENCODE_START(1, 1, bl
);
2069 encode(get_inode()->version
, bl
);
2070 encode(get_inode()->ctime
, bl
);
2075 void CInode::decode_lock_isnap(bufferlist::const_iterator
& p
)
2077 ceph_assert(!is_auth());
2078 auto _inode
= allocate_inode(*get_inode());
2080 decode(_inode
->version
, p
);
2083 if (_inode
->ctime
< tm
) _inode
->ctime
= tm
;
2086 reset_inode(std::move(_inode
));
2089 void CInode::encode_lock_iflock(bufferlist
& bl
)
2091 ENCODE_START(1, 1, bl
);
2092 encode(get_inode()->version
, bl
);
2093 _encode_file_locks(bl
);
2097 void CInode::decode_lock_iflock(bufferlist::const_iterator
& p
)
2099 ceph_assert(!is_auth());
2100 auto _inode
= allocate_inode(*get_inode());
2102 decode(_inode
->version
, p
);
2103 _decode_file_locks(p
);
2105 reset_inode(std::move(_inode
));
2108 void CInode::encode_lock_ipolicy(bufferlist
& bl
)
2110 ENCODE_START(2, 1, bl
);
2112 encode(get_inode()->version
, bl
);
2113 encode(get_inode()->ctime
, bl
);
2114 encode(get_inode()->layout
, bl
, mdcache
->mds
->mdsmap
->get_up_features());
2115 encode(get_inode()->quota
, bl
);
2116 encode(get_inode()->export_pin
, bl
);
2117 encode(get_inode()->export_ephemeral_distributed_pin
, bl
);
2118 encode(get_inode()->export_ephemeral_random_pin
, bl
);
2123 void CInode::decode_lock_ipolicy(bufferlist::const_iterator
& p
)
2125 ceph_assert(!is_auth());
2126 auto _inode
= allocate_inode(*get_inode());
2129 decode(_inode
->version
, p
);
2132 if (_inode
->ctime
< tm
)
2134 decode(_inode
->layout
, p
);
2135 decode(_inode
->quota
, p
);
2136 decode(_inode
->export_pin
, p
);
2137 if (struct_v
>= 2) {
2138 decode(_inode
->export_ephemeral_distributed_pin
, p
);
2139 decode(_inode
->export_ephemeral_random_pin
, p
);
2144 bool pin_updated
= (get_inode()->export_pin
!= _inode
->export_pin
) ||
2145 (get_inode()->export_ephemeral_distributed_pin
!=
2146 _inode
->export_ephemeral_distributed_pin
);
2147 reset_inode(std::move(_inode
));
2148 maybe_export_pin(pin_updated
);
2151 void CInode::encode_lock_state(int type
, bufferlist
& bl
)
2153 ENCODE_START(1, 1, bl
);
2156 encode(parent
->first
, bl
);
2159 case CEPH_LOCK_IAUTH
:
2160 encode_lock_iauth(bl
);
2163 case CEPH_LOCK_ILINK
:
2164 encode_lock_ilink(bl
);
2167 case CEPH_LOCK_IDFT
:
2168 encode_lock_idft(bl
);
2171 case CEPH_LOCK_IFILE
:
2172 encode_lock_ifile(bl
);
2175 case CEPH_LOCK_INEST
:
2176 encode_lock_inest(bl
);
2179 case CEPH_LOCK_IXATTR
:
2180 encode_lock_ixattr(bl
);
2183 case CEPH_LOCK_ISNAP
:
2184 encode_lock_isnap(bl
);
2187 case CEPH_LOCK_IFLOCK
:
2188 encode_lock_iflock(bl
);
2191 case CEPH_LOCK_IPOLICY
:
2192 encode_lock_ipolicy(bl
);
2201 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2203 void CInode::decode_lock_state(int type
, const bufferlist
& bl
)
2205 auto p
= bl
.cbegin();
2212 decode(newfirst
, p
);
2213 if (!is_auth() && newfirst
!= first
) {
2214 dout(10) << __func__
<< " first " << first
<< " -> " << newfirst
<< dendl
;
2218 decode(newfirst
, p
);
2219 if (!parent
->is_auth() && newfirst
!= parent
->first
) {
2220 dout(10) << __func__
<< " parent first " << first
<< " -> " << newfirst
<< dendl
;
2221 parent
->first
= newfirst
;
2226 case CEPH_LOCK_IAUTH
:
2227 decode_lock_iauth(p
);
2230 case CEPH_LOCK_ILINK
:
2231 decode_lock_ilink(p
);
2234 case CEPH_LOCK_IDFT
:
2235 decode_lock_idft(p
);
2238 case CEPH_LOCK_IFILE
:
2239 decode_lock_ifile(p
);
2242 case CEPH_LOCK_INEST
:
2243 decode_lock_inest(p
);
2246 case CEPH_LOCK_IXATTR
:
2247 decode_lock_ixattr(p
);
2250 case CEPH_LOCK_ISNAP
:
2251 decode_lock_isnap(p
);
2254 case CEPH_LOCK_IFLOCK
:
2255 decode_lock_iflock(p
);
2258 case CEPH_LOCK_IPOLICY
:
2259 decode_lock_ipolicy(p
);
2269 bool CInode::is_dirty_scattered()
2272 filelock
.is_dirty_or_flushing() ||
2273 nestlock
.is_dirty_or_flushing() ||
2274 dirfragtreelock
.is_dirty_or_flushing();
2277 void CInode::clear_scatter_dirty()
2279 filelock
.remove_dirty();
2280 nestlock
.remove_dirty();
2281 dirfragtreelock
.remove_dirty();
2284 void CInode::clear_dirty_scattered(int type
)
2286 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2287 ceph_assert(is_dir());
2289 case CEPH_LOCK_IFILE
:
2290 item_dirty_dirfrag_dir
.remove_myself();
2293 case CEPH_LOCK_INEST
:
2294 item_dirty_dirfrag_nest
.remove_myself();
2297 case CEPH_LOCK_IDFT
:
2298 item_dirty_dirfrag_dirfragtree
.remove_myself();
2308 * when we initially scatter a lock, we need to check if any of the dirfrags
2309 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2311 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2312 void CInode::start_scatter(ScatterLock
*lock
)
2314 dout(10) << __func__
<< " " << *lock
<< " on " << *this << dendl
;
2315 ceph_assert(is_auth());
2316 const auto& pi
= get_projected_inode();
2318 for (const auto &p
: dirfrags
) {
2319 frag_t fg
= p
.first
;
2320 CDir
*dir
= p
.second
;
2321 const auto& pf
= dir
->get_projected_fnode();
2322 dout(20) << fg
<< " " << *dir
<< dendl
;
2324 if (!dir
->is_auth())
2327 switch (lock
->get_type()) {
2328 case CEPH_LOCK_IFILE
:
2329 finish_scatter_update(lock
, dir
, pi
->dirstat
.version
, pf
->accounted_fragstat
.version
);
2332 case CEPH_LOCK_INEST
:
2333 finish_scatter_update(lock
, dir
, pi
->rstat
.version
, pf
->accounted_rstat
.version
);
2336 case CEPH_LOCK_IDFT
:
2337 dir
->state_clear(CDir::STATE_DIRTYDFT
);
2344 class C_Inode_FragUpdate
: public MDSLogContextBase
{
2349 MDSRank
*get_mds() override
{return in
->mdcache
->mds
;}
2350 void finish(int r
) override
{
2351 in
->_finish_frag_update(dir
, mut
);
2355 C_Inode_FragUpdate(CInode
*i
, CDir
*d
, MutationRef
& m
) : in(i
), dir(d
), mut(m
) {}
2358 void CInode::finish_scatter_update(ScatterLock
*lock
, CDir
*dir
,
2359 version_t inode_version
, version_t dir_accounted_version
)
2361 frag_t fg
= dir
->get_frag();
2362 ceph_assert(dir
->is_auth());
2364 if (dir
->is_frozen()) {
2365 dout(10) << __func__
<< " " << fg
<< " frozen, marking " << *lock
<< " stale " << *dir
<< dendl
;
2366 } else if (dir
->get_version() == 0) {
2367 dout(10) << __func__
<< " " << fg
<< " not loaded, marking " << *lock
<< " stale " << *dir
<< dendl
;
2369 if (dir_accounted_version
!= inode_version
) {
2370 dout(10) << __func__
<< " " << fg
<< " journaling accounted scatterstat update v" << inode_version
<< dendl
;
2372 MDLog
*mdlog
= mdcache
->mds
->mdlog
;
2373 MutationRef
mut(new MutationImpl());
2374 mut
->ls
= mdlog
->get_current_segment();
2376 auto pf
= dir
->project_fnode(mut
);
2378 std::string_view ename
;
2379 switch (lock
->get_type()) {
2380 case CEPH_LOCK_IFILE
:
2381 pf
->fragstat
.version
= inode_version
;
2382 pf
->accounted_fragstat
= pf
->fragstat
;
2383 ename
= "lock ifile accounted scatter stat update";
2385 case CEPH_LOCK_INEST
:
2386 pf
->rstat
.version
= inode_version
;
2387 pf
->accounted_rstat
= pf
->rstat
;
2388 ename
= "lock inest accounted scatter stat update";
2390 if (!is_auth() && lock
->get_state() == LOCK_MIX
) {
2391 dout(10) << __func__
<< " try to assimilate dirty rstat on "
2393 dir
->assimilate_dirty_rstat_inodes(mut
);
2401 EUpdate
*le
= new EUpdate(mdlog
, ename
);
2402 mdlog
->start_entry(le
);
2403 le
->metablob
.add_dir_context(dir
);
2404 le
->metablob
.add_dir(dir
, true);
2406 ceph_assert(!dir
->is_frozen());
2409 if (lock
->get_type() == CEPH_LOCK_INEST
&&
2410 !is_auth() && lock
->get_state() == LOCK_MIX
) {
2411 dout(10) << __func__
<< " finish assimilating dirty rstat on "
2413 dir
->assimilate_dirty_rstat_inodes_finish(&le
->metablob
);
2415 if (!(pf
->rstat
== pf
->accounted_rstat
)) {
2416 if (!mut
->is_wrlocked(&nestlock
)) {
2417 mdcache
->mds
->locker
->wrlock_force(&nestlock
, mut
);
2420 mdcache
->mds
->locker
->mark_updated_scatterlock(&nestlock
);
2421 mut
->ls
->dirty_dirfrag_nest
.push_back(&item_dirty_dirfrag_nest
);
2425 pf
->version
= dir
->pre_dirty();
2427 mdlog
->submit_entry(le
, new C_Inode_FragUpdate(this, dir
, mut
));
2429 dout(10) << __func__
<< " " << fg
<< " accounted " << *lock
2430 << " scatter stat unchanged at v" << dir_accounted_version
<< dendl
;
2435 void CInode::_finish_frag_update(CDir
*dir
, MutationRef
& mut
)
2437 dout(10) << __func__
<< " on " << *dir
<< dendl
;
2439 mdcache
->mds
->locker
->drop_locks(mut
.get());
2445 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2446 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2447 * because the frag is auth and frozen, or that the replica couldn't for the same
2448 * reason. hopefully it will get updated the next time the lock cycles.
2450 * we have two dimensions of behavior:
2451 * - we may be (auth and !frozen), and able to update, or not.
2452 * - the frag may be stale, or not.
2454 * if the frag is non-stale, we want to assimilate the diff into the
2455 * inode, regardless of whether it's auth or updateable.
2457 * if we update the frag, we want to set accounted_fragstat = frag,
2458 * both if we took the diff or it was stale and we are making it
2461 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2462 void CInode::finish_scatter_gather_update(int type
, MutationRef
& mut
)
2464 LogChannelRef clog
= mdcache
->mds
->clog
;
2466 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2467 ceph_assert(is_auth());
2470 case CEPH_LOCK_IFILE
:
2472 fragtree_t tmpdft
= dirfragtree
;
2473 struct frag_info_t dirstat
;
2474 bool dirstat_valid
= true;
2477 ceph_assert(is_auth());
2478 auto pi
= _get_projected_inode();
2480 bool touched_mtime
= false, touched_chattr
= false;
2481 dout(20) << " orig dirstat " << pi
->dirstat
<< dendl
;
2482 pi
->dirstat
.version
++;
2483 for (const auto &p
: dirfrags
) {
2484 frag_t fg
= p
.first
;
2485 CDir
*dir
= p
.second
;
2486 dout(20) << fg
<< " " << *dir
<< dendl
;
2489 if (dir
->get_version() != 0) {
2490 update
= dir
->is_auth() && !dir
->is_frozen();
2493 dirstat_valid
= false;
2496 CDir::fnode_const_ptr pf
;
2499 pf
= dir
->project_fnode(mut
);
2501 pf
= dir
->get_projected_fnode();
2504 if (pf
->accounted_fragstat
.version
== pi
->dirstat
.version
- 1) {
2505 dout(20) << fg
<< " fragstat " << pf
->fragstat
<< dendl
;
2506 dout(20) << fg
<< " accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
2507 pi
->dirstat
.add_delta(pf
->fragstat
, pf
->accounted_fragstat
, &touched_mtime
, &touched_chattr
);
2509 dout(20) << fg
<< " skipping STALE accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
2512 if (pf
->fragstat
.nfiles
< 0 ||
2513 pf
->fragstat
.nsubdirs
< 0) {
2514 clog
->error() << "bad/negative dir size on "
2515 << dir
->dirfrag() << " " << pf
->fragstat
;
2516 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter
);
2518 auto _pf
= const_cast<fnode_t
*>(pf
.get());
2519 if (pf
->fragstat
.nfiles
< 0)
2520 _pf
->fragstat
.nfiles
= 0;
2521 if (pf
->fragstat
.nsubdirs
< 0)
2522 _pf
->fragstat
.nsubdirs
= 0;
2526 auto _pf
= const_cast<fnode_t
*>(pf
.get());
2527 _pf
->accounted_fragstat
= _pf
->fragstat
;
2528 _pf
->fragstat
.version
= _pf
->accounted_fragstat
.version
= pi
->dirstat
.version
;
2529 _pf
->version
= dir
->pre_dirty();
2530 dout(10) << fg
<< " updated accounted_fragstat " << pf
->fragstat
<< " on " << *dir
<< dendl
;
2533 tmpdft
.force_to_leaf(g_ceph_context
, fg
);
2534 dirstat
.add(pf
->fragstat
);
2537 pi
->mtime
= pi
->ctime
= pi
->dirstat
.mtime
;
2539 pi
->change_attr
= pi
->dirstat
.change_attr
;
2540 dout(20) << " final dirstat " << pi
->dirstat
<< dendl
;
2542 if (dirstat_valid
&& !dirstat
.same_sums(pi
->dirstat
)) {
2544 tmpdft
.get_leaves_under(frag_t(), leaves
);
2545 for (const auto& leaf
: leaves
) {
2546 if (!dirfrags
.count(leaf
)) {
2547 dirstat_valid
= false;
2551 if (dirstat_valid
) {
2552 if (state_test(CInode::STATE_REPAIRSTATS
)) {
2553 dout(20) << " dirstat mismatch, fixing" << dendl
;
2555 clog
->error() << "unmatched fragstat on " << ino() << ", inode has "
2556 << pi
->dirstat
<< ", dirfrags have " << dirstat
;
2557 ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter
);
2559 // trust the dirfrags for now
2560 version_t v
= pi
->dirstat
.version
;
2561 if (pi
->dirstat
.mtime
> dirstat
.mtime
)
2562 dirstat
.mtime
= pi
->dirstat
.mtime
;
2563 if (pi
->dirstat
.change_attr
> dirstat
.change_attr
)
2564 dirstat
.change_attr
= pi
->dirstat
.change_attr
;
2565 pi
->dirstat
= dirstat
;
2566 pi
->dirstat
.version
= v
;
2570 if (pi
->dirstat
.nfiles
< 0 || pi
->dirstat
.nsubdirs
< 0) {
2572 make_path_string(path
);
2573 clog
->error() << "Inconsistent statistics detected: fragstat on inode "
2574 << ino() << " (" << path
<< "), inode has " << pi
->dirstat
;
2575 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter
);
2577 if (pi
->dirstat
.nfiles
< 0)
2578 pi
->dirstat
.nfiles
= 0;
2579 if (pi
->dirstat
.nsubdirs
< 0)
2580 pi
->dirstat
.nsubdirs
= 0;
2585 case CEPH_LOCK_INEST
:
2588 ceph_assert(is_auth());
2590 fragtree_t tmpdft
= dirfragtree
;
2592 bool rstat_valid
= true;
2595 if (const sr_t
*srnode
= get_projected_srnode(); srnode
)
2596 rstat
.rsnaps
= srnode
->snaps
.size();
2598 auto pi
= _get_projected_inode();
2599 dout(20) << " orig rstat " << pi
->rstat
<< dendl
;
2600 pi
->rstat
.version
++;
2601 for (const auto &p
: dirfrags
) {
2602 frag_t fg
= p
.first
;
2603 CDir
*dir
= p
.second
;
2604 dout(20) << fg
<< " " << *dir
<< dendl
;
2607 if (dir
->get_version() != 0) {
2608 update
= dir
->is_auth() && !dir
->is_frozen();
2611 rstat_valid
= false;
2614 CDir::fnode_const_ptr pf
;
2617 pf
= dir
->project_fnode(mut
);
2619 pf
= dir
->get_projected_fnode();
2622 if (pf
->accounted_rstat
.version
== pi
->rstat
.version
-1) {
2623 // only pull this frag's dirty rstat inodes into the frag if
2624 // the frag is non-stale and updateable. if it's stale,
2625 // that info will just get thrown out!
2627 dir
->assimilate_dirty_rstat_inodes(mut
);
2629 dout(20) << fg
<< " rstat " << pf
->rstat
<< dendl
;
2630 dout(20) << fg
<< " accounted_rstat " << pf
->accounted_rstat
<< dendl
;
2631 dout(20) << fg
<< " dirty_old_rstat " << dir
->dirty_old_rstat
<< dendl
;
2632 mdcache
->project_rstat_frag_to_inode(pf
->rstat
, pf
->accounted_rstat
,
2633 dir
->first
, CEPH_NOSNAP
, this, true);
2634 for (auto &p
: dir
->dirty_old_rstat
) {
2635 mdcache
->project_rstat_frag_to_inode(p
.second
.rstat
, p
.second
.accounted_rstat
,
2636 p
.second
.first
, p
.first
, this, true);
2638 if (update
) // dir contents not valid if frozen or non-auth
2639 dir
->check_rstats();
2641 dout(20) << fg
<< " skipping STALE accounted_rstat " << pf
->accounted_rstat
<< dendl
;
2644 auto _pf
= const_cast<fnode_t
*>(pf
.get());
2645 _pf
->accounted_rstat
= pf
->rstat
;
2646 _pf
->rstat
.version
= _pf
->accounted_rstat
.version
= pi
->rstat
.version
;
2647 _pf
->version
= dir
->pre_dirty();
2648 dir
->dirty_old_rstat
.clear();
2649 dir
->check_rstats();
2650 dout(10) << fg
<< " updated accounted_rstat " << pf
->rstat
<< " on " << *dir
<< dendl
;
2653 tmpdft
.force_to_leaf(g_ceph_context
, fg
);
2654 rstat
.add(pf
->rstat
);
2656 dout(20) << " final rstat " << pi
->rstat
<< dendl
;
2658 if (rstat_valid
&& !rstat
.same_sums(pi
->rstat
)) {
2660 tmpdft
.get_leaves_under(frag_t(), leaves
);
2661 for (const auto& leaf
: leaves
) {
2662 if (!dirfrags
.count(leaf
)) {
2663 rstat_valid
= false;
2668 if (state_test(CInode::STATE_REPAIRSTATS
)) {
2669 dout(20) << " rstat mismatch, fixing" << dendl
;
2671 clog
->error() << "inconsistent rstat on inode " << ino()
2672 << ", inode has " << pi
->rstat
2673 << ", directory fragments have " << rstat
;
2674 ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter
);
2676 // trust the dirfrag for now
2677 version_t v
= pi
->rstat
.version
;
2678 if (pi
->rstat
.rctime
> rstat
.rctime
)
2679 rstat
.rctime
= pi
->rstat
.rctime
;
2681 pi
->rstat
.version
= v
;
2685 mdcache
->broadcast_quota_to_client(this);
2689 case CEPH_LOCK_IDFT
:
2697 void CInode::finish_scatter_gather_update_accounted(int type
, EMetaBlob
*metablob
)
2699 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2700 ceph_assert(is_auth());
2702 for (const auto &p
: dirfrags
) {
2703 CDir
*dir
= p
.second
;
2704 if (!dir
->is_auth() || dir
->get_version() == 0 || dir
->is_frozen())
2707 if (type
== CEPH_LOCK_IDFT
)
2708 continue; // nothing to do.
2710 if (type
== CEPH_LOCK_INEST
)
2711 dir
->assimilate_dirty_rstat_inodes_finish(metablob
);
2713 dout(10) << " journaling updated frag accounted_ on " << *dir
<< dendl
;
2714 ceph_assert(dir
->is_projected());
2715 metablob
->add_dir(dir
, true);
2721 bool CInode::is_frozen() const
2723 if (is_frozen_inode()) return true;
2724 if (parent
&& parent
->dir
->is_frozen()) return true;
2728 bool CInode::is_frozen_dir() const
2730 if (parent
&& parent
->dir
->is_frozen_dir()) return true;
2734 bool CInode::is_freezing() const
2736 if (is_freezing_inode()) return true;
2737 if (parent
&& parent
->dir
->is_freezing()) return true;
2741 void CInode::add_dir_waiter(frag_t fg
, MDSContext
*c
)
2743 if (waiting_on_dir
.empty())
2745 waiting_on_dir
[fg
].push_back(c
);
2746 dout(10) << __func__
<< " frag " << fg
<< " " << c
<< " on " << *this << dendl
;
2749 void CInode::take_dir_waiting(frag_t fg
, MDSContext::vec
& ls
)
2751 if (waiting_on_dir
.empty())
2754 auto it
= waiting_on_dir
.find(fg
);
2755 if (it
!= waiting_on_dir
.end()) {
2756 dout(10) << __func__
<< " frag " << fg
<< " on " << *this << dendl
;
2757 auto& waiting
= it
->second
;
2758 ls
.insert(ls
.end(), waiting
.begin(), waiting
.end());
2759 waiting_on_dir
.erase(it
);
2761 if (waiting_on_dir
.empty())
2766 void CInode::add_waiter(uint64_t tag
, MDSContext
*c
)
2768 dout(10) << __func__
<< " tag " << std::hex
<< tag
<< std::dec
<< " " << c
2769 << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH
)
2770 << " !frozen " << !is_frozen_inode()
2771 << " !freezing " << !is_freezing_inode()
2773 // wait on the directory?
2774 // make sure its not the inode that is explicitly ambiguous|freezing|frozen
2775 if (((tag
& WAIT_SINGLEAUTH
) && !state_test(STATE_AMBIGUOUSAUTH
)) ||
2776 ((tag
& WAIT_UNFREEZE
) &&
2777 !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
2778 dout(15) << "passing waiter up tree" << dendl
;
2779 parent
->dir
->add_waiter(tag
, c
);
2782 dout(15) << "taking waiter here" << dendl
;
2783 MDSCacheObject::add_waiter(tag
, c
);
2786 void CInode::take_waiting(uint64_t mask
, MDSContext::vec
& ls
)
2788 if ((mask
& WAIT_DIR
) && !waiting_on_dir
.empty()) {
2789 // take all dentry waiters
2790 while (!waiting_on_dir
.empty()) {
2791 auto it
= waiting_on_dir
.begin();
2792 dout(10) << __func__
<< " dirfrag " << it
->first
<< " on " << *this << dendl
;
2793 auto& waiting
= it
->second
;
2794 ls
.insert(ls
.end(), waiting
.begin(), waiting
.end());
2795 waiting_on_dir
.erase(it
);
2801 MDSCacheObject::take_waiting(mask
, ls
);
2804 void CInode::maybe_finish_freeze_inode()
2806 CDir
*dir
= get_parent_dir();
2807 if (auth_pins
> auth_pin_freeze_allowance
|| dir
->frozen_inode_suppressed
)
2810 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl
;
2811 ceph_assert(auth_pins
== auth_pin_freeze_allowance
);
2814 state_clear(STATE_FREEZING
);
2815 state_set(STATE_FROZEN
);
2817 item_freezing_inode
.remove_myself();
2818 dir
->num_frozen_inodes
++;
2820 finish_waiting(WAIT_FROZEN
);
2823 bool CInode::freeze_inode(int auth_pin_allowance
)
2825 CDir
*dir
= get_parent_dir();
2828 ceph_assert(auth_pin_allowance
> 0); // otherwise we need to adjust parent's nested_auth_pins
2829 ceph_assert(auth_pins
>= auth_pin_allowance
);
2830 if (auth_pins
== auth_pin_allowance
&& !dir
->frozen_inode_suppressed
) {
2831 dout(10) << "freeze_inode - frozen" << dendl
;
2832 if (!state_test(STATE_FROZEN
)) {
2834 state_set(STATE_FROZEN
);
2835 dir
->num_frozen_inodes
++;
2840 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance
<< dendl
;
2841 auth_pin_freeze_allowance
= auth_pin_allowance
;
2842 dir
->freezing_inodes
.push_back(&item_freezing_inode
);
2845 state_set(STATE_FREEZING
);
2847 if (!dir
->lock_caches_with_auth_pins
.empty())
2848 mdcache
->mds
->locker
->invalidate_lock_caches(dir
);
2850 const static int lock_types
[] = {
2851 CEPH_LOCK_IVERSION
, CEPH_LOCK_IFILE
, CEPH_LOCK_IAUTH
, CEPH_LOCK_ILINK
, CEPH_LOCK_IDFT
,
2852 CEPH_LOCK_IXATTR
, CEPH_LOCK_ISNAP
, CEPH_LOCK_INEST
, CEPH_LOCK_IFLOCK
, CEPH_LOCK_IPOLICY
, 0
2854 for (int i
= 0; lock_types
[i
]; ++i
) {
2855 auto lock
= get_lock(lock_types
[i
]);
2856 if (lock
->is_cached())
2857 mdcache
->mds
->locker
->invalidate_lock_caches(lock
);
2859 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2860 // and finish freezing the inode
2861 return state_test(STATE_FROZEN
);
2864 void CInode::unfreeze_inode(MDSContext::vec
& finished
)
2866 dout(10) << __func__
<< dendl
;
2867 if (state_test(STATE_FREEZING
)) {
2868 state_clear(STATE_FREEZING
);
2870 item_freezing_inode
.remove_myself();
2871 } else if (state_test(STATE_FROZEN
)) {
2872 state_clear(STATE_FROZEN
);
2874 get_parent_dir()->num_frozen_inodes
--;
2877 take_waiting(WAIT_UNFREEZE
, finished
);
2880 void CInode::unfreeze_inode()
2882 MDSContext::vec finished
;
2883 unfreeze_inode(finished
);
2884 mdcache
->mds
->queue_waiters(finished
);
2887 void CInode::freeze_auth_pin()
2889 ceph_assert(state_test(CInode::STATE_FROZEN
));
2890 state_set(CInode::STATE_FROZENAUTHPIN
);
2891 get_parent_dir()->num_frozen_inodes
++;
2894 void CInode::unfreeze_auth_pin()
2896 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN
));
2897 state_clear(CInode::STATE_FROZENAUTHPIN
);
2898 get_parent_dir()->num_frozen_inodes
--;
2899 if (!state_test(STATE_FREEZING
|STATE_FROZEN
)) {
2900 MDSContext::vec finished
;
2901 take_waiting(WAIT_UNFREEZE
, finished
);
2902 mdcache
->mds
->queue_waiters(finished
);
2906 void CInode::clear_ambiguous_auth(MDSContext::vec
& finished
)
2908 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH
));
2909 state_clear(CInode::STATE_AMBIGUOUSAUTH
);
2910 take_waiting(CInode::WAIT_SINGLEAUTH
, finished
);
2913 void CInode::clear_ambiguous_auth()
2915 MDSContext::vec finished
;
2916 clear_ambiguous_auth(finished
);
2917 mdcache
->mds
->queue_waiters(finished
);
2921 bool CInode::can_auth_pin(int *err_ret
) const {
2925 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2926 err
= ERR_EXPORTING_INODE
;
2929 return parent
->can_auth_pin(err_ret
);
2937 void CInode::auth_pin(void *by
)
2943 #ifdef MDS_AUTHPIN_SET
2944 auth_pin_set
.insert(by
);
2947 dout(10) << "auth_pin by " << by
<< " on " << *this << " now " << auth_pins
<< dendl
;
2950 parent
->adjust_nested_auth_pins(1, this);
2953 void CInode::auth_unpin(void *by
)
2957 #ifdef MDS_AUTHPIN_SET
2959 auto it
= auth_pin_set
.find(by
);
2960 ceph_assert(it
!= auth_pin_set
.end());
2961 auth_pin_set
.erase(it
);
2968 dout(10) << "auth_unpin by " << by
<< " on " << *this << " now " << auth_pins
<< dendl
;
2970 ceph_assert(auth_pins
>= 0);
2973 parent
->adjust_nested_auth_pins(-1, by
);
2975 if (is_freezing_inode())
2976 maybe_finish_freeze_inode();
2981 mds_authority_t
CInode::authority() const
2983 if (inode_auth
.first
>= 0)
2987 return parent
->dir
->authority();
2989 // new items that are not yet linked in (in the committed plane) belong
2990 // to their first parent.
2991 if (!projected_parent
.empty())
2992 return projected_parent
.front()->dir
->authority();
2994 return CDIR_AUTH_UNDEF
;
3000 snapid_t
CInode::get_oldest_snap()
3003 if (is_any_old_inodes())
3004 t
= get_old_inodes()->begin()->second
.first
;
3005 return std::min(t
, oldest_snap
);
3008 const CInode::mempool_old_inode
& CInode::cow_old_inode(snapid_t follows
, bool cow_head
)
3010 ceph_assert(follows
>= first
);
3012 const auto& pi
= cow_head
? get_projected_inode() : get_previous_projected_inode();
3013 const auto& px
= cow_head
? get_projected_xattrs() : get_previous_projected_xattrs();
3015 auto _old_inodes
= allocate_old_inode_map();
3017 *_old_inodes
= *old_inodes
;
3019 mempool_old_inode
&old
= (*_old_inodes
)[follows
];
3023 dout(10) << " " << px
->size() << " xattrs cowed, " << *px
<< dendl
;
3027 if (first
< oldest_snap
)
3028 oldest_snap
= first
;
3030 old
.inode
.trim_client_ranges(follows
);
3032 if (g_conf()->mds_snap_rstat
&&
3033 !(old
.inode
.rstat
== old
.inode
.accounted_rstat
))
3034 dirty_old_rstats
.insert(follows
);
3038 dout(10) << __func__
<< " " << (cow_head
? "head" : "previous_head" )
3039 << " to [" << old
.first
<< "," << follows
<< "] on "
3042 reset_old_inodes(std::move(_old_inodes
));
3046 void CInode::pre_cow_old_inode()
3048 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
3049 if (first
<= follows
)
3050 cow_old_inode(follows
, true);
3053 bool CInode::has_snap_data(snapid_t snapid
)
3055 bool found
= snapid
>= first
&& snapid
<= last
;
3056 if (!found
&& is_any_old_inodes()) {
3057 auto p
= old_inodes
->lower_bound(snapid
);
3058 if (p
!= old_inodes
->end()) {
3059 if (p
->second
.first
> snapid
) {
3060 if (p
!= old_inodes
->begin())
3063 if (p
->second
.first
<= snapid
&& snapid
<= p
->first
) {
3071 void CInode::purge_stale_snap_data(const set
<snapid_t
>& snaps
)
3073 dout(10) << __func__
<< " " << snaps
<< dendl
;
3075 if (!get_old_inodes())
3078 std::vector
<snapid_t
> to_remove
;
3079 for (auto p
: *get_old_inodes()) {
3080 const snapid_t
&id
= p
.first
;
3081 const auto &s
= snaps
.lower_bound(p
.second
.first
);
3082 if (s
== snaps
.end() || *s
> id
) {
3083 dout(10) << " purging old_inode [" << p
.second
.first
<< "," << id
<< "]" << dendl
;
3084 to_remove
.push_back(id
);
3088 if (to_remove
.size() == get_old_inodes()->size()) {
3089 reset_old_inodes(old_inode_map_ptr());
3090 } else if (!to_remove
.empty()) {
3091 auto _old_inodes
= allocate_old_inode_map(*get_old_inodes());
3092 for (auto id
: to_remove
)
3093 _old_inodes
->erase(id
);
3094 reset_old_inodes(std::move(_old_inodes
));
3099 * pick/create an old_inode
3101 snapid_t
CInode::pick_old_inode(snapid_t snap
) const
3103 if (is_any_old_inodes()) {
3104 auto it
= old_inodes
->lower_bound(snap
); // p is first key >= to snap
3105 if (it
!= old_inodes
->end() && it
->second
.first
<= snap
) {
3106 dout(10) << __func__
<< " snap " << snap
<< " -> [" << it
->second
.first
<< "," << it
->first
<< "]" << dendl
;
3110 dout(10) << __func__
<< " snap " << snap
<< " -> nothing" << dendl
;
3114 void CInode::open_snaprealm(bool nosplit
)
3117 SnapRealm
*parent
= find_snaprealm();
3118 snaprealm
= new SnapRealm(mdcache
, this);
3120 dout(10) << __func__
<< " " << snaprealm
3121 << " parent is " << parent
3123 dout(30) << " siblings are " << parent
->open_children
<< dendl
;
3124 snaprealm
->parent
= parent
;
3126 parent
->split_at(snaprealm
);
3127 parent
->open_children
.insert(snaprealm
);
3131 void CInode::close_snaprealm(bool nojoin
)
3134 dout(15) << __func__
<< " " << *snaprealm
<< dendl
;
3135 if (snaprealm
->parent
) {
3136 snaprealm
->parent
->open_children
.erase(snaprealm
);
3138 //snaprealm->parent->join(snaprealm);
3145 SnapRealm
*CInode::find_snaprealm() const
3147 const CInode
*cur
= this;
3148 while (!cur
->snaprealm
) {
3149 const CDentry
*pdn
= cur
->get_oldest_parent_dn();
3152 cur
= pdn
->get_dir()->get_inode();
3154 return cur
->snaprealm
;
3157 void CInode::encode_snap_blob(bufferlist
&snapbl
)
3161 encode(snaprealm
->srnode
, snapbl
);
3162 dout(20) << __func__
<< " " << *snaprealm
<< dendl
;
3165 void CInode::decode_snap_blob(const bufferlist
& snapbl
)
3168 if (snapbl
.length()) {
3170 auto old_flags
= snaprealm
->srnode
.flags
;
3171 auto p
= snapbl
.cbegin();
3172 decode(snaprealm
->srnode
, p
);
3174 if ((snaprealm
->srnode
.flags
^ old_flags
) & sr_t::PARENT_GLOBAL
) {
3175 snaprealm
->adjust_parent();
3178 dout(20) << __func__
<< " " << *snaprealm
<< dendl
;
3179 } else if (snaprealm
&&
3180 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
3181 ceph_assert(mdcache
->mds
->is_any_replay());
3182 snaprealm
->merge_to(NULL
);
3186 void CInode::encode_snap(bufferlist
& bl
)
3188 ENCODE_START(1, 1, bl
);
3190 encode_snap_blob(snapbl
);
3192 encode(oldest_snap
, bl
);
3196 void CInode::decode_snap(bufferlist::const_iterator
& p
)
3201 decode(oldest_snap
, p
);
3202 decode_snap_blob(snapbl
);
3206 // =============================================
3208 client_t
CInode::calc_ideal_loner()
3210 if (mdcache
->is_readonly())
3212 if (!get_mds_caps_wanted().empty())
3216 client_t loner
= -1;
3217 for (const auto &p
: client_caps
) {
3218 if (!p
.second
.is_stale() &&
3220 !has_subtree_or_exporting_dirfrag() :
3221 (p
.second
.wanted() & (CEPH_CAP_ANY_WR
|CEPH_CAP_FILE_RD
)))) {
3231 bool CInode::choose_ideal_loner()
3233 want_loner_cap
= calc_ideal_loner();
3234 int changed
= false;
3235 if (loner_cap
>= 0 && loner_cap
!= want_loner_cap
) {
3236 if (!try_drop_loner())
3241 if (want_loner_cap
>= 0) {
3242 if (loner_cap
< 0) {
3243 set_loner_cap(want_loner_cap
);
3246 ceph_assert(loner_cap
== want_loner_cap
);
3251 bool CInode::try_set_loner()
3253 ceph_assert(want_loner_cap
>= 0);
3254 if (loner_cap
>= 0 && loner_cap
!= want_loner_cap
)
3256 set_loner_cap(want_loner_cap
);
3260 void CInode::set_loner_cap(client_t l
)
3263 authlock
.set_excl_client(loner_cap
);
3264 filelock
.set_excl_client(loner_cap
);
3265 linklock
.set_excl_client(loner_cap
);
3266 xattrlock
.set_excl_client(loner_cap
);
3269 bool CInode::try_drop_loner()
3274 int other_allowed
= get_caps_allowed_by_type(CAP_ANY
);
3275 Capability
*cap
= get_client_cap(loner_cap
);
3277 (cap
->issued() & ~other_allowed
) == 0) {
3285 // choose new lock state during recovery, based on issued caps
3286 void CInode::choose_lock_state(SimpleLock
*lock
, int allissued
)
3288 int shift
= lock
->get_cap_shift();
3289 int issued
= (allissued
>> shift
) & lock
->get_cap_mask();
3291 if (lock
->is_xlocked()) {
3293 } else if (lock
->get_state() != LOCK_MIX
) {
3294 if (issued
& (CEPH_CAP_GEXCL
| CEPH_CAP_GBUFFER
))
3295 lock
->set_state(LOCK_EXCL
);
3296 else if (issued
& CEPH_CAP_GWR
) {
3297 if (issued
& (CEPH_CAP_GCACHE
| CEPH_CAP_GSHARED
))
3298 lock
->set_state(LOCK_EXCL
);
3300 lock
->set_state(LOCK_MIX
);
3301 } else if (lock
->is_dirty()) {
3302 if (is_replicated())
3303 lock
->set_state(LOCK_MIX
);
3305 lock
->set_state(LOCK_LOCK
);
3307 lock
->set_state(LOCK_SYNC
);
3310 // our states have already been chosen during rejoin.
3311 if (lock
->is_xlocked())
3312 ceph_assert(lock
->get_state() == LOCK_LOCK
);
3316 void CInode::choose_lock_states(int dirty_caps
)
3318 int issued
= get_caps_issued() | dirty_caps
;
3319 if (is_auth() && (issued
& (CEPH_CAP_ANY_EXCL
|CEPH_CAP_ANY_WR
)))
3320 choose_ideal_loner();
3321 choose_lock_state(&filelock
, issued
);
3322 choose_lock_state(&nestlock
, issued
);
3323 choose_lock_state(&dirfragtreelock
, issued
);
3324 choose_lock_state(&authlock
, issued
);
3325 choose_lock_state(&xattrlock
, issued
);
3326 choose_lock_state(&linklock
, issued
);
3329 int CInode::count_nonstale_caps()
3332 for (const auto &p
: client_caps
) {
3333 if (!p
.second
.is_stale())
3339 bool CInode::multiple_nonstale_caps()
3342 for (const auto &p
: client_caps
) {
3343 if (!p
.second
.is_stale()) {
3352 void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map
<int32_t,int32_t>& m
)
3354 bool old_empty
= mds_caps_wanted
.empty();
3355 mds_caps_wanted
.swap(m
);
3356 if (old_empty
!= (bool)mds_caps_wanted
.empty()) {
3358 adjust_num_caps_notable(1);
3360 adjust_num_caps_notable(-1);
3364 void CInode::set_mds_caps_wanted(mds_rank_t mds
, int32_t wanted
)
3366 bool old_empty
= mds_caps_wanted
.empty();
3368 mds_caps_wanted
[mds
] = wanted
;
3370 adjust_num_caps_notable(1);
3371 } else if (!old_empty
) {
3372 mds_caps_wanted
.erase(mds
);
3373 if (mds_caps_wanted
.empty())
3374 adjust_num_caps_notable(-1);
3378 Capability
*CInode::add_client_cap(client_t client
, Session
*session
,
3379 SnapRealm
*conrealm
, bool new_inode
)
3381 ceph_assert(last
== CEPH_NOSNAP
);
3382 if (client_caps
.empty()) {
3385 containing_realm
= conrealm
;
3387 containing_realm
= find_snaprealm();
3388 containing_realm
->inodes_with_caps
.push_back(&item_caps
);
3389 dout(10) << __func__
<< " first cap, joining realm " << *containing_realm
<< dendl
;
3391 mdcache
->num_inodes_with_caps
++;
3393 parent
->dir
->adjust_num_inodes_with_caps(1);
3396 uint64_t cap_id
= new_inode
? 1 : ++mdcache
->last_cap_id
;
3397 auto ret
= client_caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
),
3398 std::forward_as_tuple(this, session
, cap_id
));
3399 ceph_assert(ret
.second
== true);
3400 Capability
*cap
= &ret
.first
->second
;
3402 cap
->client_follows
= first
-1;
3403 containing_realm
->add_cap(client
, cap
);
3408 void CInode::remove_client_cap(client_t client
)
3410 auto it
= client_caps
.find(client
);
3411 ceph_assert(it
!= client_caps
.end());
3412 Capability
*cap
= &it
->second
;
3414 cap
->item_session_caps
.remove_myself();
3415 cap
->item_revoking_caps
.remove_myself();
3416 cap
->item_client_revoking_caps
.remove_myself();
3417 containing_realm
->remove_cap(client
, cap
);
3419 if (client
== loner_cap
)
3422 if (cap
->is_wanted_notable())
3423 adjust_num_caps_notable(-1);
3425 client_caps
.erase(it
);
3426 if (client_caps
.empty()) {
3427 dout(10) << __func__
<< " last cap, leaving realm " << *containing_realm
<< dendl
;
3429 item_caps
.remove_myself();
3430 containing_realm
= NULL
;
3431 mdcache
->num_inodes_with_caps
--;
3433 parent
->dir
->adjust_num_inodes_with_caps(-1);
3436 //clean up advisory locks
3437 bool fcntl_removed
= fcntl_locks
? fcntl_locks
->remove_all_from(client
) : false;
3438 bool flock_removed
= flock_locks
? flock_locks
->remove_all_from(client
) : false;
3439 if (fcntl_removed
|| flock_removed
) {
3440 MDSContext::vec waiters
;
3441 take_waiting(CInode::WAIT_FLOCK
, waiters
);
3442 mdcache
->mds
->queue_waiters(waiters
);
3446 void CInode::move_to_realm(SnapRealm
*realm
)
3448 dout(10) << __func__
<< " joining realm " << *realm
3449 << ", leaving realm " << *containing_realm
<< dendl
;
3450 for (auto& p
: client_caps
) {
3451 containing_realm
->remove_cap(p
.first
, &p
.second
);
3452 realm
->add_cap(p
.first
, &p
.second
);
3454 item_caps
.remove_myself();
3455 realm
->inodes_with_caps
.push_back(&item_caps
);
3456 containing_realm
= realm
;
3459 Capability
*CInode::reconnect_cap(client_t client
, const cap_reconnect_t
& icr
, Session
*session
)
3461 Capability
*cap
= get_client_cap(client
);
3464 cap
->merge(icr
.capinfo
.wanted
, icr
.capinfo
.issued
);
3466 cap
= add_client_cap(client
, session
);
3467 cap
->set_cap_id(icr
.capinfo
.cap_id
);
3468 cap
->set_wanted(icr
.capinfo
.wanted
);
3469 cap
->issue_norevoke(icr
.capinfo
.issued
);
3472 cap
->set_last_issue_stamp(ceph_clock_now());
3476 void CInode::clear_client_caps_after_export()
3478 while (!client_caps
.empty())
3479 remove_client_cap(client_caps
.begin()->first
);
3481 want_loner_cap
= -1;
3482 if (!get_mds_caps_wanted().empty()) {
3483 mempool::mds_co::compact_map
<int32_t,int32_t> empty
;
3484 set_mds_caps_wanted(empty
);
3488 void CInode::export_client_caps(map
<client_t
,Capability::Export
>& cl
)
3490 for (const auto &p
: client_caps
) {
3491 cl
[p
.first
] = p
.second
.make_export();
3496 int CInode::get_caps_liked() const
3499 return CEPH_CAP_PIN
| CEPH_CAP_ANY_EXCL
| CEPH_CAP_ANY_SHARED
; // but not, say, FILE_RD|WR|WRBUFFER
3501 return CEPH_CAP_ANY
& ~CEPH_CAP_FILE_LAZYIO
;
3504 int CInode::get_caps_allowed_ever() const
3508 allowed
= CEPH_CAP_PIN
| CEPH_CAP_ANY_EXCL
| CEPH_CAP_ANY_SHARED
;
3510 allowed
= CEPH_CAP_ANY
;
3513 (filelock
.gcaps_allowed_ever() << filelock
.get_cap_shift()) |
3514 (authlock
.gcaps_allowed_ever() << authlock
.get_cap_shift()) |
3515 (xattrlock
.gcaps_allowed_ever() << xattrlock
.get_cap_shift()) |
3516 (linklock
.gcaps_allowed_ever() << linklock
.get_cap_shift()));
3519 int CInode::get_caps_allowed_by_type(int type
) const
3523 (filelock
.gcaps_allowed(type
) << filelock
.get_cap_shift()) |
3524 (authlock
.gcaps_allowed(type
) << authlock
.get_cap_shift()) |
3525 (xattrlock
.gcaps_allowed(type
) << xattrlock
.get_cap_shift()) |
3526 (linklock
.gcaps_allowed(type
) << linklock
.get_cap_shift());
3529 int CInode::get_caps_careful() const
3532 (filelock
.gcaps_careful() << filelock
.get_cap_shift()) |
3533 (authlock
.gcaps_careful() << authlock
.get_cap_shift()) |
3534 (xattrlock
.gcaps_careful() << xattrlock
.get_cap_shift()) |
3535 (linklock
.gcaps_careful() << linklock
.get_cap_shift());
3538 int CInode::get_xlocker_mask(client_t client
) const
3541 (filelock
.gcaps_xlocker_mask(client
) << filelock
.get_cap_shift()) |
3542 (authlock
.gcaps_xlocker_mask(client
) << authlock
.get_cap_shift()) |
3543 (xattrlock
.gcaps_xlocker_mask(client
) << xattrlock
.get_cap_shift()) |
3544 (linklock
.gcaps_xlocker_mask(client
) << linklock
.get_cap_shift());
3547 int CInode::get_caps_allowed_for_client(Session
*session
, Capability
*cap
,
3548 const mempool_inode
*file_i
) const
3550 client_t client
= session
->get_client();
3552 if (client
== get_loner()) {
3553 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3555 get_caps_allowed_by_type(CAP_LONER
) |
3556 (get_caps_allowed_by_type(CAP_XLOCKER
) & get_xlocker_mask(client
));
3558 allowed
= get_caps_allowed_by_type(CAP_ANY
);
3562 allowed
&= ~CEPH_CAP_ANY_DIR_OPS
;
3563 if (cap
&& (allowed
& CEPH_CAP_FILE_EXCL
))
3564 allowed
|= cap
->get_lock_cache_allowed();
3566 if (file_i
->inline_data
.version
== CEPH_INLINE_NONE
&&
3567 file_i
->layout
.pool_ns
.empty()) {
3570 if ((file_i
->inline_data
.version
!= CEPH_INLINE_NONE
&&
3571 cap
->is_noinline()) ||
3572 (!file_i
->layout
.pool_ns
.empty() &&
3573 cap
->is_nopoolns()))
3574 allowed
&= ~(CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
);
3576 auto& conn
= session
->get_connection();
3577 if ((file_i
->inline_data
.version
!= CEPH_INLINE_NONE
&&
3578 !conn
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) ||
3579 (!file_i
->layout
.pool_ns
.empty() &&
3580 !conn
->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
)))
3581 allowed
&= ~(CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
);
3587 // caps issued, wanted
3588 int CInode::get_caps_issued(int *ploner
, int *pother
, int *pxlocker
,
3589 int shift
, int mask
)
3592 int loner
= 0, other
= 0, xlocker
= 0;
3597 for (const auto &p
: client_caps
) {
3598 int i
= p
.second
.issued();
3600 if (p
.first
== loner_cap
)
3604 xlocker
|= get_xlocker_mask(p
.first
) & i
;
3606 if (ploner
) *ploner
= (loner
>> shift
) & mask
;
3607 if (pother
) *pother
= (other
>> shift
) & mask
;
3608 if (pxlocker
) *pxlocker
= (xlocker
>> shift
) & mask
;
3609 return (c
>> shift
) & mask
;
3612 bool CInode::is_any_caps_wanted() const
3614 for (const auto &p
: client_caps
) {
3615 if (p
.second
.wanted())
3621 int CInode::get_caps_wanted(int *ploner
, int *pother
, int shift
, int mask
) const
3624 int loner
= 0, other
= 0;
3625 for (const auto &p
: client_caps
) {
3626 if (!p
.second
.is_stale()) {
3627 int t
= p
.second
.wanted();
3629 if (p
.first
== loner_cap
)
3634 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3637 for (const auto &p
: mds_caps_wanted
) {
3640 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3642 if (ploner
) *ploner
= (loner
>> shift
) & mask
;
3643 if (pother
) *pother
= (other
>> shift
) & mask
;
3644 return (w
>> shift
) & mask
;
3647 bool CInode::issued_caps_need_gather(SimpleLock
*lock
)
3649 int loner_issued
, other_issued
, xlocker_issued
;
3650 get_caps_issued(&loner_issued
, &other_issued
, &xlocker_issued
,
3651 lock
->get_cap_shift(), lock
->get_cap_mask());
3652 if ((loner_issued
& ~lock
->gcaps_allowed(CAP_LONER
)) ||
3653 (other_issued
& ~lock
->gcaps_allowed(CAP_ANY
)) ||
3654 (xlocker_issued
& ~lock
->gcaps_allowed(CAP_XLOCKER
)))
3659 void CInode::adjust_num_caps_notable(int d
)
3661 if (!is_clientwriteable()) {
3662 if (!num_caps_notable
&& d
> 0)
3663 mdcache
->open_file_table
.add_inode(this);
3664 else if (num_caps_notable
> 0 && num_caps_notable
== -d
)
3665 mdcache
->open_file_table
.remove_inode(this);
3668 num_caps_notable
+=d
;
3669 ceph_assert(num_caps_notable
>= 0);
3672 void CInode::mark_clientwriteable()
3674 if (last
!= CEPH_NOSNAP
)
3676 if (!state_test(STATE_CLIENTWRITEABLE
)) {
3677 if (num_caps_notable
== 0)
3678 mdcache
->open_file_table
.add_inode(this);
3679 state_set(STATE_CLIENTWRITEABLE
);
3683 void CInode::clear_clientwriteable()
3685 if (state_test(STATE_CLIENTWRITEABLE
)) {
3686 if (num_caps_notable
== 0)
3687 mdcache
->open_file_table
.remove_inode(this);
3688 state_clear(STATE_CLIENTWRITEABLE
);
3692 // =============================================
3694 int CInode::encode_inodestat(bufferlist
& bl
, Session
*session
,
3695 SnapRealm
*dir_realm
,
3700 client_t client
= session
->get_client();
3701 ceph_assert(snapid
);
3706 const mempool_inode
*oi
= get_inode().get();
3707 const mempool_inode
*pi
= get_projected_inode().get();
3709 const mempool_xattr_map
*pxattrs
= nullptr;
3711 if (snapid
!= CEPH_NOSNAP
) {
3713 // for now at least, old_inodes is only defined/valid on the auth
3717 if (is_any_old_inodes()) {
3718 auto it
= old_inodes
->lower_bound(snapid
);
3719 if (it
!= old_inodes
->end()) {
3720 if (it
->second
.first
> snapid
) {
3721 if (it
!= old_inodes
->begin())
3724 if (it
->second
.first
<= snapid
&& snapid
<= it
->first
) {
3725 dout(15) << __func__
<< " snapid " << snapid
3726 << " to old_inode [" << it
->second
.first
<< "," << it
->first
<< "]"
3727 << " " << it
->second
.inode
.rstat
3729 pi
= oi
= &it
->second
.inode
;
3730 pxattrs
= &it
->second
.xattrs
;
3732 // snapshoted remote dentry can result this
3733 dout(0) << __func__
<< " old_inode for snapid " << snapid
3734 << " not found" << dendl
;
3737 } else if (snapid
< first
|| snapid
> last
) {
3738 // snapshoted remote dentry can result this
3739 dout(0) << __func__
<< " [" << first
<< "," << last
<< "]"
3740 << " not match snapid " << snapid
<< dendl
;
3745 std::map
<std::string
, std::string
> snap_metadata
;
3746 SnapRealm
*realm
= find_snaprealm();
3747 if (snapid
!= CEPH_NOSNAP
&& realm
) {
3748 // add snapshot timestamp vxattr
3749 map
<snapid_t
,const SnapInfo
*> infomap
;
3750 realm
->get_snap_info(infomap
,
3753 if (!infomap
.empty()) {
3754 ceph_assert(infomap
.size() == 1);
3755 const SnapInfo
*si
= infomap
.begin()->second
;
3756 snap_btime
= si
->stamp
;
3757 snap_metadata
= si
->metadata
;
3762 bool no_caps
= !valid
||
3763 session
->is_stale() ||
3764 (dir_realm
&& realm
!= dir_realm
) ||
3766 state_test(CInode::STATE_EXPORTINGCAPS
);
3768 dout(20) << __func__
<< " no caps"
3769 << (!valid
?", !valid":"")
3770 << (session
->is_stale()?", session stale ":"")
3771 << ((dir_realm
&& realm
!= dir_realm
)?", snaprealm differs ":"")
3772 << (is_frozen()?", frozen inode":"")
3773 << (state_test(CInode::STATE_EXPORTINGCAPS
)?", exporting caps":"")
3777 // "fake" a version that is old (stable) version, +1 if projected.
3778 version_t version
= (oi
->version
* 2) + is_projected();
3780 Capability
*cap
= get_client_cap(client
);
3781 bool pfile
= filelock
.is_xlocked_by_client(client
) || get_loner() == client
;
3782 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3783 bool pauth
= authlock
.is_xlocked_by_client(client
) || get_loner() == client
;
3784 bool plink
= linklock
.is_xlocked_by_client(client
) || get_loner() == client
;
3785 bool pxattr
= xattrlock
.is_xlocked_by_client(client
) || get_loner() == client
;
3787 bool plocal
= versionlock
.get_last_wrlock_client() == client
;
3788 bool ppolicy
= policylock
.is_xlocked_by_client(client
) || get_loner()==client
;
3790 const mempool_inode
*any_i
= (pfile
|pauth
|plink
|pxattr
|plocal
) ? pi
: oi
;
3792 dout(20) << " pfile " << pfile
<< " pauth " << pauth
3793 << " plink " << plink
<< " pxattr " << pxattr
3794 << " plocal " << plocal
3795 << " ctime " << any_i
->ctime
3796 << " valid=" << valid
<< dendl
;
3799 const mempool_inode
*file_i
= pfile
? pi
:oi
;
3800 file_layout_t layout
;
3802 layout
= (ppolicy
? pi
: oi
)->layout
;
3804 layout
= file_i
->layout
;
3807 // max_size is min of projected, actual
3809 std::min(oi
->get_client_range(client
),
3810 pi
->get_client_range(client
));
3813 version_t inline_version
= 0;
3814 bufferlist inline_data
;
3815 if (file_i
->inline_data
.version
== CEPH_INLINE_NONE
) {
3816 inline_version
= CEPH_INLINE_NONE
;
3817 } else if ((!cap
&& !no_caps
) ||
3818 (cap
&& cap
->client_inline_version
< file_i
->inline_data
.version
) ||
3819 (getattr_caps
& CEPH_CAP_FILE_RD
)) { // client requests inline data
3820 inline_version
= file_i
->inline_data
.version
;
3821 if (file_i
->inline_data
.length() > 0)
3822 file_i
->inline_data
.get_data(inline_data
);
3825 // nest (do same as file... :/)
3827 cap
->last_rbytes
= file_i
->rstat
.rbytes
;
3828 cap
->last_rsize
= file_i
->rstat
.rsize();
3832 const mempool_inode
*auth_i
= pauth
? pi
:oi
;
3835 const mempool_inode
*link_i
= plink
? pi
:oi
;
3838 const mempool_inode
*xattr_i
= pxattr
? pi
:oi
;
3842 version_t xattr_version
;
3843 if ((!cap
&& !no_caps
) ||
3844 (cap
&& cap
->client_xattr_version
< xattr_i
->xattr_version
) ||
3845 (getattr_caps
& CEPH_CAP_XATTR_SHARED
)) { // client requests xattrs
3847 pxattrs
= pxattr
? get_projected_xattrs().get() : get_xattrs().get();
3848 xattr_version
= xattr_i
->xattr_version
;
3856 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap
) +
3857 sizeof(struct ceph_file_layout
) +
3858 sizeof(struct ceph_timespec
) * 3 + 4 + // ctime ~ time_warp_seq
3859 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3860 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec
) + // dirstat.nfiles ~ rstat.rctime
3861 sizeof(__u32
) + sizeof(__u32
) * 2 * dirfragtree
._splits
.size() + // dirfragtree
3862 sizeof(__u32
) + symlink
.length() + // symlink
3863 sizeof(struct ceph_dir_layout
); // dir_layout
3865 if (xattr_version
) {
3866 bytes
+= sizeof(__u32
) + sizeof(__u32
); // xattr buffer len + number entries
3868 for (const auto &p
: *pxattrs
)
3869 bytes
+= sizeof(__u32
) * 2 + p
.first
.length() + p
.second
.length();
3872 bytes
+= sizeof(__u32
); // xattr buffer len
3875 sizeof(version_t
) + sizeof(__u32
) + inline_data
.length() + // inline data
3876 1 + 1 + 8 + 8 + 4 + // quota
3877 4 + layout
.pool_ns
.size() + // pool ns
3878 sizeof(struct ceph_timespec
) + 8; // btime + change_attr
3880 if (bytes
> max_bytes
)
3881 return -CEPHFS_ENOSPC
;
3886 struct ceph_mds_reply_cap ecap
;
3887 if (snapid
!= CEPH_NOSNAP
) {
3889 * snapped inodes (files or dirs) only get read-only caps. always
3890 * issue everything possible, since it is read only.
3892 * if a snapped inode has caps, limit issued caps based on the
3895 * if it is a live inode, limit issued caps based on the lock
3898 * do NOT adjust cap issued state, because the client always
3899 * tracks caps per-snap and the mds does either per-interval or
3902 ecap
.caps
= valid
? get_caps_allowed_by_type(CAP_ANY
) : CEPH_STAT_CAP_INODE
;
3903 if (last
== CEPH_NOSNAP
|| is_any_caps())
3904 ecap
.caps
= ecap
.caps
& get_caps_allowed_for_client(session
, nullptr, file_i
);
3909 if (!no_caps
&& !cap
) {
3911 cap
= add_client_cap(client
, session
, realm
);
3913 choose_ideal_loner();
3917 if (!no_caps
&& cap
) {
3918 int likes
= get_caps_liked();
3919 int allowed
= get_caps_allowed_for_client(session
, cap
, file_i
);
3920 issue
= (cap
->wanted() | likes
) & allowed
;
3921 cap
->issue_norevoke(issue
, true);
3922 issue
= cap
->pending();
3923 dout(10) << "encode_inodestat issuing " << ccap_string(issue
)
3924 << " seq " << cap
->get_last_seq() << dendl
;
3925 } else if (cap
&& cap
->is_new() && !dir_realm
) {
3926 // alway issue new caps to client, otherwise the caps get lost
3927 ceph_assert(cap
->is_stale());
3928 ceph_assert(!cap
->pending());
3929 issue
= CEPH_CAP_PIN
;
3930 cap
->issue_norevoke(issue
, true);
3931 dout(10) << "encode_inodestat issuing " << ccap_string(issue
)
3932 << " seq " << cap
->get_last_seq()
3933 << "(stale&new caps)" << dendl
;
3937 cap
->set_last_issue();
3938 cap
->set_last_issue_stamp(ceph_clock_now());
3940 ecap
.wanted
= cap
->wanted();
3941 ecap
.cap_id
= cap
->get_cap_id();
3942 ecap
.seq
= cap
->get_last_seq();
3943 ecap
.mseq
= cap
->get_mseq();
3944 ecap
.realm
= realm
->inode
->ino();
3954 ecap
.flags
= is_auth() ? CEPH_CAP_FLAG_AUTH
: 0;
3955 dout(10) << "encode_inodestat caps " << ccap_string(ecap
.caps
)
3956 << " seq " << ecap
.seq
<< " mseq " << ecap
.mseq
3957 << " xattrv " << xattr_version
<< dendl
;
3959 if (inline_data
.length() && cap
) {
3960 if ((cap
->pending() | getattr_caps
) & CEPH_CAP_FILE_SHARED
) {
3961 dout(10) << "including inline version " << inline_version
<< dendl
;
3962 cap
->client_inline_version
= inline_version
;
3964 dout(10) << "dropping inline version " << inline_version
<< dendl
;
3966 inline_data
.clear();
3970 // include those xattrs?
3971 if (xattr_version
&& cap
) {
3972 if ((cap
->pending() | getattr_caps
) & CEPH_CAP_XATTR_SHARED
) {
3973 dout(10) << "including xattrs version " << xattr_version
<< dendl
;
3974 cap
->client_xattr_version
= xattr_version
;
3976 dout(10) << "dropping xattrs version " << xattr_version
<< dendl
;
3981 // The end result of encode_xattrs() is equivalent to:
3984 // if (xattr_version) {
3986 // encode(*pxattrs, bl);
3988 // encode((__u32)0, bl);
3993 // But encoding xattrs into the 'xbl' requires a memory allocation.
3994 // The 'bl' should have enough pre-allocated memory in most cases.
3995 // Encoding xattrs directly into it can avoid the extra allocation.
3996 auto encode_xattrs
= [xattr_version
, pxattrs
, &bl
]() {
3998 if (xattr_version
) {
4000 auto filler
= bl
.append_hole(sizeof(xbl_len
));
4001 const auto starting_bl_len
= bl
.length();
4003 encode(*pxattrs
, bl
);
4005 encode((__u32
)0, bl
);
4006 xbl_len
= bl
.length() - starting_bl_len
;
4007 filler
.copy_in(sizeof(xbl_len
), (char *)&xbl_len
);
4009 encode((__u32
)0, bl
);
4014 * note: encoding matches MClientReply::InodeStat
4016 if (session
->info
.has_feature(CEPHFS_FEATURE_REPLY_ENCODING
)) {
4017 ENCODE_START(6, 1, bl
);
4018 encode(oi
->ino
, bl
);
4020 encode(oi
->rdev
, bl
);
4021 encode(version
, bl
);
4022 encode(xattr_version
, bl
);
4025 ceph_file_layout legacy_layout
;
4026 layout
.to_legacy(&legacy_layout
);
4027 encode(legacy_layout
, bl
);
4029 encode(any_i
->ctime
, bl
);
4030 encode(file_i
->mtime
, bl
);
4031 encode(file_i
->atime
, bl
);
4032 encode(file_i
->time_warp_seq
, bl
);
4033 encode(file_i
->size
, bl
);
4034 encode(max_size
, bl
);
4035 encode(file_i
->truncate_size
, bl
);
4036 encode(file_i
->truncate_seq
, bl
);
4037 encode(auth_i
->mode
, bl
);
4038 encode((uint32_t)auth_i
->uid
, bl
);
4039 encode((uint32_t)auth_i
->gid
, bl
);
4040 encode(link_i
->nlink
, bl
);
4041 encode(file_i
->dirstat
.nfiles
, bl
);
4042 encode(file_i
->dirstat
.nsubdirs
, bl
);
4043 encode(file_i
->rstat
.rbytes
, bl
);
4044 encode(file_i
->rstat
.rfiles
, bl
);
4045 encode(file_i
->rstat
.rsubdirs
, bl
);
4046 encode(file_i
->rstat
.rctime
, bl
);
4047 dirfragtree
.encode(bl
);
4048 encode(symlink
, bl
);
4049 encode(file_i
->dir_layout
, bl
);
4051 encode(inline_version
, bl
);
4052 encode(inline_data
, bl
);
4053 const mempool_inode
*policy_i
= ppolicy
? pi
: oi
;
4054 encode(policy_i
->quota
, bl
);
4055 encode(layout
.pool_ns
, bl
);
4056 encode(any_i
->btime
, bl
);
4057 encode(any_i
->change_attr
, bl
);
4058 encode(file_i
->export_pin
, bl
);
4059 encode(snap_btime
, bl
);
4060 encode(file_i
->rstat
.rsnaps
, bl
);
4061 encode(snap_metadata
, bl
);
4062 encode(file_i
->fscrypt
, bl
);
4066 ceph_assert(session
->get_connection());
4068 encode(oi
->ino
, bl
);
4070 encode(oi
->rdev
, bl
);
4071 encode(version
, bl
);
4072 encode(xattr_version
, bl
);
4075 ceph_file_layout legacy_layout
;
4076 layout
.to_legacy(&legacy_layout
);
4077 encode(legacy_layout
, bl
);
4079 encode(any_i
->ctime
, bl
);
4080 encode(file_i
->mtime
, bl
);
4081 encode(file_i
->atime
, bl
);
4082 encode(file_i
->time_warp_seq
, bl
);
4083 encode(file_i
->size
, bl
);
4084 encode(max_size
, bl
);
4085 encode(file_i
->truncate_size
, bl
);
4086 encode(file_i
->truncate_seq
, bl
);
4087 encode(auth_i
->mode
, bl
);
4088 encode((uint32_t)auth_i
->uid
, bl
);
4089 encode((uint32_t)auth_i
->gid
, bl
);
4090 encode(link_i
->nlink
, bl
);
4091 encode(file_i
->dirstat
.nfiles
, bl
);
4092 encode(file_i
->dirstat
.nsubdirs
, bl
);
4093 encode(file_i
->rstat
.rbytes
, bl
);
4094 encode(file_i
->rstat
.rfiles
, bl
);
4095 encode(file_i
->rstat
.rsubdirs
, bl
);
4096 encode(file_i
->rstat
.rctime
, bl
);
4097 dirfragtree
.encode(bl
);
4098 encode(symlink
, bl
);
4099 auto& conn
= session
->get_connection();
4100 if (conn
->has_feature(CEPH_FEATURE_DIRLAYOUTHASH
)) {
4101 encode(file_i
->dir_layout
, bl
);
4104 if (conn
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
4105 encode(inline_version
, bl
);
4106 encode(inline_data
, bl
);
4108 if (conn
->has_feature(CEPH_FEATURE_MDS_QUOTA
)) {
4109 const mempool_inode
*policy_i
= ppolicy
? pi
: oi
;
4110 encode(policy_i
->quota
, bl
);
4112 if (conn
->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
)) {
4113 encode(layout
.pool_ns
, bl
);
4115 if (conn
->has_feature(CEPH_FEATURE_FS_BTIME
)) {
4116 encode(any_i
->btime
, bl
);
4117 encode(any_i
->change_attr
, bl
);
// Fill a cap message (MClientCaps) with this inode's metadata for one client.
// For each metadata group (file, auth, link, xattr) we send the *projected*
// inode when this client is the one mutating that group (it holds the xlock,
// or EXCL file caps), otherwise the stable head inode.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap)
{
  ceph_assert(cap);

  client_t client = cap->get_client();

  // Per-group "projected?" decisions for this client.
  bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL);
  bool pauth = authlock.is_xlocked_by_client(client);
  bool plink = linklock.is_xlocked_by_client(client);
  bool pxattr = xattrlock.is_xlocked_by_client(client);

  const mempool_inode *oi = get_inode().get();            // stable (head) inode
  const mempool_inode *pi = get_projected_inode().get();  // projected inode
  // 'i' is re-pointed per metadata group below; this first value is only
  // used for the debug print.
  const mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi;

  dout(20) << __func__ << " pfile " << pfile
	   << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
	   << " ctime " << i->ctime << dendl;

  // file group: layout, sizes, times, dirstat counters
  i = pfile ? pi:oi;
  m->set_layout(i->layout);
  m->size = i->size;
  m->truncate_seq = i->truncate_seq;
  m->truncate_size = i->truncate_size;
  m->mtime = i->mtime;
  m->atime = i->atime;
  m->ctime = i->ctime;
  m->btime = i->btime;
  m->change_attr = i->change_attr;
  m->time_warp_seq = i->time_warp_seq;
  m->nfiles = i->dirstat.nfiles;
  m->nsubdirs = i->dirstat.nsubdirs;

  // Ship inline data only if the client's cached copy is older; otherwise
  // inline_version 0 means "no update".
  if (cap->client_inline_version < i->inline_data.version) {
    m->inline_version = cap->client_inline_version = i->inline_data.version;
    if (i->inline_data.length() > 0)
      i->inline_data.get_data(m->inline_data);
  } else {
    m->inline_version = 0;
  }

  // max_size is min of projected, actual.
  uint64_t oldms = oi->get_client_range(client);
  uint64_t newms = pi->get_client_range(client);
  m->max_size = std::min(oldms, newms);

  // auth group: mode/uid/gid
  i = pauth ? pi:oi;
  m->head.mode = i->mode;
  m->head.uid = i->uid;
  m->head.gid = i->gid;

  // link group: nlink
  i = plink ? pi:oi;
  m->head.nlink = i->nlink;

  // xattr group: only sent when the client has XATTR_SHARED and its cached
  // xattr map is stale.
  i = pxattr ? pi:oi;
  const auto& ix = pxattr ? get_projected_xattrs() : get_xattrs();
  if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
      i->xattr_version > cap->client_xattr_version) {
    dout(10) << "    including xattrs v " << i->xattr_version << dendl;
    if (ix)
      encode(*ix, m->xattrbl);
    else
      encode((__u32)0, m->xattrbl);  // empty map: count of zero
    m->head.xattr_version = i->xattr_version;
    cap->client_xattr_version = i->xattr_version;
  }
}
// Encode the inode's base metadata (inode, symlink target, fragtree,
// xattrs, old inodes, damage flags, snap info).  The encode order here is
// the wire format — it must mirror _decode_base() exactly.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void CInode::_encode_base(bufferlist& bl, uint64_t features)
{
  ENCODE_START(1, 1, bl);
  encode(first, bl);
  encode(*get_inode(), bl, features);
  encode(symlink, bl);
  encode(dirfragtree, bl);
  encode_xattrs(bl);
  encode_old_inodes(bl, features);
  encode(damage_flags, bl);
  encode_snap(bl);
  ENCODE_FINISH(bl);
}
// Decode the base metadata written by _encode_base(); same field order.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void CInode::_decode_base(bufferlist::const_iterator& p)
{
  DECODE_START(1, p);
  decode(first, p);
  {
    // Decode into a fresh inode and swap it in atomically.
    auto _inode = allocate_inode();
    decode(*_inode, p);
    reset_inode(std::move(_inode));
  }
  {
    std::string tmp;
    decode(tmp, p);
    symlink = std::string_view(tmp);
  }
  decode(dirfragtree, p);
  decode_xattrs(p);
  decode_old_inodes(p);
  decode(damage_flags, p);
  decode_snap(p);
  DECODE_FINISH(p);
}
// Encode the full state of every lock on this inode (used for export).
// Order is the wire format; must mirror _decode_locks_full().
void CInode::_encode_locks_full(bufferlist& bl)
{
  using ceph::encode;
  encode(authlock, bl);
  encode(linklock, bl);
  encode(dirfragtreelock, bl);
  encode(filelock, bl);
  encode(xattrlock, bl);
  encode(snaplock, bl);
  encode(nestlock, bl);
  encode(flocklock, bl);
  encode(policylock, bl);

  encode(loner_cap, bl);
}
// Decode the full lock state written by _encode_locks_full(); same order.
void CInode::_decode_locks_full(bufferlist::const_iterator& p)
{
  using ceph::decode;
  decode(authlock, p);
  decode(linklock, p);
  decode(dirfragtreelock, p);
  decode(filelock, p);
  decode(xattrlock, p);
  decode(snaplock, p);
  decode(nestlock, p);
  decode(flocklock, p);
  decode(policylock, p);

  decode(loner_cap, p);
  set_loner_cap(loner_cap);
  want_loner_cap = loner_cap;  // for now, we'll eval() shortly.
}
// Encode just the replica-relevant lock state (plus a "needs recovery"
// flag) for shipping to a replica MDS.
void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
{
  ENCODE_START(1, 1, bl);
  authlock.encode_state_for_replica(bl);
  linklock.encode_state_for_replica(bl);
  dirfragtreelock.encode_state_for_replica(bl);
  filelock.encode_state_for_replica(bl);
  nestlock.encode_state_for_replica(bl);
  xattrlock.encode_state_for_replica(bl);
  snaplock.encode_state_for_replica(bl);
  flocklock.encode_state_for_replica(bl);
  policylock.encode_state_for_replica(bl);
  encode(need_recover, bl);
  ENCODE_FINISH(bl);
}
// Encode lock state for cache rejoin.  The scatterlocks (dirfragtree,
// file, nest) use the rejoin variant that carries per-replica info; the
// rest use the plain replica encoding.
void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
{
  authlock.encode_state_for_replica(bl);
  linklock.encode_state_for_replica(bl);
  dirfragtreelock.encode_state_for_rejoin(bl, rep);
  filelock.encode_state_for_rejoin(bl, rep);
  nestlock.encode_state_for_rejoin(bl, rep);
  xattrlock.encode_state_for_replica(bl);
  snaplock.encode_state_for_replica(bl);
  flocklock.encode_state_for_replica(bl);
  policylock.encode_state_for_replica(bl);
}
// Decode lock state on a replica, as written by
// _encode_locks_state_for_replica(); same order.
void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new)
{
  DECODE_START(1, p);
  authlock.decode_state(p, is_new);
  linklock.decode_state(p, is_new);
  dirfragtreelock.decode_state(p, is_new);
  filelock.decode_state(p, is_new);
  nestlock.decode_state(p, is_new);
  xattrlock.decode_state(p, is_new);
  snaplock.decode_state(p, is_new);
  flocklock.decode_state(p, is_new);
  policylock.decode_state(p, is_new);

  bool need_recover;
  decode(need_recover, p);
  if (need_recover && is_new) {
    // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
    // and change the object when replaying unsafe requests.
    authlock.mark_need_recover();
    linklock.mark_need_recover();
    dirfragtreelock.mark_need_recover();
    filelock.mark_need_recover();
    nestlock.mark_need_recover();
    xattrlock.mark_need_recover();
    snaplock.mark_need_recover();
    flocklock.mark_need_recover();
    policylock.mark_need_recover();
  }
  DECODE_FINISH(p);
}
// Decode rejoin lock state (see _encode_locks_state_for_rejoin) and queue
// for re-evaluation any scatterlock left unstable and not write-locked.
void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
				  list<SimpleLock*>& eval_locks, bool survivor)
{
  authlock.decode_state_rejoin(p, waiters, survivor);
  linklock.decode_state_rejoin(p, waiters, survivor);
  dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
  filelock.decode_state_rejoin(p, waiters, survivor);
  nestlock.decode_state_rejoin(p, waiters, survivor);
  xattrlock.decode_state_rejoin(p, waiters, survivor);
  snaplock.decode_state_rejoin(p, waiters, survivor);
  flocklock.decode_state_rejoin(p, waiters, survivor);
  policylock.decode_state_rejoin(p, waiters, survivor);

  if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
    eval_locks.push_back(&dirfragtreelock);
  if (!filelock.is_stable() && !filelock.is_wrlocked())
    eval_locks.push_back(&filelock);
  if (!nestlock.is_stable() && !nestlock.is_wrlocked())
    eval_locks.push_back(&nestlock);
}
// Serialize this inode for export (migration) to another MDS: base
// metadata, state bits, popularity, replica map, scatterlock info for
// bounding dirfrags, full lock state and file locks.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void CInode::encode_export(bufferlist& bl)
{
  ENCODE_START(5, 4, bl);
  _encode_base(bl, mdcache->mds->mdsmap->get_up_features());

  encode(state, bl);

  encode(pop, bl);

  encode(get_replicas(), bl);

  // include scatterlock info for any bounding CDirs
  bufferlist bounding;
  if (get_inode()->is_dir())
    for (const auto &p : dirfrags) {
      CDir *dir = p.second;
      if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
	encode(p.first, bounding);
	encode(dir->get_fnode()->fragstat, bounding);
	encode(dir->get_fnode()->accounted_fragstat, bounding);
	encode(dir->get_fnode()->rstat, bounding);
	encode(dir->get_fnode()->accounted_rstat, bounding);
	dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
      }
    }
  encode(bounding, bl);

  _encode_locks_full(bl);

  _encode_file_locks(bl);

  ENCODE_FINISH(bl);

  // Pin until the export completes (released in finish_export()).
  get(PIN_TEMPEXPORTING);
}
// Called once an export has committed: keep only the state bits that
// survive export, reset popularity/loner, and drop the temp pin taken in
// encode_export().
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void CInode::finish_export()
{
  state &= MASK_STATE_EXPORT_KEPT;

  pop.zero();

  // just in case!
  //dirlock.clear_updated();

  loner_cap = -1;

  put(PIN_TEMPEXPORTING);
}
// Deserialize an inode exported by another MDS (mirror of encode_export)
// and take authority over it.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void CInode::decode_import(bufferlist::const_iterator& p,
			   LogSegment *ls)
{
  DECODE_START(5, p);

  _decode_base(p);

  {
    unsigned s;
    decode(s, p);
    // keep only the state bits the exporter is allowed to hand us
    s &= MASK_STATE_EXPORTED;

    set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN),
		      (s & STATE_RANDEPHEMERALPIN));
    state_set(STATE_AUTH | s);
  }

  if (is_dirty()) {
    get(PIN_DIRTY);
    _mark_dirty(ls);
  }
  if (is_dirty_parent()) {
    get(PIN_DIRTYPARENT);
    mark_dirty_parent(ls);
  }

  decode(pop, p);

  decode(get_replicas(), p);
  if (is_replicated())
    get(PIN_REPLICATED);
  replica_nonce = 0;

  // decode fragstat info on bounding cdirs
  bufferlist bounding;
  decode(bounding, p);
  auto q = bounding.cbegin();
  while (!q.end()) {
    frag_t fg;
    decode(fg, q);
    CDir *dir = get_dirfrag(fg);
    ceph_assert(dir);  // we should have all bounds open

    // Only take the remote's fragstat/rstat if we are non-auth for
    // this dirfrag AND the lock is NOT in a scattered (MIX) state.
    // We know lock is stable, and MIX is the only state in which
    // the inode auth (who sent us this data) may not have the best
    // info.

    // HMM: Are there cases where dir->is_auth() is an insufficient
    // check because the dirfrag is under migration?  That implies
    // it is frozen (and in a SYNC or LOCK state).  FIXME.

    auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
    if (dir->is_auth() ||
        filelock.get_state() == LOCK_MIX) {
      dout(10) << " skipped fragstat info for " << *dir << dendl;
      // still must consume the encoded values
      frag_info_t f;
      decode(f, q);
      decode(f, q);
    } else {
      decode(_fnode->fragstat, q);
      decode(_fnode->accounted_fragstat, q);
      dout(10) << " took fragstat info for " << *dir << dendl;
    }
    if (dir->is_auth() ||
        nestlock.get_state() == LOCK_MIX) {
      dout(10) << " skipped rstat info for " << *dir << dendl;
      // still must consume the encoded values
      nest_info_t n;
      decode(n, q);
      decode(n, q);
    } else {
      decode(_fnode->rstat, q);
      decode(_fnode->accounted_rstat, q);
      dout(10) << " took rstat info for " << *dir << dendl;
    }
    dir->reset_fnode(std::move(_fnode));
  }

  _decode_locks_full(p);

  _decode_file_locks(p);

  DECODE_FINISH(p);
}
// Dump the stored inode metadata (inode, symlink, xattrs, fragtree,
// old inodes, snap info) to a Formatter for admin-socket / debugging output.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void InodeStoreBase::dump(Formatter *f) const
{
  inode->dump(f);
  f->dump_string("symlink", symlink);

  f->open_array_section("xattrs");
  if (xattrs) {
    for (const auto& [key, val] : *xattrs) {
      f->open_object_section("xattr");
      f->dump_string("key", key);
      // xattr values are raw bytes; copy into a std::string for dumping
      std::string v(val.c_str(), val.length());
      f->dump_string("val", v);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("dirfragtree");
  dirfragtree.dump(f);
  f->close_section(); // dirfragtree

  f->open_array_section("old_inodes");
  if (old_inodes) {
    for (const auto &p : *old_inodes) {
      f->open_object_section("old_inode");
      // The key is the last snapid, the first is in the mempool_old_inode
      f->dump_int("last", p.first);
      p.second.dump(f);
      f->close_section(); // old_inode
    }
  }
  f->close_section(); // old_inodes

  f->dump_unsigned("oldest_snap", oldest_snap);
  f->dump_unsigned("damage_flags", damage_flags);
}
4520 void decode_json_obj(mempool::mds_co::string
& t
, JSONObj
*obj
){
4522 t
= mempool::mds_co::string(std::string_view(obj
->get_data()));
// Rebuild stored inode metadata from its JSON dump (inverse of dump()).
// Several fields (dirfragtree, old_inodes, snap blob) cannot be decoded
// from JSON yet and are intentionally skipped.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void InodeStoreBase::decode_json(JSONObj *obj)
{
  {
    // Decode into a fresh inode and swap it in atomically.
    auto _inode = allocate_inode();
    _inode->decode_json(obj);
    reset_inode(std::move(_inode));
  }

  JSONDecoder::decode_json("symlink", symlink, obj, true);
  // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // cann't decode it now
  {
    mempool_xattr_map tmp;
    JSONDecoder::decode_json("xattrs", tmp, xattrs_cb, obj, true);
    if (tmp.empty())
      reset_xattrs(xattr_map_ptr());     // no xattrs: drop the map entirely
    else
      reset_xattrs(allocate_xattr_map(std::move(tmp)));
  }
  // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // cann't decode old_inodes now
  JSONDecoder::decode_json("oldest_snap", oldest_snap.val, obj, true);
  JSONDecoder::decode_json("damage_flags", damage_flags, obj, true);
  //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // cann't decode it now
  //snap_blob = srnode;
}
// JSON decode callback for one xattr entry: reads the "key"/"val" pair
// from obj and inserts it into map c as a bufferptr value.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj){
  string k;
  JSONDecoder::decode_json("key", k, obj, true);
  string v;
  JSONDecoder::decode_json("val", v, obj, true);
  c[k.c_str()] = buffer::copy(v.c_str(), v.size());
}
// JSON decode callback for one old_inode entry.  Only the "last" snapid
// key is decoded today; the inode body cannot be decoded from JSON yet, so
// a default-constructed old inode is stored under that snapid.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj){
  snapid_t s;
  JSONDecoder::decode_json("last", s.val, obj, true);
  InodeStoreBase::mempool_old_inode i;
  // i.decode_json(obj); // cann't decode now, simon
  c[s] = i;
}
4571 void InodeStore::generate_test_instances(std::list
<InodeStore
*> &ls
)
4573 InodeStore
*populated
= new InodeStore
;
4574 populated
->get_inode()->ino
= 0xdeadbeef;
4575 populated
->symlink
= "rhubarb";
4576 ls
.push_back(populated
);
4579 void InodeStoreBare::generate_test_instances(std::list
<InodeStoreBare
*> &ls
)
4581 InodeStoreBare
*populated
= new InodeStoreBare
;
4582 populated
->get_inode()->ino
= 0xdeadbeef;
4583 populated
->symlink
= "rhubarb";
4584 ls
.push_back(populated
);
// Validate this inode's on-disk state against its in-memory state:
// backtrace ("parent" xattr on the backing object), the on-disk inode for
// base directories, and dirfrag raw stats.  Runs asynchronously as a
// Continuation; 'results' is filled in and 'fin' completed when all stages
// finish.  If the scrub header requests repair, fixable mismatches
// (backtrace, inotable, rstats) are repaired along the way.
// NOTE(review): reconstructed across extraction gaps — verify against upstream CInode.cc.
void CInode::validate_disk_state(CInode::validated_data *results,
				 MDSContext *fin)
{
  // Stage machine: START -> BACKTRACE -> (INODE for base dirs) -> DIRFRAGS -> _done
  class ValidationContinuation : public MDSContinuation {
  public:
    MDSContext *fin;
    CInode *in;
    CInode::validated_data *results;
    bufferlist bl;       // raw backtrace ("parent" xattr) read off disk
    CInode *shadow_in;   // shadow inode used to fetch base dirs from disk

    enum {
      START = 0,
      BACKTRACE,
      INODE,
      DIRFRAGS,
      SNAPREALM,
    };

    ValidationContinuation(CInode *i,
                           CInode::validated_data *data_r,
                           MDSContext *fin_) :
                             MDSContinuation(i->mdcache->mds->server),
                             fin(fin_),
                             in(i),
                             results(data_r),
                             shadow_in(NULL) {
      set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
      set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
      set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
      set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
    }

    ~ValidationContinuation() override {
      if (shadow_in) {
	delete shadow_in;
	in->mdcache->num_shadow_inodes--;
      }
    }

    /**
     * Fetch backtrace and set tag if tag is non-empty
     */
    void fetch_backtrace_and_tag(CInode *in,
                                 std::string_view tag, bool is_internal,
                                 Context *fin, int *bt_r, bufferlist *bt)
    {
      const int64_t pool = in->get_backtrace_pool();
      object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");

      ObjectOperation fetch;
      fetch.getxattr("parent", bt, bt_r);
      in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
				       NULL, 0, fin);
      if (in->mdcache->mds->logger) {
        in->mdcache->mds->logger->inc(l_mds_openino_backtrace_fetch);
        in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_fetch);
      }

      using ceph::encode;
      if (!is_internal) {
        // External scrubs also stamp the object with the scrub tag so a
        // later pass can tell which objects this scrub has visited.
        ObjectOperation scrub_tag;
        bufferlist tag_bl;
        encode(tag, tag_bl);
        scrub_tag.setxattr("scrub_tag", tag_bl);
        SnapContext snapc;
        in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
					   ceph::real_clock::now(),
					   0, NULL);
        if (in->mdcache->mds->logger)
          in->mdcache->mds->logger->inc(l_mds_scrub_set_tag);
      }
    }

    bool _start(int rval) {
      ceph_assert(in->can_auth_pin());
      in->auth_pin(this);

      if (in->is_dirty()) {
	MDCache *mdcache = in->mdcache;  // For the benefit of dout
	auto ino = [this]() { return in->ino(); }; // For the benefit of dout
	dout(20) << "validating a dirty CInode; results will be inconclusive"
		 << dendl;
      }

      C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
					    in->mdcache->mds->finisher);

      std::string_view tag = in->scrub_infop->header->get_tag();
      bool is_internal = in->scrub_infop->header->is_internal_tag();
      // Rather than using the usual CInode::fetch_backtrace,
      // use a special variant that optionally writes a tag in the same
      // operation.
      fetch_backtrace_and_tag(in, tag, is_internal, conf,
			      &results->backtrace.ondisk_read_retval, &bl);
      return false;
    }

    bool _backtrace(int rval) {
      // set up basic result reporting and make sure we got the data
      results->performed_validation = true; // at least, some of it!
      results->backtrace.checked = true;

      const int64_t pool = in->get_backtrace_pool();
      inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
      in->build_backtrace(pool, memory_backtrace);
      bool equivalent, divergent;
      int memory_newer;

      MDCache *mdcache = in->mdcache;  // For the benefit of dout
      auto ino = [this]() { return in->ino(); }; // For the benefit of dout

      // Ignore rval because it's the result of a FAILOK operation
      // from fetch_backtrace_and_tag: the real result is in
      // backtrace.ondisk_read_retval
      dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
      if (results->backtrace.ondisk_read_retval != 0) {
        results->backtrace.error_str << "failed to read off disk; see retval";
	// we probably have a new unwritten file!
	// so skip the backtrace scrub for this entry and say that all's well
	if (in->is_dirty_parent()) {
	  dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl;
	  results->backtrace.passed = true;
	}
        goto next;
      }

      // extract the backtrace, and compare it to a newly-constructed one
      try {
        auto p = bl.cbegin();
	using ceph::decode;
        decode(results->backtrace.ondisk_value, p);
	dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
      } catch (buffer::error&) {
	if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
	  // Cases where something has clearly gone wrong with the overall
	  // fetch op, though we didn't get a nonzero rc from the getxattr
	  // operation. e.g. object missing.
	  results->backtrace.ondisk_read_retval = rval;
	}
        results->backtrace.error_str << "failed to decode on-disk backtrace ("
                                     << bl.length() << " bytes)!";
	// we probably have a new unwritten file!
	// so skip the backtrace scrub for this entry and say that all's well
	if (in->is_dirty_parent()) {
	  dout(20) << "decode failed; forcing backtrace as passed since "
		      "inode is dirty parent" << dendl;
	  results->backtrace.passed = true;
	}
        goto next;
      }

      memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
					      &equivalent, &divergent);

      if (divergent || memory_newer < 0) {
        // we're divergent, or on-disk version is newer
        results->backtrace.error_str << "On-disk backtrace is divergent or newer";
        /* if the backtraces are divergent and the link count is 0, then
         * most likely its a stray entry that's being purged and things are
         * well and there's no reason for alarm
         */
        if (divergent && (in->is_dirty_parent() || in->get_inode()->nlink == 0)) {
          results->backtrace.passed = true;
          dout(20) << "divergent backtraces are acceptable when dn "
                      "is being purged or has been renamed or moved to a "
                      "different directory " << *in << dendl;
        }
      } else {
        results->backtrace.passed = true;
      }
next:

      if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
        std::string path;
        in->make_path_string(path);
        in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
                                       << "(" << path << "), rewriting it";
        in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
			      false);
        // Flag that we repaired this BT so that it won't go into damagetable
        results->backtrace.repaired = true;
	if (in->mdcache->mds->logger)
	  in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired);
      }

      // If the inode's number was free in the InoTable, fix that
      {
        InoTable *inotable = mdcache->mds->inotable;

        dout(10) << "scrub: inotable ino = " << in->ino() << dendl;
        dout(10) << "scrub: inotable free says "
		 << inotable->is_marked_free(in->ino()) << dendl;

        if (inotable->is_marked_free(in->ino())) {
          LogChannelRef clog = in->mdcache->mds->clog;
          clog->error() << "scrub: inode wrongly marked free: " << in->ino();

          if (in->scrub_infop->header->get_repair()) {
            bool repaired = inotable->repair(in->ino());
            if (repaired) {
              clog->error() << "inode table repaired for inode: " << in->ino();
              inotable->save();
	      if (in->mdcache->mds->logger)
		in->mdcache->mds->logger->inc(l_mds_scrub_inotable_repaired);
            } else {
              clog->error() << "Cannot repair inotable while other operations"
		" are in progress";
            }
          }
        }
      }

      if (in->is_dir()) {
	if (in->mdcache->mds->logger)
	  in->mdcache->mds->logger->inc(l_mds_scrub_dir_inodes);
	return validate_directory_data();
      } else {
	if (in->mdcache->mds->logger)
	  in->mdcache->mds->logger->inc(l_mds_scrub_file_inodes);
	// TODO: validate on-disk inode for normal files
	return true;
      }
    }

    bool validate_directory_data() {
      ceph_assert(in->is_dir());

      if (in->is_base()) {
	// Base dirs (root, mdsdir) are fetched via a throwaway shadow
	// inode so the cached copy is not disturbed.
	if (!shadow_in) {
	  shadow_in = new CInode(in->mdcache);
	  in->mdcache->create_unlinked_system_inode(shadow_in, in->ino(), in->get_inode()->mode);
	  in->mdcache->num_shadow_inodes++;
	}
        shadow_in->fetch(get_internal_callback(INODE));
	if (in->mdcache->mds->logger)
	  in->mdcache->mds->logger->inc(l_mds_scrub_dir_base_inodes);
        return false;
      } else {
	// TODO: validate on-disk inode for non-base directories
	if (in->mdcache->mds->logger)
	  in->mdcache->mds->logger->inc(l_mds_scrub_dirfrag_rstats);
	results->inode.passed = true;
	return check_dirfrag_rstats();
      }
    }

    bool _inode_disk(int rval) {
      const auto& si = shadow_in->get_inode();
      const auto& i = in->get_inode();

      results->inode.checked = true;
      results->inode.ondisk_read_retval = rval;
      results->inode.ondisk_value = *si;
      results->inode.memory_value = *i;

      if (si->version > i->version) {
	// uh, what?
	results->inode.error_str << "On-disk inode is newer than in-memory one; ";
      } else {
	bool divergent = false;
	int r = i->compare(*si, &divergent);
	results->inode.passed = !divergent && r >= 0;
	if (!results->inode.passed) {
	  results->inode.error_str <<
	    "On-disk inode is divergent or newer than in-memory one; ";
	}
      }
      return check_dirfrag_rstats();
    }

    bool check_dirfrag_rstats() {
      if (in->has_subtree_root_dirfrag()) {
	// rstats span subtrees; take rdlocks on the dirfrag stats first.
	in->mdcache->rdlock_dirfrags_stats(in, get_internal_callback(DIRFRAGS));
	return false;
      } else {
	return immediate(DIRFRAGS, 0);
      }
    }

    bool _dirfrags(int rval) {
      // basic reporting setup
      results->raw_stats.checked = true;
      results->raw_stats.ondisk_read_retval = rval;

      results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat;
      results->raw_stats.memory_value.rstat = in->get_inode()->rstat;
      frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
      nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;

      if (rval != 0) {
	results->raw_stats.error_str << "Failed to read dirfrags off disk";
	goto next;
      }

      // check each dirfrag...
      for (const auto &p : in->dirfrags) {
	CDir *dir = p.second;
	ceph_assert(dir->get_version() > 0);
	nest_info.add(dir->get_fnode()->accounted_rstat);
	dir_info.add(dir->get_fnode()->accounted_fragstat);
      }
      nest_info.rsubdirs++; // it gets one to account for self
      if (const sr_t *srnode = in->get_projected_srnode(); srnode)
	nest_info.rsnaps += srnode->snaps.size();

      // ...and that their sum matches our inode settings
      if (!dir_info.same_sums(in->get_inode()->dirstat) ||
	  !nest_info.same_sums(in->get_inode()->rstat)) {
	if (in->scrub_infop->header->get_repair()) {
	  results->raw_stats.error_str
	    << "freshly-calculated rstats don't match existing ones (will be fixed)";
	  in->mdcache->repair_inode_stats(in);
	  results->raw_stats.repaired = true;
	} else {
	  results->raw_stats.error_str
	    << "freshly-calculated rstats don't match existing ones";
	}
	if (in->is_dirty()) {
	  MDCache *mdcache = in->mdcache; // for dout()
	  auto ino = [this]() { return in->ino(); }; // for dout()
	  dout(20) << "raw stats most likely wont match since inode is dirty; "
		      "please rerun scrub when system is stable; "
		      "assuming passed for now;" << dendl;
	  results->raw_stats.passed = true;
	}
	goto next;
      }

      results->raw_stats.passed = true;
      {
	MDCache *mdcache = in->mdcache; // for dout()
	auto ino = [this]() { return in->ino(); }; // for dout()
	dout(20) << "raw stats check passed on " << *in << dendl;
      }
next:
      return true;
    }

    void _done() override {
      // Overall pass requires every *checked* section to have passed.
      if ((!results->raw_stats.checked || results->raw_stats.passed) &&
	  (!results->backtrace.checked || results->backtrace.passed) &&
	  (!results->inode.checked || results->inode.passed))
	results->passed_validation = true;

      // Flag that we did some repair work so that our repair operation
      // can be flushed at end of scrub
      if (results->backtrace.repaired ||
	  results->inode.repaired ||
	  results->raw_stats.repaired)
	in->scrub_infop->header->set_repaired();
      if (fin)
	fin->complete(get_rval());

      in->auth_unpin(this);
    }
  };

  dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
  ValidationContinuation *vc = new ValidationContinuation(this,
							  results,
							  fin);
  vc->begin();
}
4960 void CInode::validated_data::dump(Formatter
*f
) const
4962 f
->open_object_section("results");
4964 f
->dump_bool("performed_validation", performed_validation
);
4965 f
->dump_bool("passed_validation", passed_validation
);
4966 f
->open_object_section("backtrace");
4968 f
->dump_bool("checked", backtrace
.checked
);
4969 f
->dump_bool("passed", backtrace
.passed
);
4970 f
->dump_int("read_ret_val", backtrace
.ondisk_read_retval
);
4971 f
->dump_stream("ondisk_value") << backtrace
.ondisk_value
;
4972 f
->dump_stream("memoryvalue") << backtrace
.memory_value
;
4973 f
->dump_string("error_str", backtrace
.error_str
.str());
4975 f
->close_section(); // backtrace
4976 f
->open_object_section("raw_stats");
4978 f
->dump_bool("checked", raw_stats
.checked
);
4979 f
->dump_bool("passed", raw_stats
.passed
);
4980 f
->dump_int("read_ret_val", raw_stats
.ondisk_read_retval
);
4981 f
->dump_stream("ondisk_value.dirstat") << raw_stats
.ondisk_value
.dirstat
;
4982 f
->dump_stream("ondisk_value.rstat") << raw_stats
.ondisk_value
.rstat
;
4983 f
->dump_stream("memory_value.dirstat") << raw_stats
.memory_value
.dirstat
;
4984 f
->dump_stream("memory_value.rstat") << raw_stats
.memory_value
.rstat
;
4985 f
->dump_string("error_str", raw_stats
.error_str
.str());
4987 f
->close_section(); // raw_stats
4988 // dump failure return code
4990 if (backtrace
.checked
&& backtrace
.ondisk_read_retval
)
4991 rc
= backtrace
.ondisk_read_retval
;
4992 if (inode
.checked
&& inode
.ondisk_read_retval
)
4993 rc
= inode
.ondisk_read_retval
;
4994 if (raw_stats
.checked
&& raw_stats
.ondisk_read_retval
)
4995 rc
= raw_stats
.ondisk_read_retval
;
4996 f
->dump_int("return_code", rc
);
4998 f
->close_section(); // results
5001 bool CInode::validated_data::all_damage_repaired() const
5004 (raw_stats
.checked
&& !raw_stats
.passed
&& !raw_stats
.repaired
)
5006 (backtrace
.checked
&& !backtrace
.passed
&& !backtrace
.repaired
)
5008 (inode
.checked
&& !inode
.passed
&& !inode
.repaired
);
5013 void CInode::dump(Formatter
*f
, int flags
) const
5015 if (flags
& DUMP_PATH
) {
5017 make_path_string(path
, true);
5020 f
->dump_string("path", path
);
5023 if (flags
& DUMP_INODE_STORE_BASE
)
5024 InodeStoreBase::dump(f
);
5026 if (flags
& DUMP_MDS_CACHE_OBJECT
)
5027 MDSCacheObject::dump(f
);
5029 if (flags
& DUMP_LOCKS
) {
5030 f
->open_object_section("versionlock");
5031 versionlock
.dump(f
);
5034 f
->open_object_section("authlock");
5038 f
->open_object_section("linklock");
5042 f
->open_object_section("dirfragtreelock");
5043 dirfragtreelock
.dump(f
);
5046 f
->open_object_section("filelock");
5050 f
->open_object_section("xattrlock");
5054 f
->open_object_section("snaplock");
5058 f
->open_object_section("nestlock");
5062 f
->open_object_section("flocklock");
5066 f
->open_object_section("policylock");
5071 if (flags
& DUMP_STATE
) {
5072 f
->open_array_section("states");
5073 MDSCacheObject::dump_states(f
);
5074 if (state_test(STATE_EXPORTING
))
5075 f
->dump_string("state", "exporting");
5076 if (state_test(STATE_OPENINGDIR
))
5077 f
->dump_string("state", "openingdir");
5078 if (state_test(STATE_FREEZING
))
5079 f
->dump_string("state", "freezing");
5080 if (state_test(STATE_FROZEN
))
5081 f
->dump_string("state", "frozen");
5082 if (state_test(STATE_AMBIGUOUSAUTH
))
5083 f
->dump_string("state", "ambiguousauth");
5084 if (state_test(STATE_EXPORTINGCAPS
))
5085 f
->dump_string("state", "exportingcaps");
5086 if (state_test(STATE_NEEDSRECOVER
))
5087 f
->dump_string("state", "needsrecover");
5088 if (state_test(STATE_PURGING
))
5089 f
->dump_string("state", "purging");
5090 if (state_test(STATE_DIRTYPARENT
))
5091 f
->dump_string("state", "dirtyparent");
5092 if (state_test(STATE_DIRTYRSTAT
))
5093 f
->dump_string("state", "dirtyrstat");
5094 if (state_test(STATE_STRAYPINNED
))
5095 f
->dump_string("state", "straypinned");
5096 if (state_test(STATE_FROZENAUTHPIN
))
5097 f
->dump_string("state", "frozenauthpin");
5098 if (state_test(STATE_DIRTYPOOL
))
5099 f
->dump_string("state", "dirtypool");
5100 if (state_test(STATE_ORPHAN
))
5101 f
->dump_string("state", "orphan");
5102 if (state_test(STATE_MISSINGOBJS
))
5103 f
->dump_string("state", "missingobjs");
5107 if (flags
& DUMP_CAPS
) {
5108 f
->open_array_section("client_caps");
5109 for (const auto &p
: client_caps
) {
5110 auto &client
= p
.first
;
5111 auto cap
= &p
.second
;
5112 f
->open_object_section("client_cap");
5113 f
->dump_int("client_id", client
.v
);
5114 f
->dump_string("pending", ccap_string(cap
->pending()));
5115 f
->dump_string("issued", ccap_string(cap
->issued()));
5116 f
->dump_string("wanted", ccap_string(cap
->wanted()));
5117 f
->dump_int("last_sent", cap
->get_last_seq());
5122 f
->dump_int("loner", loner_cap
.v
);
5123 f
->dump_int("want_loner", want_loner_cap
.v
);
5125 f
->open_array_section("mds_caps_wanted");
5126 for (const auto &p
: mds_caps_wanted
) {
5127 f
->open_object_section("mds_cap_wanted");
5128 f
->dump_int("rank", p
.first
);
5129 f
->dump_string("cap", ccap_string(p
.second
));
5135 if (flags
& DUMP_DIRFRAGS
) {
5136 f
->open_array_section("dirfrags");
5137 auto&& dfs
= get_dirfrags();
5138 for(const auto &dir
: dfs
) {
5139 f
->open_object_section("dir");
5140 dir
->dump(f
, CDir::DUMP_DEFAULT
| CDir::DUMP_ITEMS
);
5141 dir
->check_rstats();
5148 /****** Scrub Stuff *****/
5149 void CInode::scrub_info_create() const
5151 dout(25) << __func__
<< dendl
;
5152 ceph_assert(!scrub_infop
);
5154 // break out of const-land to set up implicit initial state
5155 CInode
*me
= const_cast<CInode
*>(this);
5156 const auto& pi
= me
->get_projected_inode();
5158 std::unique_ptr
<scrub_info_t
> si(new scrub_info_t());
5159 si
->last_scrub_stamp
= pi
->last_scrub_stamp
;
5160 si
->last_scrub_version
= pi
->last_scrub_version
;
5162 me
->scrub_infop
.swap(si
);
5165 void CInode::scrub_maybe_delete_info()
5168 !scrub_infop
->scrub_in_progress
&&
5169 !scrub_infop
->last_scrub_dirty
) {
5170 scrub_infop
.reset();
5174 void CInode::scrub_initialize(ScrubHeaderRef
& header
)
5176 dout(20) << __func__
<< " with scrub_version " << get_version() << dendl
;
5179 scrub_infop
->scrub_in_progress
= true;
5180 scrub_infop
->queued_frags
.clear();
5181 scrub_infop
->header
= header
;
5182 header
->inc_num_pending();
5183 // right now we don't handle remote inodes
5186 void CInode::scrub_aborted() {
5187 dout(20) << __func__
<< dendl
;
5188 ceph_assert(scrub_is_in_progress());
5190 scrub_infop
->scrub_in_progress
= false;
5191 scrub_infop
->header
->dec_num_pending();
5192 scrub_maybe_delete_info();
5195 void CInode::scrub_finished() {
5196 dout(20) << __func__
<< dendl
;
5197 ceph_assert(scrub_is_in_progress());
5199 scrub_infop
->last_scrub_version
= get_version();
5200 scrub_infop
->last_scrub_stamp
= ceph_clock_now();
5201 scrub_infop
->last_scrub_dirty
= true;
5202 scrub_infop
->scrub_in_progress
= false;
5203 scrub_infop
->header
->dec_num_pending();
5206 int64_t CInode::get_backtrace_pool() const
5209 return mdcache
->mds
->get_metadata_pool();
5211 // Files are required to have an explicit layout that specifies
5213 ceph_assert(get_inode()->layout
.pool_id
!= -1);
5214 return get_inode()->layout
.pool_id
;
5218 void CInode::queue_export_pin(mds_rank_t export_pin
)
5220 if (state_test(CInode::STATE_QUEUEDEXPORTPIN
))
5224 if (export_pin
>= 0)
5225 target
= export_pin
;
5226 else if (export_pin
== MDS_RANK_EPHEMERAL_RAND
)
5227 target
= mdcache
->hash_into_rank_bucket(ino());
5229 target
= MDS_RANK_NONE
;
5231 unsigned min_frag_bits
= mdcache
->get_ephemeral_dist_frag_bits();
5233 for (auto& p
: dirfrags
) {
5234 CDir
*dir
= p
.second
;
5235 if (!dir
->is_auth())
5238 if (export_pin
== MDS_RANK_EPHEMERAL_DIST
) {
5239 if (dir
->get_frag().bits() < min_frag_bits
) {
5244 target
= mdcache
->hash_into_rank_bucket(ino(), dir
->get_frag());
5247 if (target
!= MDS_RANK_NONE
) {
5248 if (dir
->is_subtree_root()) {
5249 // set auxsubtree bit or export it
5250 if (!dir
->state_test(CDir::STATE_AUXSUBTREE
) ||
5251 target
!= dir
->get_dir_auth().first
)
5254 // create aux subtree or export it
5258 // clear aux subtrees ?
5259 queue
= dir
->state_test(CDir::STATE_AUXSUBTREE
);
5266 state_set(CInode::STATE_QUEUEDEXPORTPIN
);
5267 mdcache
->export_pin_queue
.insert(this);
5271 void CInode::maybe_export_pin(bool update
)
5273 if (!g_conf()->mds_bal_export_pin
)
5275 if (!is_dir() || !is_normal())
5278 dout(15) << __func__
<< " update=" << update
<< " " << *this << dendl
;
5280 mds_rank_t export_pin
= get_export_pin(false);
5281 if (export_pin
== MDS_RANK_NONE
&& !update
)
5284 check_pin_policy(export_pin
);
5285 queue_export_pin(export_pin
);
5288 void CInode::set_ephemeral_pin(bool dist
, bool rand
)
5292 state
|= STATE_DISTEPHEMERALPIN
;
5294 state
|= STATE_RANDEPHEMERALPIN
;
5298 if (state_test(state
) != state
) {
5299 dout(10) << "set ephemeral (" << (dist
? "dist" : "")
5300 << (rand
? " rand" : "") << ") pin on " << *this << dendl
;
5301 if (!is_ephemerally_pinned()) {
5302 auto p
= mdcache
->export_ephemeral_pins
.insert(this);
5303 ceph_assert(p
.second
);
5309 void CInode::clear_ephemeral_pin(bool dist
, bool rand
)
5313 state
|= STATE_DISTEPHEMERALPIN
;
5315 state
|= STATE_RANDEPHEMERALPIN
;
5317 if (state_test(state
)) {
5318 dout(10) << "clear ephemeral (" << (dist
? "dist" : "")
5319 << (rand
? " rand" : "") << ") pin on " << *this << dendl
;
5321 if (!is_ephemerally_pinned()) {
5322 auto count
= mdcache
->export_ephemeral_pins
.erase(this);
5323 ceph_assert(count
== 1);
5328 void CInode::maybe_ephemeral_rand(double threshold
)
5330 if (!mdcache
->get_export_ephemeral_random_config()) {
5331 dout(15) << __func__
<< " config false: cannot ephemeral random pin " << *this << dendl
;
5332 clear_ephemeral_pin(false, true);
5334 } else if (!is_dir() || !is_normal()) {
5335 dout(15) << __func__
<< " !dir or !normal: cannot ephemeral random pin " << *this << dendl
;
5336 clear_ephemeral_pin(false, true);
5338 } else if (get_inode()->nlink
== 0) {
5339 dout(15) << __func__
<< " unlinked directory: cannot ephemeral random pin " << *this << dendl
;
5340 clear_ephemeral_pin(false, true);
5342 } else if (state_test(CInode::STATE_RANDEPHEMERALPIN
)) {
5343 dout(10) << __func__
<< " already ephemeral random pinned: requeueing " << *this << dendl
;
5344 queue_export_pin(MDS_RANK_EPHEMERAL_RAND
);
5348 /* not precomputed? */
5349 if (threshold
< 0.0) {
5350 threshold
= get_ephemeral_rand();
5352 if (threshold
<= 0.0) {
5355 double n
= ceph::util::generate_random_number(0.0, 1.0);
5357 dout(15) << __func__
<< " rand " << n
<< " <?= " << threshold
5358 << " " << *this << dendl
;
5360 if (n
<= threshold
) {
5361 dout(10) << __func__
<< " randomly export pinning " << *this << dendl
;
5362 set_ephemeral_pin(false, true);
5363 queue_export_pin(MDS_RANK_EPHEMERAL_RAND
);
5367 void CInode::setxattr_ephemeral_rand(double probability
)
5369 ceph_assert(is_dir());
5370 _get_projected_inode()->export_ephemeral_random_pin
= probability
;
5373 void CInode::setxattr_ephemeral_dist(bool val
)
5375 ceph_assert(is_dir());
5376 _get_projected_inode()->export_ephemeral_distributed_pin
= val
;
5379 void CInode::set_export_pin(mds_rank_t rank
)
5381 ceph_assert(is_dir());
5382 _get_projected_inode()->export_pin
= rank
;
5383 maybe_export_pin(true);
5386 mds_rank_t
CInode::get_export_pin(bool inherit
) const
5388 if (!g_conf()->mds_bal_export_pin
)
5389 return MDS_RANK_NONE
;
5391 /* An inode that is export pinned may not necessarily be a subtree root, we
5392 * need to traverse the parents. A base or system inode cannot be pinned.
5393 * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5394 * have a parent yet.
5396 mds_rank_t r_target
= MDS_RANK_NONE
;
5397 const CInode
*in
= this;
5398 const CDir
*dir
= nullptr;
5400 if (in
->is_system())
5402 const CDentry
*pdn
= in
->get_parent_dn();
5405 if (in
->get_inode()->nlink
== 0) {
5406 // ignore export pin for unlinked directory
5410 if (in
->get_inode()->export_pin
>= 0) {
5411 return in
->get_inode()->export_pin
;
5412 } else if (in
->get_inode()->export_ephemeral_distributed_pin
&&
5413 mdcache
->get_export_ephemeral_distributed_config()) {
5415 return mdcache
->hash_into_rank_bucket(in
->ino(), dir
->get_frag());
5416 return MDS_RANK_EPHEMERAL_DIST
;
5417 } else if (r_target
!= MDS_RANK_NONE
&& in
->get_inode()->export_ephemeral_random_pin
> 0.0) {
5419 } else if (r_target
== MDS_RANK_NONE
&& in
->is_ephemeral_rand() &&
5420 mdcache
->get_export_ephemeral_random_config()) {
5421 /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
5423 return MDS_RANK_EPHEMERAL_RAND
;
5425 r_target
= MDS_RANK_EPHEMERAL_RAND
;
5427 r_target
= mdcache
->hash_into_rank_bucket(in
->ino());
5432 dir
= pdn
->get_dir();
5435 return MDS_RANK_NONE
;
5438 void CInode::check_pin_policy(mds_rank_t export_pin
)
5440 if (export_pin
== MDS_RANK_EPHEMERAL_DIST
) {
5441 set_ephemeral_pin(true, false);
5442 clear_ephemeral_pin(false, true);
5443 } else if (export_pin
== MDS_RANK_EPHEMERAL_RAND
) {
5444 set_ephemeral_pin(false, true);
5445 clear_ephemeral_pin(true, false);
5446 } else if (is_ephemerally_pinned()) {
5447 // export_pin >= 0 || export_pin == MDS_RANK_NONE
5448 clear_ephemeral_pin(true, true);
5449 if (export_pin
!= get_inode()->export_pin
) // inherited export_pin
5450 queue_export_pin(MDS_RANK_NONE
);
5454 double CInode::get_ephemeral_rand() const
5456 /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5457 * have a parent yet.
5459 const CInode
*in
= this;
5460 double max
= mdcache
->export_ephemeral_random_max
;
5462 if (in
->is_system())
5464 const CDentry
*pdn
= in
->get_parent_dn();
5467 // ignore export pin for unlinked directory
5468 if (in
->get_inode()->nlink
== 0)
5471 if (in
->get_inode()->export_ephemeral_random_pin
> 0.0)
5472 return std::min(in
->get_inode()->export_ephemeral_random_pin
, max
);
5474 /* An export_pin overrides only if no closer parent (incl. this one) has a
5477 if (in
->get_inode()->export_pin
>= 0 ||
5478 in
->get_inode()->export_ephemeral_distributed_pin
)
5481 in
= pdn
->get_dir()->inode
;
5486 void CInode::get_nested_dirfrags(std::vector
<CDir
*>& v
) const
5488 for (const auto &p
: dirfrags
) {
5489 const auto& dir
= p
.second
;
5490 if (!dir
->is_subtree_root())
5495 void CInode::get_subtree_dirfrags(std::vector
<CDir
*>& v
) const
5497 for (const auto &p
: dirfrags
) {
5498 const auto& dir
= p
.second
;
5499 if (dir
->is_subtree_root())
5504 MEMPOOL_DEFINE_OBJECT_FACTORY(CInode
, co_inode
, mds_co
);