1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
15 #include "include/int_types.h"
16 #include "common/errno.h"
31 #include "events/EUpdate.h"
33 #include "osdc/Objecter.h"
37 #include "LogSegment.h"
39 #include "common/Clock.h"
41 #include "common/config.h"
42 #include "global/global_context.h"
43 #include "include/ceph_assert.h"
45 #include "mds/MDSContinuation.h"
46 #include "mds/InoTable.h"
47 #include "cephfs_features.h"
49 #define dout_context g_ceph_context
50 #define dout_subsys ceph_subsys_mds
52 #define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "
55 class CInodeIOContext
: public MDSIOContextBase
59 MDSRank
*get_mds() override
{return in
->mdcache
->mds
;}
61 explicit CInodeIOContext(CInode
*in_
) : in(in_
) {
62 ceph_assert(in
!= NULL
);
// Sentinel for projected_inode::snapnode: distinguishes "no srnode was
// projected" from a projected null srnode.  (sr_t*)-1 can never be a real
// heap pointer, so it cannot collide with a valid srnode address.
sr_t * const CInode::projected_inode::UNDEF_SRNODE = (sr_t *)(unsigned long)-1;
// Static per-lock-type descriptors.  Every CInode's SimpleLock members are
// constructed with a pointer to one of these (see the CInode constructor),
// identifying which CEPH_LOCK_* the lock represents.
LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
LockType CInode::linklock_type(CEPH_LOCK_ILINK);
LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
LockType CInode::filelock_type(CEPH_LOCK_IFILE);
LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
LockType CInode::nestlock_type(CEPH_LOCK_INEST);
LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
79 std::string_view
CInode::pin_name(int p
) const
82 case PIN_DIRFRAG
: return "dirfrag";
83 case PIN_CAPS
: return "caps";
84 case PIN_IMPORTING
: return "importing";
85 case PIN_OPENINGDIR
: return "openingdir";
86 case PIN_REMOTEPARENT
: return "remoteparent";
87 case PIN_BATCHOPENJOURNAL
: return "batchopenjournal";
88 case PIN_SCATTERED
: return "scattered";
89 case PIN_STICKYDIRS
: return "stickydirs";
90 //case PIN_PURGING: return "purging";
91 case PIN_FREEZING
: return "freezing";
92 case PIN_FROZEN
: return "frozen";
93 case PIN_IMPORTINGCAPS
: return "importingcaps";
94 case PIN_EXPORTINGCAPS
: return "exportingcaps";
95 case PIN_PASTSNAPPARENT
: return "pastsnapparent";
96 case PIN_OPENINGSNAPPARENTS
: return "openingsnapparents";
97 case PIN_TRUNCATING
: return "truncating";
98 case PIN_STRAY
: return "stray";
99 case PIN_NEEDSNAPFLUSH
: return "needsnapflush";
100 case PIN_DIRTYRSTAT
: return "dirtyrstat";
101 case PIN_DIRTYPARENT
: return "dirtyparent";
102 case PIN_DIRWAITER
: return "dirwaiter";
103 case PIN_SCRUBQUEUE
: return "scrubqueue";
104 default: return generic_pin_name(p
);
108 //int cinode_pins[CINODE_NUM_PINS]; // counts
109 ostream
& CInode::print_db_line_prefix(ostream
& out
)
111 return out
<< ceph_clock_now() << " mds." << mdcache
->mds
->get_nodeid() << ".cache.ino(" << inode
.ino
<< ") ";
115 * write caps and lock ids
117 struct cinode_lock_info_t cinode_lock_info
[] = {
118 { CEPH_LOCK_IFILE
, CEPH_CAP_ANY_FILE_WR
},
119 { CEPH_LOCK_IAUTH
, CEPH_CAP_AUTH_EXCL
},
120 { CEPH_LOCK_ILINK
, CEPH_CAP_LINK_EXCL
},
121 { CEPH_LOCK_IXATTR
, CEPH_CAP_XATTR_EXCL
},
123 int num_cinode_locks
= sizeof(cinode_lock_info
) / sizeof(cinode_lock_info
[0]);
125 ostream
& operator<<(ostream
& out
, const CInode
& in
)
128 in
.make_path_string(path
, true);
130 out
<< "[inode " << in
.inode
.ino
;
132 << (in
.is_multiversion() ? "...":"")
133 << in
.first
<< "," << in
.last
<< "]";
134 out
<< " " << path
<< (in
.is_dir() ? "/":"");
138 if (in
.is_replicated())
139 out
<< in
.get_replicas();
141 mds_authority_t a
= in
.authority();
142 out
<< " rep@" << a
.first
;
143 if (a
.second
!= CDIR_AUTH_UNKNOWN
)
144 out
<< "," << a
.second
;
145 out
<< "." << in
.get_replica_nonce();
149 out
<< " symlink='" << in
.symlink
<< "'";
150 if (in
.is_dir() && !in
.dirfragtree
.empty())
151 out
<< " " << in
.dirfragtree
;
153 out
<< " v" << in
.get_version();
154 if (in
.get_projected_version() > in
.get_version())
155 out
<< " pv" << in
.get_projected_version();
157 if (in
.get_num_auth_pins()) {
158 out
<< " ap=" << in
.get_num_auth_pins();
159 #ifdef MDS_AUTHPIN_SET
160 in
.print_authpin_set(out
);
165 out
<< " snaprealm=" << in
.snaprealm
;
167 if (in
.state_test(CInode::STATE_AMBIGUOUSAUTH
)) out
<< " AMBIGAUTH";
168 if (in
.state_test(CInode::STATE_NEEDSRECOVER
)) out
<< " needsrecover";
169 if (in
.state_test(CInode::STATE_RECOVERING
)) out
<< " recovering";
170 if (in
.state_test(CInode::STATE_DIRTYPARENT
)) out
<< " dirtyparent";
171 if (in
.state_test(CInode::STATE_MISSINGOBJS
)) out
<< " missingobjs";
172 if (in
.is_freezing_inode()) out
<< " FREEZING=" << in
.auth_pin_freeze_allowance
;
173 if (in
.is_frozen_inode()) out
<< " FROZEN";
174 if (in
.is_frozen_auth_pin()) out
<< " FROZEN_AUTHPIN";
176 const CInode::mempool_inode
*pi
= in
.get_projected_inode();
177 if (pi
->is_truncating())
178 out
<< " truncating(" << pi
->truncate_from
<< " to " << pi
->truncate_size
<< ")";
180 if (in
.inode
.is_dir()) {
181 out
<< " " << in
.inode
.dirstat
;
182 if (g_conf()->mds_debug_scatterstat
&& in
.is_projected()) {
183 const CInode::mempool_inode
*pi
= in
.get_projected_inode();
184 out
<< "->" << pi
->dirstat
;
187 out
<< " s=" << in
.inode
.size
;
188 if (in
.inode
.nlink
!= 1)
189 out
<< " nl=" << in
.inode
.nlink
;
193 out
<< " " << in
.inode
.rstat
;
194 if (!(in
.inode
.rstat
== in
.inode
.accounted_rstat
))
195 out
<< "/" << in
.inode
.accounted_rstat
;
196 if (g_conf()->mds_debug_scatterstat
&& in
.is_projected()) {
197 const CInode::mempool_inode
*pi
= in
.get_projected_inode();
198 out
<< "->" << pi
->rstat
;
199 if (!(pi
->rstat
== pi
->accounted_rstat
))
200 out
<< "/" << pi
->accounted_rstat
;
203 if (!in
.client_need_snapflush
.empty())
204 out
<< " need_snapflush=" << in
.client_need_snapflush
;
208 if (!in
.authlock
.is_sync_and_unlocked())
209 out
<< " " << in
.authlock
;
210 if (!in
.linklock
.is_sync_and_unlocked())
211 out
<< " " << in
.linklock
;
212 if (in
.inode
.is_dir()) {
213 if (!in
.dirfragtreelock
.is_sync_and_unlocked())
214 out
<< " " << in
.dirfragtreelock
;
215 if (!in
.snaplock
.is_sync_and_unlocked())
216 out
<< " " << in
.snaplock
;
217 if (!in
.nestlock
.is_sync_and_unlocked())
218 out
<< " " << in
.nestlock
;
219 if (!in
.policylock
.is_sync_and_unlocked())
220 out
<< " " << in
.policylock
;
222 if (!in
.flocklock
.is_sync_and_unlocked())
223 out
<< " " << in
.flocklock
;
225 if (!in
.filelock
.is_sync_and_unlocked())
226 out
<< " " << in
.filelock
;
227 if (!in
.xattrlock
.is_sync_and_unlocked())
228 out
<< " " << in
.xattrlock
;
229 if (!in
.versionlock
.is_sync_and_unlocked())
230 out
<< " " << in
.versionlock
;
232 // hack: spit out crap on which clients have caps
233 if (in
.inode
.client_ranges
.size())
234 out
<< " cr=" << in
.inode
.client_ranges
;
236 if (!in
.get_client_caps().empty()) {
239 for (const auto &p
: in
.get_client_caps()) {
240 if (!first
) out
<< ",";
241 out
<< p
.first
<< "="
242 << ccap_string(p
.second
.pending());
243 if (p
.second
.issued() != p
.second
.pending())
244 out
<< "/" << ccap_string(p
.second
.issued());
245 out
<< "/" << ccap_string(p
.second
.wanted())
246 << "@" << p
.second
.get_last_seq();
250 if (in
.get_loner() >= 0 || in
.get_wanted_loner() >= 0) {
251 out
<< ",l=" << in
.get_loner();
252 if (in
.get_loner() != in
.get_wanted_loner())
253 out
<< "(" << in
.get_wanted_loner() << ")";
256 if (!in
.get_mds_caps_wanted().empty()) {
259 for (const auto &p
: in
.get_mds_caps_wanted()) {
262 out
<< p
.first
<< '=' << ccap_string(p
.second
);
268 if (in
.get_num_ref()) {
270 in
.print_pin_set(out
);
273 if (in
.inode
.export_pin
!= MDS_RANK_NONE
) {
274 out
<< " export_pin=" << in
.inode
.export_pin
;
282 ostream
& operator<<(ostream
& out
, const CInode::scrub_stamp_info_t
& si
)
284 out
<< "{scrub_start_version: " << si
.scrub_start_version
285 << ", scrub_start_stamp: " << si
.scrub_start_stamp
286 << ", last_scrub_version: " << si
.last_scrub_version
287 << ", last_scrub_stamp: " << si
.last_scrub_stamp
;
291 CInode::CInode(MDCache
*c
, bool auth
, snapid_t f
, snapid_t l
)
297 item_open_file(this),
298 item_dirty_parent(this),
299 item_dirty_dirfrag_dir(this),
300 item_dirty_dirfrag_nest(this),
301 item_dirty_dirfrag_dirfragtree(this),
303 versionlock(this, &versionlock_type
),
304 authlock(this, &authlock_type
),
305 linklock(this, &linklock_type
),
306 dirfragtreelock(this, &dirfragtreelock_type
),
307 filelock(this, &filelock_type
),
308 xattrlock(this, &xattrlock_type
),
309 snaplock(this, &snaplock_type
),
310 nestlock(this, &nestlock_type
),
311 flocklock(this, &flocklock_type
),
312 policylock(this, &policylock_type
)
314 if (auth
) state_set(STATE_AUTH
);
317 void CInode::print(ostream
& out
)
322 void CInode::add_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
)
324 dout(10) << __func__
<< " client." << client
<< " snapid " << snapid
<< " on " << snapin
<< dendl
;
326 if (client_need_snapflush
.empty()) {
327 get(CInode::PIN_NEEDSNAPFLUSH
);
329 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
330 // long periods waiting for clients to flush their snaps.
331 auth_pin(this); // pin head inode...
334 auto &clients
= client_need_snapflush
[snapid
];
336 snapin
->auth_pin(this); // ...and pin snapped/old inode!
338 clients
.insert(client
);
341 void CInode::remove_need_snapflush(CInode
*snapin
, snapid_t snapid
, client_t client
)
343 dout(10) << __func__
<< " client." << client
<< " snapid " << snapid
<< " on " << snapin
<< dendl
;
344 auto it
= client_need_snapflush
.find(snapid
);
345 if (it
== client_need_snapflush
.end()) {
346 dout(10) << " snapid not found" << dendl
;
349 size_t n
= it
->second
.erase(client
);
351 dout(10) << " client not found" << dendl
;
354 if (it
->second
.empty()) {
355 client_need_snapflush
.erase(it
);
356 snapin
->auth_unpin(this);
358 if (client_need_snapflush
.empty()) {
359 put(CInode::PIN_NEEDSNAPFLUSH
);
365 pair
<bool,bool> CInode::split_need_snapflush(CInode
*cowin
, CInode
*in
)
367 dout(10) << __func__
<< " [" << cowin
->first
<< "," << cowin
->last
<< "] for " << *cowin
<< dendl
;
368 bool cowin_need_flush
= false;
369 bool orig_need_flush
= false;
370 auto it
= client_need_snapflush
.lower_bound(cowin
->first
);
371 while (it
!= client_need_snapflush
.end() && it
->first
< in
->first
) {
372 ceph_assert(!it
->second
.empty());
373 if (cowin
->last
>= it
->first
) {
374 cowin
->auth_pin(this);
375 cowin_need_flush
= true;
378 it
= client_need_snapflush
.erase(it
);
380 in
->auth_unpin(this);
383 if (it
!= client_need_snapflush
.end() && it
->first
<= in
->last
)
384 orig_need_flush
= true;
386 return make_pair(cowin_need_flush
, orig_need_flush
);
389 void CInode::mark_dirty_rstat()
391 if (!state_test(STATE_DIRTYRSTAT
)) {
392 dout(10) << __func__
<< dendl
;
393 state_set(STATE_DIRTYRSTAT
);
395 CDentry
*pdn
= get_projected_parent_dn();
396 if (pdn
->is_auth()) {
397 CDir
*pdir
= pdn
->dir
;
398 pdir
->dirty_rstat_inodes
.push_back(&dirty_rstat_item
);
399 mdcache
->mds
->locker
->mark_updated_scatterlock(&pdir
->inode
->nestlock
);
401 // under cross-MDS rename.
402 // DIRTYRSTAT flag will get cleared when rename finishes
403 ceph_assert(state_test(STATE_AMBIGUOUSAUTH
));
407 void CInode::clear_dirty_rstat()
409 if (state_test(STATE_DIRTYRSTAT
)) {
410 dout(10) << __func__
<< dendl
;
411 state_clear(STATE_DIRTYRSTAT
);
413 dirty_rstat_item
.remove_myself();
417 CInode::projected_inode
&CInode::project_inode(bool xattr
, bool snap
)
419 auto &pi
= projected_nodes
.empty() ?
420 projected_nodes
.emplace_back(inode
) :
421 projected_nodes
.emplace_back(projected_nodes
.back().inode
);
423 if (scrub_infop
&& scrub_infop
->last_scrub_dirty
) {
424 pi
.inode
.last_scrub_stamp
= scrub_infop
->last_scrub_stamp
;
425 pi
.inode
.last_scrub_version
= scrub_infop
->last_scrub_version
;
426 scrub_infop
->last_scrub_dirty
= false;
427 scrub_maybe_delete_info();
431 pi
.xattrs
.reset(new mempool_xattr_map(*get_projected_xattrs()));
432 ++num_projected_xattrs
;
439 dout(15) << __func__
<< " " << pi
.inode
.ino
<< dendl
;
443 void CInode::pop_and_dirty_projected_inode(LogSegment
*ls
)
445 ceph_assert(!projected_nodes
.empty());
446 auto &front
= projected_nodes
.front();
447 dout(15) << __func__
<< " " << front
.inode
.ino
448 << " v" << front
.inode
.version
<< dendl
;
449 int64_t old_pool
= inode
.layout
.pool_id
;
451 mark_dirty(front
.inode
.version
, ls
);
452 bool new_export_pin
= inode
.export_pin
!= front
.inode
.export_pin
;
455 maybe_export_pin(true);
457 if (inode
.is_backtrace_updated())
458 mark_dirty_parent(ls
, old_pool
!= inode
.layout
.pool_id
);
461 --num_projected_xattrs
;
462 xattrs
= *front
.xattrs
;
465 if (projected_nodes
.front().snapnode
!= projected_inode::UNDEF_SRNODE
) {
466 pop_projected_snaprealm(projected_nodes
.front().snapnode
, false);
467 --num_projected_srnodes
;
470 projected_nodes
.pop_front();
473 CInode::mempool_xattr_map
*CInode::get_projected_xattrs()
475 if (num_projected_xattrs
> 0) {
476 for (auto it
= projected_nodes
.rbegin(); it
!= projected_nodes
.rend(); ++it
)
478 return it
->xattrs
.get();
483 CInode::mempool_xattr_map
*CInode::get_previous_projected_xattrs()
485 if (num_projected_xattrs
> 0) {
486 for (auto it
= ++projected_nodes
.rbegin(); it
!= projected_nodes
.rend(); ++it
)
488 return it
->xattrs
.get();
493 sr_t
*CInode::prepare_new_srnode(snapid_t snapid
)
495 const sr_t
*cur_srnode
= get_projected_srnode();
499 new_srnode
= new sr_t(*cur_srnode
);
500 if (!new_srnode
->past_parents
.empty()) {
501 // convert past_parents to past_parent_snaps
502 ceph_assert(snaprealm
);
503 auto& snaps
= snaprealm
->get_snaps();
504 for (auto p
: snaps
) {
505 if (p
>= new_srnode
->current_parent_since
)
507 if (!new_srnode
->snaps
.count(p
))
508 new_srnode
->past_parent_snaps
.insert(p
);
510 new_srnode
->seq
= snaprealm
->get_newest_seq();
511 new_srnode
->past_parents
.clear();
514 snaprealm
->past_parents_dirty
= false;
517 snapid
= mdcache
->get_global_snaprealm()->get_newest_seq();
518 new_srnode
= new sr_t();
519 new_srnode
->seq
= snapid
;
520 new_srnode
->created
= snapid
;
521 new_srnode
->current_parent_since
= get_oldest_snap();
526 const sr_t
*CInode::get_projected_srnode() const {
527 if (num_projected_srnodes
> 0) {
528 for (auto it
= projected_nodes
.rbegin(); it
!= projected_nodes
.rend(); ++it
)
529 if (it
->snapnode
!= projected_inode::UNDEF_SRNODE
)
533 return &snaprealm
->srnode
;
538 void CInode::project_snaprealm(sr_t
*new_srnode
)
540 dout(10) << __func__
<< " " << new_srnode
<< dendl
;
541 ceph_assert(projected_nodes
.back().snapnode
== projected_inode::UNDEF_SRNODE
);
542 projected_nodes
.back().snapnode
= new_srnode
;
543 ++num_projected_srnodes
;
546 void CInode::mark_snaprealm_global(sr_t
*new_srnode
)
548 ceph_assert(!is_dir());
549 // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
550 new_srnode
->last_destroyed
= new_srnode
->current_parent_since
;
551 new_srnode
->current_parent_since
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
552 new_srnode
->mark_parent_global();
555 void CInode::clear_snaprealm_global(sr_t
*new_srnode
)
557 // restore 'current_parent_since'
558 new_srnode
->current_parent_since
= new_srnode
->last_destroyed
;
559 new_srnode
->last_destroyed
= 0;
560 new_srnode
->seq
= mdcache
->get_global_snaprealm()->get_newest_seq();
561 new_srnode
->clear_parent_global();
564 bool CInode::is_projected_snaprealm_global() const
566 const sr_t
*srnode
= get_projected_srnode();
567 if (srnode
&& srnode
->is_parent_global())
572 void CInode::project_snaprealm_past_parent(SnapRealm
*newparent
)
574 sr_t
*new_snap
= project_snaprealm();
575 record_snaprealm_past_parent(new_snap
, newparent
);
579 /* if newparent != parent, add parent to past_parents
580 if parent DNE, we need to find what the parent actually is and fill that in */
581 void CInode::record_snaprealm_past_parent(sr_t
*new_snap
, SnapRealm
*newparent
)
583 ceph_assert(!new_snap
->is_parent_global());
584 SnapRealm
*oldparent
;
586 oldparent
= find_snaprealm();
588 oldparent
= snaprealm
->parent
;
591 if (newparent
!= oldparent
) {
592 snapid_t oldparentseq
= oldparent
->get_newest_seq();
593 if (oldparentseq
+ 1 > new_snap
->current_parent_since
) {
594 // copy old parent's snaps
595 const set
<snapid_t
>& snaps
= oldparent
->get_snaps();
596 auto p
= snaps
.lower_bound(new_snap
->current_parent_since
);
597 if (p
!= snaps
.end())
598 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
599 if (oldparentseq
> new_snap
->seq
)
600 new_snap
->seq
= oldparentseq
;
602 new_snap
->current_parent_since
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
606 void CInode::record_snaprealm_parent_dentry(sr_t
*new_snap
, SnapRealm
*newparent
,
607 CDentry
*dn
, bool primary_dn
)
609 ceph_assert(new_snap
->is_parent_global());
610 SnapRealm
*oldparent
= dn
->get_dir()->inode
->find_snaprealm();
611 auto& snaps
= oldparent
->get_snaps();
614 auto p
= snaps
.lower_bound(dn
->first
);
615 if (p
!= snaps
.end())
616 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
617 } else if (newparent
!= oldparent
) {
618 // 'last_destroyed' is used as 'current_parent_since'
619 auto p
= snaps
.lower_bound(new_snap
->last_destroyed
);
620 if (p
!= snaps
.end())
621 new_snap
->past_parent_snaps
.insert(p
, snaps
.end());
622 new_snap
->last_destroyed
= mdcache
->get_global_snaprealm()->get_newest_seq() + 1;
626 void CInode::early_pop_projected_snaprealm()
628 ceph_assert(!projected_nodes
.empty());
629 if (projected_nodes
.front().snapnode
!= projected_inode::UNDEF_SRNODE
) {
630 pop_projected_snaprealm(projected_nodes
.front().snapnode
, true);
631 projected_nodes
.front().snapnode
= projected_inode::UNDEF_SRNODE
;
632 --num_projected_srnodes
;
636 void CInode::pop_projected_snaprealm(sr_t
*next_snaprealm
, bool early
)
638 if (next_snaprealm
) {
639 dout(10) << __func__
<< (early
? " (early) " : " ")
640 << next_snaprealm
<< " seq " << next_snaprealm
->seq
<< dendl
;
641 bool invalidate_cached_snaps
= false;
644 } else if (!snaprealm
->srnode
.past_parents
.empty()) {
645 invalidate_cached_snaps
= true;
646 // re-open past parents
647 snaprealm
->close_parents();
649 dout(10) << " realm " << *snaprealm
<< " past_parents " << snaprealm
->srnode
.past_parents
650 << " -> " << next_snaprealm
->past_parents
<< dendl
;
652 auto old_flags
= snaprealm
->srnode
.flags
;
653 snaprealm
->srnode
= *next_snaprealm
;
654 delete next_snaprealm
;
656 if ((snaprealm
->srnode
.flags
^ old_flags
) & sr_t::PARENT_GLOBAL
) {
657 snaprealm
->close_parents();
658 snaprealm
->adjust_parent();
661 // we should be able to open these up (or have them already be open).
662 bool ok
= snaprealm
->_open_parents(NULL
);
665 if (invalidate_cached_snaps
)
666 snaprealm
->invalidate_cached_snaps();
668 if (snaprealm
->parent
)
669 dout(10) << " realm " << *snaprealm
<< " parent " << *snaprealm
->parent
<< dendl
;
671 dout(10) << __func__
<< (early
? " (early) null" : " null") << dendl
;
672 ceph_assert(snaprealm
);
673 snaprealm
->merge_to(NULL
);
678 // ====== CInode =======
682 __u32
InodeStoreBase::hash_dentry_name(std::string_view dn
)
684 int which
= inode
.dir_layout
.dl_dir_hash
;
686 which
= CEPH_STR_HASH_LINUX
;
687 ceph_assert(ceph_str_hash_valid(which
));
688 return ceph_str_hash(which
, dn
.data(), dn
.length());
691 frag_t
InodeStoreBase::pick_dirfrag(std::string_view dn
)
693 if (dirfragtree
.empty())
694 return frag_t(); // avoid the string hash if we can.
696 __u32 h
= hash_dentry_name(dn
);
697 return dirfragtree
[h
];
700 std::pair
<bool, std::vector
<CDir
*>> CInode::get_dirfrags_under(frag_t fg
)
702 std::pair
<bool, std::vector
<CDir
*>> result
;
703 auto& all
= result
.first
;
704 auto& dirs
= result
.second
;
707 if (auto it
= dirfrags
.find(fg
); it
!= dirfrags
.end()){
709 dirs
.push_back(it
->second
);
714 for(auto &[_fg
, _dir
] : dirfrags
){
715 // frag_t.bits() can indicate the depth of the partition in the directory tree
717 // 01* : bit = 2, on the second floor
720 // 00* 01* 10* 11* -- > level 2, bit = 2
721 // so fragA.bits > fragB.bits means fragA is deeper than fragB
723 if (fg
.bits() >= _fg
.bits()) {
724 if (_fg
.contains(fg
)) {
729 if (fg
.contains(_fg
)) {
730 dirs
.push_back(_dir
);
731 // we can calculate how many sub slices a slice can be divided into
732 // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*)
733 // or 2^2 frags belonging to the second layer(00* 01* 10* 11*)
734 // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level
735 total
+= 1 << (24 - _fg
.bits());
740 // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache
741 all
= ((1<<(24-fg
.bits())) == total
);
745 void CInode::verify_dirfrags()
748 for (const auto &p
: dirfrags
) {
749 if (!dirfragtree
.is_leaf(p
.first
)) {
750 dout(0) << "have open dirfrag " << p
.first
<< " but not leaf in " << dirfragtree
751 << ": " << *p
.second
<< dendl
;
758 void CInode::force_dirfrags()
761 for (auto &p
: dirfrags
) {
762 if (!dirfragtree
.is_leaf(p
.first
)) {
763 dout(0) << "have open dirfrag " << p
.first
<< " but not leaf in " << dirfragtree
764 << ": " << *p
.second
<< dendl
;
771 dirfragtree
.get_leaves(leaves
);
772 for (const auto& leaf
: leaves
) {
773 mdcache
->get_force_dirfrag(dirfrag_t(ino(), leaf
), true);
780 CDir
*CInode::get_approx_dirfrag(frag_t fg
)
782 CDir
*dir
= get_dirfrag(fg
);
786 auto&& p
= get_dirfrags_under(fg
);
787 if (!p
.second
.empty())
788 return p
.second
.front();
791 while (fg
.bits() > 0) {
793 dir
= get_dirfrag(fg
);
799 CDir
*CInode::get_or_open_dirfrag(MDCache
*mdcache
, frag_t fg
)
801 ceph_assert(is_dir());
804 CDir
*dir
= get_dirfrag(fg
);
807 ceph_assert(is_auth() || mdcache
->mds
->is_any_replay());
808 dir
= new CDir(this, fg
, mdcache
, is_auth());
814 CDir
*CInode::add_dirfrag(CDir
*dir
)
816 auto em
= dirfrags
.emplace(std::piecewise_construct
, std::forward_as_tuple(dir
->dirfrag().frag
), std::forward_as_tuple(dir
));
817 ceph_assert(em
.second
);
819 if (stickydir_ref
> 0) {
820 dir
->state_set(CDir::STATE_STICKY
);
821 dir
->get(CDir::PIN_STICKY
);
829 void CInode::close_dirfrag(frag_t fg
)
831 dout(14) << __func__
<< " " << fg
<< dendl
;
832 ceph_assert(dirfrags
.count(fg
));
834 CDir
*dir
= dirfrags
[fg
];
835 dir
->remove_null_dentries();
841 if (stickydir_ref
> 0) {
842 dir
->state_clear(CDir::STATE_STICKY
);
843 dir
->put(CDir::PIN_STICKY
);
846 if (dir
->is_subtree_root())
849 // dump any remaining dentries, for debugging purposes
850 for (const auto &p
: dir
->items
)
851 dout(14) << __func__
<< " LEFTOVER dn " << *p
.second
<< dendl
;
853 ceph_assert(dir
->get_num_ref() == 0);
858 void CInode::close_dirfrags()
860 while (!dirfrags
.empty())
861 close_dirfrag(dirfrags
.begin()->first
);
864 bool CInode::has_subtree_root_dirfrag(int auth
)
866 if (num_subtree_roots
> 0) {
869 for (const auto &p
: dirfrags
) {
870 if (p
.second
->is_subtree_root() &&
871 p
.second
->dir_auth
.first
== auth
)
878 bool CInode::has_subtree_or_exporting_dirfrag()
880 if (num_subtree_roots
> 0 || num_exporting_dirs
> 0)
885 void CInode::get_stickydirs()
887 if (stickydir_ref
== 0) {
889 for (const auto &p
: dirfrags
) {
890 p
.second
->state_set(CDir::STATE_STICKY
);
891 p
.second
->get(CDir::PIN_STICKY
);
897 void CInode::put_stickydirs()
899 ceph_assert(stickydir_ref
> 0);
901 if (stickydir_ref
== 0) {
903 for (const auto &p
: dirfrags
) {
904 p
.second
->state_clear(CDir::STATE_STICKY
);
905 p
.second
->put(CDir::PIN_STICKY
);
916 void CInode::first_get()
920 parent
->get(CDentry::PIN_INODEPIN
);
923 void CInode::last_put()
927 parent
->put(CDentry::PIN_INODEPIN
);
932 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
933 mdcache
->maybe_eval_stray(this, true);
936 void CInode::add_remote_parent(CDentry
*p
)
938 if (remote_parents
.empty())
939 get(PIN_REMOTEPARENT
);
940 remote_parents
.insert(p
);
942 void CInode::remove_remote_parent(CDentry
*p
)
944 remote_parents
.erase(p
);
945 if (remote_parents
.empty())
946 put(PIN_REMOTEPARENT
);
952 CDir
*CInode::get_parent_dir()
958 CDir
*CInode::get_projected_parent_dir()
960 CDentry
*p
= get_projected_parent_dn();
965 CInode
*CInode::get_parent_inode()
968 return parent
->dir
->inode
;
972 bool CInode::is_ancestor_of(const CInode
*other
) const
977 const CDentry
*pdn
= other
->get_oldest_parent_dn();
979 ceph_assert(other
->is_base());
982 other
= pdn
->get_dir()->get_inode();
987 bool CInode::is_projected_ancestor_of(const CInode
*other
) const
992 const CDentry
*pdn
= other
->get_projected_parent_dn();
994 ceph_assert(other
->is_base());
997 other
= pdn
->get_dir()->get_inode();
1003 * Because a non-directory inode may have multiple links, the use_parent
1004 * argument allows selecting which parent to use for path construction. This
1005 * argument is only meaningful for the final component (i.e. the first of the
1006 * nested calls) because directories cannot have multiple hard links. If
1007 * use_parent is NULL and projected is true, the primary parent's projected
1008 * inode is used all the way up the path chain. Otherwise the primary parent
1009 * stable inode is used.
1011 void CInode::make_path_string(string
& s
, bool projected
, const CDentry
*use_parent
) const
1014 use_parent
= projected
? get_projected_parent_dn() : parent
;
1018 use_parent
->make_path_string(s
, projected
);
1019 } else if (is_root()) {
1021 } else if (is_mdsdir()) {
1023 uint64_t eino(ino());
1024 eino
-= MDS_INO_MDSDIR_OFFSET
;
1025 snprintf(t
, sizeof(t
), "~mds%" PRId64
, eino
);
1029 uint64_t eino(ino());
1030 snprintf(n
, sizeof(n
), "#%" PRIx64
, eino
);
1035 void CInode::make_path(filepath
& fp
, bool projected
) const
1037 const CDentry
*use_parent
= projected
? get_projected_parent_dn() : parent
;
1039 ceph_assert(!is_base());
1040 use_parent
->make_path(fp
, projected
);
1042 fp
= filepath(ino());
1046 void CInode::name_stray_dentry(string
& dname
)
1049 snprintf(s
, sizeof(s
), "%llx", (unsigned long long)inode
.ino
.val
);
1053 version_t
CInode::pre_dirty()
1056 CDentry
* _cdentry
= get_projected_parent_dn();
1058 pv
= _cdentry
->pre_dirty(get_projected_version());
1059 dout(10) << "pre_dirty " << pv
<< " (current v " << inode
.version
<< ")" << dendl
;
1061 ceph_assert(is_base());
1062 pv
= get_projected_version() + 1;
1064 // force update backtrace for old format inode (see mempool_inode::decode)
1065 if (inode
.backtrace_version
== 0 && !projected_nodes
.empty()) {
1066 mempool_inode
&pi
= projected_nodes
.back().inode
;
1067 if (pi
.backtrace_version
== 0)
1068 pi
.update_backtrace(pv
);
1073 void CInode::_mark_dirty(LogSegment
*ls
)
1075 if (!state_test(STATE_DIRTY
)) {
1076 state_set(STATE_DIRTY
);
1081 // move myself to this segment's dirty list
1083 ls
->dirty_inodes
.push_back(&item_dirty
);
1086 void CInode::mark_dirty(version_t pv
, LogSegment
*ls
) {
1088 dout(10) << __func__
<< " " << *this << dendl
;
1091 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1092 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1096 // only auth can get dirty. "dirty" async data in replicas is relative to
1097 // filelock state, not the dirty flag.
1098 ceph_assert(is_auth());
1100 // touch my private version
1101 ceph_assert(inode
.version
< pv
);
1107 parent
->mark_dirty(pv
, ls
);
1111 void CInode::mark_clean()
1113 dout(10) << __func__
<< " " << *this << dendl
;
1114 if (state_test(STATE_DIRTY
)) {
1115 state_clear(STATE_DIRTY
);
1118 // remove myself from ls dirty list
1119 item_dirty
.remove_myself();
1125 // per-inode storage
1126 // (currently for root inode only)
1128 struct C_IO_Inode_Stored
: public CInodeIOContext
{
1131 C_IO_Inode_Stored(CInode
*i
, version_t v
, Context
*f
) : CInodeIOContext(i
), version(v
), fin(f
) {}
1132 void finish(int r
) override
{
1133 in
->_stored(r
, version
, fin
);
1135 void print(ostream
& out
) const override
{
1136 out
<< "inode_store(" << in
->ino() << ")";
1140 object_t
InodeStoreBase::get_object_name(inodeno_t ino
, frag_t fg
, std::string_view suffix
)
1143 snprintf(n
, sizeof(n
), "%llx.%08llx", (long long unsigned)ino
, (long long unsigned)fg
);
1144 ceph_assert(strlen(n
) + suffix
.size() < sizeof n
);
1145 strncat(n
, suffix
.data(), suffix
.size());
1149 void CInode::store(MDSContext
*fin
)
1151 dout(10) << __func__
<< " " << get_version() << dendl
;
1152 ceph_assert(is_base());
1155 purge_stale_snap_data(snaprealm
->get_snaps());
1159 string magic
= CEPH_FS_ONDISK_MAGIC
;
1162 encode_store(bl
, mdcache
->mds
->mdsmap
->get_up_features());
1169 object_t oid
= CInode::get_object_name(ino(), frag_t(), ".inode");
1170 object_locator_t
oloc(mdcache
->mds
->mdsmap
->get_metadata_pool());
1173 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin
),
1174 mdcache
->mds
->finisher
);
1175 mdcache
->mds
->objecter
->mutate(oid
, oloc
, m
, snapc
,
1176 ceph::real_clock::now(), 0,
1180 void CInode::_stored(int r
, version_t v
, Context
*fin
)
1183 dout(1) << "store error " << r
<< " v " << v
<< " on " << *this << dendl
;
1184 mdcache
->mds
->clog
->error() << "failed to store inode " << ino()
1185 << " object: " << cpp_strerror(r
);
1186 mdcache
->mds
->handle_write_error(r
);
1191 dout(10) << __func__
<< " " << v
<< " on " << *this << dendl
;
1192 if (v
== get_projected_version())
1198 void CInode::flush(MDSContext
*fin
)
1200 dout(10) << __func__
<< " " << *this << dendl
;
1201 ceph_assert(is_auth() && can_auth_pin());
1203 MDSGatherBuilder
gather(g_ceph_context
);
1205 if (is_dirty_parent()) {
1206 store_backtrace(gather
.new_sub());
1210 store(gather
.new_sub());
1212 parent
->dir
->commit(0, gather
.new_sub());
1216 if (gather
.has_subs()) {
1217 gather
.set_finisher(fin
);
1224 struct C_IO_Inode_Fetched
: public CInodeIOContext
{
1227 C_IO_Inode_Fetched(CInode
*i
, Context
*f
) : CInodeIOContext(i
), fin(f
) {}
1228 void finish(int r
) override
{
1229 // Ignore 'r', because we fetch from two places, so r is usually ENOENT
1230 in
->_fetched(bl
, bl2
, fin
);
1232 void print(ostream
& out
) const override
{
1233 out
<< "inode_fetch(" << in
->ino() << ")";
1237 void CInode::fetch(MDSContext
*fin
)
1239 dout(10) << __func__
<< dendl
;
1241 C_IO_Inode_Fetched
*c
= new C_IO_Inode_Fetched(this, fin
);
1242 C_GatherBuilder
gather(g_ceph_context
, new C_OnFinisher(c
, mdcache
->mds
->finisher
));
1244 object_t oid
= CInode::get_object_name(ino(), frag_t(), "");
1245 object_locator_t
oloc(mdcache
->mds
->mdsmap
->get_metadata_pool());
1247 // Old on-disk format: inode stored in xattr of a dirfrag
1249 rd
.getxattr("inode", &c
->bl
, NULL
);
1250 mdcache
->mds
->objecter
->read(oid
, oloc
, rd
, CEPH_NOSNAP
, (bufferlist
*)NULL
, 0, gather
.new_sub());
1252 // Current on-disk format: inode stored in a .inode object
1253 object_t oid2
= CInode::get_object_name(ino(), frag_t(), ".inode");
1254 mdcache
->mds
->objecter
->read(oid2
, oloc
, 0, 0, CEPH_NOSNAP
, &c
->bl2
, 0, gather
.new_sub());
1259 void CInode::_fetched(bufferlist
& bl
, bufferlist
& bl2
, Context
*fin
)
1261 dout(10) << __func__
<< " got " << bl
.length() << " and " << bl2
.length() << dendl
;
1262 bufferlist::const_iterator p
;
1265 } else if (bl
.length()) {
1268 derr
<< "No data while reading inode " << ino() << dendl
;
1269 fin
->complete(-ENOENT
);
1278 dout(10) << " magic is '" << magic
<< "' (expecting '"
1279 << CEPH_FS_ONDISK_MAGIC
<< "')" << dendl
;
1280 if (magic
!= CEPH_FS_ONDISK_MAGIC
) {
1281 dout(0) << "on disk magic '" << magic
<< "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1283 fin
->complete(-EINVAL
);
1286 dout(10) << "_fetched " << *this << dendl
;
1289 } catch (buffer::error
&err
) {
1290 derr
<< "Corrupt inode " << ino() << ": " << err
<< dendl
;
1291 fin
->complete(-EINVAL
);
1296 void CInode::build_backtrace(int64_t pool
, inode_backtrace_t
& bt
)
1299 bt
.ancestors
.clear();
1303 CDentry
*pdn
= get_parent_dn();
1305 CInode
*diri
= pdn
->get_dir()->get_inode();
1306 bt
.ancestors
.push_back(inode_backpointer_t(diri
->ino(), pdn
->get_name(), in
->inode
.version
));
1308 pdn
= in
->get_parent_dn();
1310 for (auto &p
: inode
.old_pools
) {
1311 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
1313 bt
.old_pools
.insert(p
);
1317 struct C_IO_Inode_StoredBacktrace
: public CInodeIOContext
{
1320 C_IO_Inode_StoredBacktrace(CInode
*i
, version_t v
, Context
*f
) : CInodeIOContext(i
), version(v
), fin(f
) {}
1321 void finish(int r
) override
{
1322 in
->_stored_backtrace(r
, version
, fin
);
1324 void print(ostream
& out
) const override
{
1325 out
<< "backtrace_store(" << in
->ino() << ")";
1329 void CInode::store_backtrace(MDSContext
*fin
, int op_prio
)
1331 dout(10) << __func__
<< " on " << *this << dendl
;
1332 ceph_assert(is_dirty_parent());
1335 op_prio
= CEPH_MSG_PRIO_DEFAULT
;
1339 const int64_t pool
= get_backtrace_pool();
1340 inode_backtrace_t bt
;
1341 build_backtrace(pool
, bt
);
1342 bufferlist parent_bl
;
1344 encode(bt
, parent_bl
);
1347 op
.priority
= op_prio
;
1349 op
.setxattr("parent", parent_bl
);
1351 bufferlist layout_bl
;
1352 encode(inode
.layout
, layout_bl
, mdcache
->mds
->mdsmap
->get_up_features());
1353 op
.setxattr("layout", layout_bl
);
1356 object_t oid
= get_object_name(ino(), frag_t(), "");
1357 object_locator_t
oloc(pool
);
1358 Context
*fin2
= new C_OnFinisher(
1359 new C_IO_Inode_StoredBacktrace(this, inode
.backtrace_version
, fin
),
1360 mdcache
->mds
->finisher
);
1362 if (!state_test(STATE_DIRTYPOOL
) || inode
.old_pools
.empty()) {
1363 dout(20) << __func__
<< ": no dirtypool or no old pools" << dendl
;
1364 mdcache
->mds
->objecter
->mutate(oid
, oloc
, op
, snapc
,
1365 ceph::real_clock::now(),
1370 C_GatherBuilder
gather(g_ceph_context
, fin2
);
1371 mdcache
->mds
->objecter
->mutate(oid
, oloc
, op
, snapc
,
1372 ceph::real_clock::now(),
1373 0, gather
.new_sub());
1375 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1376 // such that anyone reading them will see the new pool ID in
1377 // inode_backtrace_t::pool and go read everything else from there.
1378 for (const auto &p
: inode
.old_pools
) {
1382 dout(20) << __func__
<< ": updating old pool " << p
<< dendl
;
1385 op
.priority
= op_prio
;
1387 op
.setxattr("parent", parent_bl
);
1389 object_locator_t
oloc(p
);
1390 mdcache
->mds
->objecter
->mutate(oid
, oloc
, op
, snapc
,
1391 ceph::real_clock::now(),
1392 0, gather
.new_sub());
1397 void CInode::_stored_backtrace(int r
, version_t v
, Context
*fin
)
1400 const int64_t pool
= get_backtrace_pool();
1401 bool exists
= mdcache
->mds
->objecter
->with_osdmap(
1402 [pool
](const OSDMap
&osd_map
) {
1403 return osd_map
.have_pg_pool(pool
);
1406 // This ENOENT is because the pool doesn't exist (the user deleted it
1407 // out from under us), so the backtrace can never be written, so pretend
1408 // to succeed so that the user can proceed to e.g. delete the file.
1410 dout(4) << __func__
<< " got ENOENT: a data pool was deleted "
1411 "beneath us!" << dendl
;
1417 dout(1) << "store backtrace error " << r
<< " v " << v
<< dendl
;
1418 mdcache
->mds
->clog
->error() << "failed to store backtrace on ino "
1419 << ino() << " object"
1420 << ", pool " << get_backtrace_pool()
1422 mdcache
->mds
->handle_write_error(r
);
1428 dout(10) << __func__
<< " v " << v
<< dendl
;
1431 if (v
== inode
.backtrace_version
)
1432 clear_dirty_parent();
1437 void CInode::fetch_backtrace(Context
*fin
, bufferlist
*backtrace
)
1439 mdcache
->fetch_backtrace(inode
.ino
, get_backtrace_pool(), *backtrace
, fin
);
1442 void CInode::mark_dirty_parent(LogSegment
*ls
, bool dirty_pool
)
1444 if (!state_test(STATE_DIRTYPARENT
)) {
1445 dout(10) << __func__
<< dendl
;
1446 state_set(STATE_DIRTYPARENT
);
1447 get(PIN_DIRTYPARENT
);
1451 state_set(STATE_DIRTYPOOL
);
1453 ls
->dirty_parent_inodes
.push_back(&item_dirty_parent
);
1456 void CInode::clear_dirty_parent()
1458 if (state_test(STATE_DIRTYPARENT
)) {
1459 dout(10) << __func__
<< dendl
;
1460 state_clear(STATE_DIRTYPARENT
);
1461 state_clear(STATE_DIRTYPOOL
);
1462 put(PIN_DIRTYPARENT
);
1463 item_dirty_parent
.remove_myself();
1467 void CInode::verify_diri_backtrace(bufferlist
&bl
, int err
)
1469 if (is_base() || is_dirty_parent() || !is_auth())
1472 dout(10) << __func__
<< dendl
;
1475 inode_backtrace_t backtrace
;
1477 decode(backtrace
, bl
);
1478 CDentry
*pdn
= get_parent_dn();
1479 if (backtrace
.ancestors
.empty() ||
1480 backtrace
.ancestors
[0].dname
!= pdn
->get_name() ||
1481 backtrace
.ancestors
[0].dirino
!= pdn
->get_dir()->ino())
1486 MDSRank
*mds
= mdcache
->mds
;
1487 mds
->clog
->error() << "bad backtrace on directory inode " << ino();
1488 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace
> 1));
1490 mark_dirty_parent(mds
->mdlog
->get_current_segment(), false);
1491 mds
->mdlog
->flush();
1495 // ------------------
1499 void InodeStoreBase::encode_bare(bufferlist
&bl
, uint64_t features
,
1500 const bufferlist
*snap_blob
) const
1503 encode(inode
, bl
, features
);
1505 encode(symlink
, bl
);
1506 encode(dirfragtree
, bl
);
1509 encode(*snap_blob
, bl
);
1511 encode(bufferlist(), bl
);
1512 encode(old_inodes
, bl
, features
);
1513 encode(oldest_snap
, bl
);
1514 encode(damage_flags
, bl
);
1517 void InodeStoreBase::encode(bufferlist
&bl
, uint64_t features
,
1518 const bufferlist
*snap_blob
) const
1520 ENCODE_START(6, 4, bl
);
1521 encode_bare(bl
, features
, snap_blob
);
1525 void CInode::encode_store(bufferlist
& bl
, uint64_t features
)
1527 bufferlist snap_blob
;
1528 encode_snap_blob(snap_blob
);
1529 InodeStoreBase::encode(bl
, mdcache
->mds
->mdsmap
->get_up_features(),
1533 void InodeStoreBase::decode_bare(bufferlist::const_iterator
&bl
,
1534 bufferlist
& snap_blob
, __u8 struct_v
)
1541 symlink
= std::string_view(tmp
);
1543 decode(dirfragtree
, bl
);
1545 decode(snap_blob
, bl
);
1547 decode(old_inodes
, bl
);
1548 if (struct_v
== 2 && inode
.is_dir()) {
1549 bool default_layout_exists
;
1550 decode(default_layout_exists
, bl
);
1551 if (default_layout_exists
) {
1552 decode(struct_v
, bl
); // this was a default_file_layout
1553 decode(inode
.layout
, bl
); // but we only care about the layout portion
1557 if (struct_v
>= 5) {
1558 // InodeStore is embedded in dentries without proper versioning, so
1559 // we consume up to the end of the buffer
1561 decode(oldest_snap
, bl
);
1565 decode(damage_flags
, bl
);
1571 void InodeStoreBase::decode(bufferlist::const_iterator
&bl
, bufferlist
& snap_blob
)
1573 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl
);
1574 decode_bare(bl
, snap_blob
, struct_v
);
1578 void CInode::decode_store(bufferlist::const_iterator
& bl
)
1580 bufferlist snap_blob
;
1581 InodeStoreBase::decode(bl
, snap_blob
);
1582 decode_snap_blob(snap_blob
);
1585 // ------------------
1588 SimpleLock
* CInode::get_lock(int type
)
1591 case CEPH_LOCK_IVERSION
: return &versionlock
;
1592 case CEPH_LOCK_IFILE
: return &filelock
;
1593 case CEPH_LOCK_IAUTH
: return &authlock
;
1594 case CEPH_LOCK_ILINK
: return &linklock
;
1595 case CEPH_LOCK_IDFT
: return &dirfragtreelock
;
1596 case CEPH_LOCK_IXATTR
: return &xattrlock
;
1597 case CEPH_LOCK_ISNAP
: return &snaplock
;
1598 case CEPH_LOCK_INEST
: return &nestlock
;
1599 case CEPH_LOCK_IFLOCK
: return &flocklock
;
1600 case CEPH_LOCK_IPOLICY
: return &policylock
;
1605 void CInode::set_object_info(MDSCacheObjectInfo
&info
)
1611 void CInode::encode_lock_iauth(bufferlist
& bl
)
1613 ENCODE_START(1, 1, bl
);
1614 encode(inode
.version
, bl
);
1615 encode(inode
.ctime
, bl
);
1616 encode(inode
.mode
, bl
);
1617 encode(inode
.uid
, bl
);
1618 encode(inode
.gid
, bl
);
1622 void CInode::decode_lock_iauth(bufferlist::const_iterator
& p
)
1625 decode(inode
.version
, p
);
1628 if (inode
.ctime
< tm
) inode
.ctime
= tm
;
1629 decode(inode
.mode
, p
);
1630 decode(inode
.uid
, p
);
1631 decode(inode
.gid
, p
);
1635 void CInode::encode_lock_ilink(bufferlist
& bl
)
1637 ENCODE_START(1, 1, bl
);
1638 encode(inode
.version
, bl
);
1639 encode(inode
.ctime
, bl
);
1640 encode(inode
.nlink
, bl
);
1644 void CInode::decode_lock_ilink(bufferlist::const_iterator
& p
)
1647 decode(inode
.version
, p
);
1650 if (inode
.ctime
< tm
) inode
.ctime
= tm
;
1651 decode(inode
.nlink
, p
);
1655 void CInode::encode_lock_idft(bufferlist
& bl
)
1657 ENCODE_START(1, 1, bl
);
1659 encode(inode
.version
, bl
);
1661 // treat flushing as dirty when rejoining cache
1662 bool dirty
= dirfragtreelock
.is_dirty_or_flushing();
1666 // encode the raw tree
1667 encode(dirfragtree
, bl
);
1669 // also specify which frags are mine
1670 set
<frag_t
> myfrags
;
1671 auto&& dfls
= get_dirfrags();
1672 for (const auto& dir
: dfls
) {
1673 if (dir
->is_auth()) {
1674 frag_t fg
= dir
->get_frag();
1678 encode(myfrags
, bl
);
1683 void CInode::decode_lock_idft(bufferlist::const_iterator
& p
)
1688 decode(replica_dirty
, p
);
1689 if (replica_dirty
) {
1690 dout(10) << __func__
<< " setting dftlock dirty flag" << dendl
;
1691 dirfragtreelock
.mark_dirty(); // ok bc we're auth and caller will handle
1694 decode(inode
.version
, p
);
1699 set
<frag_t
> authfrags
;
1700 decode(authfrags
, p
);
1702 // auth. believe replica's auth frags only.
1703 for (auto fg
: authfrags
) {
1704 if (!dirfragtree
.is_leaf(fg
)) {
1705 dout(10) << " forcing frag " << fg
<< " to leaf (split|merge)" << dendl
;
1706 dirfragtree
.force_to_leaf(g_ceph_context
, fg
);
1707 dirfragtreelock
.mark_dirty(); // ok bc we're auth and caller will handle
1711 // replica. take the new tree, BUT make sure any open
1712 // dirfrags remain leaves (they may have split _after_ this
1713 // dft was scattered, or we may still be be waiting on the
1714 // notify from the auth)
1715 dirfragtree
.swap(temp
);
1716 for (const auto &p
: dirfrags
) {
1717 if (!dirfragtree
.is_leaf(p
.first
)) {
1718 dout(10) << " forcing open dirfrag " << p
.first
<< " to leaf (racing with split|merge)" << dendl
;
1719 dirfragtree
.force_to_leaf(g_ceph_context
, p
.first
);
1721 if (p
.second
->is_auth())
1722 p
.second
->state_clear(CDir::STATE_DIRTYDFT
);
1725 if (g_conf()->mds_debug_frag
)
1731 void CInode::encode_lock_ifile(bufferlist
& bl
)
1733 ENCODE_START(1, 1, bl
);
1735 encode(inode
.version
, bl
);
1736 encode(inode
.ctime
, bl
);
1737 encode(inode
.mtime
, bl
);
1738 encode(inode
.atime
, bl
);
1739 encode(inode
.time_warp_seq
, bl
);
1741 encode(inode
.layout
, bl
, mdcache
->mds
->mdsmap
->get_up_features());
1742 encode(inode
.size
, bl
);
1743 encode(inode
.truncate_seq
, bl
);
1744 encode(inode
.truncate_size
, bl
);
1745 encode(inode
.client_ranges
, bl
);
1746 encode(inode
.inline_data
, bl
);
1749 // treat flushing as dirty when rejoining cache
1750 bool dirty
= filelock
.is_dirty_or_flushing();
1753 dout(15) << __func__
<< " inode.dirstat is " << inode
.dirstat
<< dendl
;
1754 encode(inode
.dirstat
, bl
); // only meaningful if i am auth.
1757 for (const auto &p
: dirfrags
) {
1758 frag_t fg
= p
.first
;
1759 CDir
*dir
= p
.second
;
1760 if (is_auth() || dir
->is_auth()) {
1761 fnode_t
*pf
= dir
->get_projected_fnode();
1762 dout(15) << fg
<< " " << *dir
<< dendl
;
1763 dout(20) << fg
<< " fragstat " << pf
->fragstat
<< dendl
;
1764 dout(20) << fg
<< " accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
1766 encode(dir
->first
, tmp
);
1767 encode(pf
->fragstat
, tmp
);
1768 encode(pf
->accounted_fragstat
, tmp
);
1773 bl
.claim_append(tmp
);
1777 void CInode::decode_lock_ifile(bufferlist::const_iterator
& p
)
1781 decode(inode
.version
, p
);
1784 if (inode
.ctime
< tm
) inode
.ctime
= tm
;
1785 decode(inode
.mtime
, p
);
1786 decode(inode
.atime
, p
);
1787 decode(inode
.time_warp_seq
, p
);
1789 decode(inode
.layout
, p
);
1790 decode(inode
.size
, p
);
1791 decode(inode
.truncate_seq
, p
);
1792 decode(inode
.truncate_size
, p
);
1793 decode(inode
.client_ranges
, p
);
1794 decode(inode
.inline_data
, p
);
1798 decode(replica_dirty
, p
);
1799 if (replica_dirty
) {
1800 dout(10) << __func__
<< " setting filelock dirty flag" << dendl
;
1801 filelock
.mark_dirty(); // ok bc we're auth and caller will handle
1805 frag_info_t dirstat
;
1808 dout(10) << " taking inode dirstat " << dirstat
<< " for " << *this << dendl
;
1809 inode
.dirstat
= dirstat
; // take inode summation if replica
1813 dout(10) << " ...got " << n
<< " fragstats on " << *this << dendl
;
1817 frag_info_t fragstat
;
1818 frag_info_t accounted_fragstat
;
1821 decode(fragstat
, p
);
1822 decode(accounted_fragstat
, p
);
1823 dout(10) << fg
<< " [" << fgfirst
<< ",head] " << dendl
;
1824 dout(10) << fg
<< " fragstat " << fragstat
<< dendl
;
1825 dout(20) << fg
<< " accounted_fragstat " << accounted_fragstat
<< dendl
;
1827 CDir
*dir
= get_dirfrag(fg
);
1829 ceph_assert(dir
); // i am auth; i had better have this dir open
1830 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1831 << " on " << *dir
<< dendl
;
1832 dir
->first
= fgfirst
;
1833 dir
->fnode
.fragstat
= fragstat
;
1834 dir
->fnode
.accounted_fragstat
= accounted_fragstat
;
1835 if (!(fragstat
== accounted_fragstat
)) {
1836 dout(10) << fg
<< " setting filelock updated flag" << dendl
;
1837 filelock
.mark_dirty(); // ok bc we're auth and caller will handle
1840 if (dir
&& dir
->is_auth()) {
1841 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1842 << " on " << *dir
<< dendl
;
1843 dir
->first
= fgfirst
;
1844 fnode_t
*pf
= dir
->get_projected_fnode();
1845 finish_scatter_update(&filelock
, dir
,
1846 inode
.dirstat
.version
, pf
->accounted_fragstat
.version
);
1853 void CInode::encode_lock_inest(bufferlist
& bl
)
1855 ENCODE_START(1, 1, bl
);
1857 encode(inode
.version
, bl
);
1859 // treat flushing as dirty when rejoining cache
1860 bool dirty
= nestlock
.is_dirty_or_flushing();
1863 dout(15) << __func__
<< " inode.rstat is " << inode
.rstat
<< dendl
;
1864 encode(inode
.rstat
, bl
); // only meaningful if i am auth.
1867 for (const auto &p
: dirfrags
) {
1868 frag_t fg
= p
.first
;
1869 CDir
*dir
= p
.second
;
1870 if (is_auth() || dir
->is_auth()) {
1871 fnode_t
*pf
= dir
->get_projected_fnode();
1872 dout(10) << __func__
<< " " << fg
<< " dir " << *dir
<< dendl
;
1873 dout(10) << __func__
<< " " << fg
<< " rstat " << pf
->rstat
<< dendl
;
1874 dout(10) << __func__
<< " " << fg
<< " accounted_rstat " << pf
->rstat
<< dendl
;
1875 dout(10) << __func__
<< " " << fg
<< " dirty_old_rstat " << dir
->dirty_old_rstat
<< dendl
;
1877 encode(dir
->first
, tmp
);
1878 encode(pf
->rstat
, tmp
);
1879 encode(pf
->accounted_rstat
, tmp
);
1880 encode(dir
->dirty_old_rstat
, tmp
);
1885 bl
.claim_append(tmp
);
1889 void CInode::decode_lock_inest(bufferlist::const_iterator
& p
)
1894 decode(replica_dirty
, p
);
1895 if (replica_dirty
) {
1896 dout(10) << __func__
<< " setting nestlock dirty flag" << dendl
;
1897 nestlock
.mark_dirty(); // ok bc we're auth and caller will handle
1900 decode(inode
.version
, p
);
1905 dout(10) << __func__
<< " taking inode rstat " << rstat
<< " for " << *this << dendl
;
1906 inode
.rstat
= rstat
; // take inode summation if replica
1914 nest_info_t accounted_rstat
;
1915 decltype(CDir::dirty_old_rstat
) dirty_old_rstat
;
1919 decode(accounted_rstat
, p
);
1920 decode(dirty_old_rstat
, p
);
1921 dout(10) << __func__
<< " " << fg
<< " [" << fgfirst
<< ",head]" << dendl
;
1922 dout(10) << __func__
<< " " << fg
<< " rstat " << rstat
<< dendl
;
1923 dout(10) << __func__
<< " " << fg
<< " accounted_rstat " << accounted_rstat
<< dendl
;
1924 dout(10) << __func__
<< " " << fg
<< " dirty_old_rstat " << dirty_old_rstat
<< dendl
;
1925 CDir
*dir
= get_dirfrag(fg
);
1927 ceph_assert(dir
); // i am auth; i had better have this dir open
1928 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1929 << " on " << *dir
<< dendl
;
1930 dir
->first
= fgfirst
;
1931 dir
->fnode
.rstat
= rstat
;
1932 dir
->fnode
.accounted_rstat
= accounted_rstat
;
1933 dir
->dirty_old_rstat
.swap(dirty_old_rstat
);
1934 if (!(rstat
== accounted_rstat
) || !dir
->dirty_old_rstat
.empty()) {
1935 dout(10) << fg
<< " setting nestlock updated flag" << dendl
;
1936 nestlock
.mark_dirty(); // ok bc we're auth and caller will handle
1939 if (dir
&& dir
->is_auth()) {
1940 dout(10) << fg
<< " first " << dir
->first
<< " -> " << fgfirst
1941 << " on " << *dir
<< dendl
;
1942 dir
->first
= fgfirst
;
1943 fnode_t
*pf
= dir
->get_projected_fnode();
1944 finish_scatter_update(&nestlock
, dir
,
1945 inode
.rstat
.version
, pf
->accounted_rstat
.version
);
1952 void CInode::encode_lock_ixattr(bufferlist
& bl
)
1954 ENCODE_START(1, 1, bl
);
1955 encode(inode
.version
, bl
);
1956 encode(inode
.ctime
, bl
);
1961 void CInode::decode_lock_ixattr(bufferlist::const_iterator
& p
)
1964 decode(inode
.version
, p
);
1967 if (inode
.ctime
< tm
) inode
.ctime
= tm
;
1972 void CInode::encode_lock_isnap(bufferlist
& bl
)
1974 ENCODE_START(1, 1, bl
);
1975 encode(inode
.version
, bl
);
1976 encode(inode
.ctime
, bl
);
1981 void CInode::decode_lock_isnap(bufferlist::const_iterator
& p
)
1984 decode(inode
.version
, p
);
1987 if (inode
.ctime
< tm
) inode
.ctime
= tm
;
1992 void CInode::encode_lock_iflock(bufferlist
& bl
)
1994 ENCODE_START(1, 1, bl
);
1995 encode(inode
.version
, bl
);
1996 _encode_file_locks(bl
);
2000 void CInode::decode_lock_iflock(bufferlist::const_iterator
& p
)
2003 decode(inode
.version
, p
);
2004 _decode_file_locks(p
);
2008 void CInode::encode_lock_ipolicy(bufferlist
& bl
)
2010 ENCODE_START(1, 1, bl
);
2011 if (inode
.is_dir()) {
2012 encode(inode
.version
, bl
);
2013 encode(inode
.ctime
, bl
);
2014 encode(inode
.layout
, bl
, mdcache
->mds
->mdsmap
->get_up_features());
2015 encode(inode
.quota
, bl
);
2016 encode(inode
.export_pin
, bl
);
2021 void CInode::decode_lock_ipolicy(bufferlist::const_iterator
& p
)
2024 if (inode
.is_dir()) {
2025 decode(inode
.version
, p
);
2028 if (inode
.ctime
< tm
) inode
.ctime
= tm
;
2029 decode(inode
.layout
, p
);
2030 decode(inode
.quota
, p
);
2031 mds_rank_t old_pin
= inode
.export_pin
;
2032 decode(inode
.export_pin
, p
);
2033 maybe_export_pin(old_pin
!= inode
.export_pin
);
2038 void CInode::encode_lock_state(int type
, bufferlist
& bl
)
2040 ENCODE_START(1, 1, bl
);
2043 encode(parent
->first
, bl
);
2046 case CEPH_LOCK_IAUTH
:
2047 encode_lock_iauth(bl
);
2050 case CEPH_LOCK_ILINK
:
2051 encode_lock_ilink(bl
);
2054 case CEPH_LOCK_IDFT
:
2055 encode_lock_idft(bl
);
2058 case CEPH_LOCK_IFILE
:
2059 encode_lock_ifile(bl
);
2062 case CEPH_LOCK_INEST
:
2063 encode_lock_inest(bl
);
2066 case CEPH_LOCK_IXATTR
:
2067 encode_lock_ixattr(bl
);
2070 case CEPH_LOCK_ISNAP
:
2071 encode_lock_isnap(bl
);
2074 case CEPH_LOCK_IFLOCK
:
2075 encode_lock_iflock(bl
);
2078 case CEPH_LOCK_IPOLICY
:
2079 encode_lock_ipolicy(bl
);
2088 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2090 void CInode::decode_lock_state(int type
, const bufferlist
& bl
)
2092 auto p
= bl
.cbegin();
2099 decode(newfirst
, p
);
2100 if (!is_auth() && newfirst
!= first
) {
2101 dout(10) << __func__
<< " first " << first
<< " -> " << newfirst
<< dendl
;
2105 decode(newfirst
, p
);
2106 if (!parent
->is_auth() && newfirst
!= parent
->first
) {
2107 dout(10) << __func__
<< " parent first " << first
<< " -> " << newfirst
<< dendl
;
2108 parent
->first
= newfirst
;
2113 case CEPH_LOCK_IAUTH
:
2114 decode_lock_iauth(p
);
2117 case CEPH_LOCK_ILINK
:
2118 decode_lock_ilink(p
);
2121 case CEPH_LOCK_IDFT
:
2122 decode_lock_idft(p
);
2125 case CEPH_LOCK_IFILE
:
2126 decode_lock_ifile(p
);
2129 case CEPH_LOCK_INEST
:
2130 decode_lock_inest(p
);
2133 case CEPH_LOCK_IXATTR
:
2134 decode_lock_ixattr(p
);
2137 case CEPH_LOCK_ISNAP
:
2138 decode_lock_isnap(p
);
2141 case CEPH_LOCK_IFLOCK
:
2142 decode_lock_iflock(p
);
2145 case CEPH_LOCK_IPOLICY
:
2146 decode_lock_ipolicy(p
);
2156 bool CInode::is_dirty_scattered()
2159 filelock
.is_dirty_or_flushing() ||
2160 nestlock
.is_dirty_or_flushing() ||
2161 dirfragtreelock
.is_dirty_or_flushing();
2164 void CInode::clear_scatter_dirty()
2166 filelock
.remove_dirty();
2167 nestlock
.remove_dirty();
2168 dirfragtreelock
.remove_dirty();
2171 void CInode::clear_dirty_scattered(int type
)
2173 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2174 ceph_assert(is_dir());
2176 case CEPH_LOCK_IFILE
:
2177 item_dirty_dirfrag_dir
.remove_myself();
2180 case CEPH_LOCK_INEST
:
2181 item_dirty_dirfrag_nest
.remove_myself();
2184 case CEPH_LOCK_IDFT
:
2185 item_dirty_dirfrag_dirfragtree
.remove_myself();
2195 * when we initially scatter a lock, we need to check if any of the dirfrags
2196 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2198 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2199 void CInode::start_scatter(ScatterLock
*lock
)
2201 dout(10) << __func__
<< " " << *lock
<< " on " << *this << dendl
;
2202 ceph_assert(is_auth());
2203 mempool_inode
*pi
= get_projected_inode();
2205 for (const auto &p
: dirfrags
) {
2206 frag_t fg
= p
.first
;
2207 CDir
*dir
= p
.second
;
2208 fnode_t
*pf
= dir
->get_projected_fnode();
2209 dout(20) << fg
<< " " << *dir
<< dendl
;
2211 if (!dir
->is_auth())
2214 switch (lock
->get_type()) {
2215 case CEPH_LOCK_IFILE
:
2216 finish_scatter_update(lock
, dir
, pi
->dirstat
.version
, pf
->accounted_fragstat
.version
);
2219 case CEPH_LOCK_INEST
:
2220 finish_scatter_update(lock
, dir
, pi
->rstat
.version
, pf
->accounted_rstat
.version
);
2223 case CEPH_LOCK_IDFT
:
2224 dir
->state_clear(CDir::STATE_DIRTYDFT
);
2231 class C_Inode_FragUpdate
: public MDSLogContextBase
{
2236 MDSRank
*get_mds() override
{return in
->mdcache
->mds
;}
2237 void finish(int r
) override
{
2238 in
->_finish_frag_update(dir
, mut
);
2242 C_Inode_FragUpdate(CInode
*i
, CDir
*d
, MutationRef
& m
) : in(i
), dir(d
), mut(m
) {}
2245 void CInode::finish_scatter_update(ScatterLock
*lock
, CDir
*dir
,
2246 version_t inode_version
, version_t dir_accounted_version
)
2248 frag_t fg
= dir
->get_frag();
2249 ceph_assert(dir
->is_auth());
2251 if (dir
->is_frozen()) {
2252 dout(10) << __func__
<< " " << fg
<< " frozen, marking " << *lock
<< " stale " << *dir
<< dendl
;
2253 } else if (dir
->get_version() == 0) {
2254 dout(10) << __func__
<< " " << fg
<< " not loaded, marking " << *lock
<< " stale " << *dir
<< dendl
;
2256 if (dir_accounted_version
!= inode_version
) {
2257 dout(10) << __func__
<< " " << fg
<< " journaling accounted scatterstat update v" << inode_version
<< dendl
;
2259 MDLog
*mdlog
= mdcache
->mds
->mdlog
;
2260 MutationRef
mut(new MutationImpl());
2261 mut
->ls
= mdlog
->get_current_segment();
2263 mempool_inode
*pi
= get_projected_inode();
2264 fnode_t
*pf
= dir
->project_fnode();
2266 std::string_view ename
;
2267 switch (lock
->get_type()) {
2268 case CEPH_LOCK_IFILE
:
2269 pf
->fragstat
.version
= pi
->dirstat
.version
;
2270 pf
->accounted_fragstat
= pf
->fragstat
;
2271 ename
= "lock ifile accounted scatter stat update";
2273 case CEPH_LOCK_INEST
:
2274 pf
->rstat
.version
= pi
->rstat
.version
;
2275 pf
->accounted_rstat
= pf
->rstat
;
2276 ename
= "lock inest accounted scatter stat update";
2278 if (!is_auth() && lock
->get_state() == LOCK_MIX
) {
2279 dout(10) << __func__
<< " try to assimilate dirty rstat on "
2281 dir
->assimilate_dirty_rstat_inodes();
2289 pf
->version
= dir
->pre_dirty();
2290 mut
->add_projected_fnode(dir
);
2292 EUpdate
*le
= new EUpdate(mdlog
, ename
);
2293 mdlog
->start_entry(le
);
2294 le
->metablob
.add_dir_context(dir
);
2295 le
->metablob
.add_dir(dir
, true);
2297 ceph_assert(!dir
->is_frozen());
2300 if (lock
->get_type() == CEPH_LOCK_INEST
&&
2301 !is_auth() && lock
->get_state() == LOCK_MIX
) {
2302 dout(10) << __func__
<< " finish assimilating dirty rstat on "
2304 dir
->assimilate_dirty_rstat_inodes_finish(mut
, &le
->metablob
);
2306 if (!(pf
->rstat
== pf
->accounted_rstat
)) {
2307 if (!mut
->is_wrlocked(&nestlock
)) {
2308 mdcache
->mds
->locker
->wrlock_force(&nestlock
, mut
);
2311 mdcache
->mds
->locker
->mark_updated_scatterlock(&nestlock
);
2312 mut
->ls
->dirty_dirfrag_nest
.push_back(&item_dirty_dirfrag_nest
);
2316 mdlog
->submit_entry(le
, new C_Inode_FragUpdate(this, dir
, mut
));
2318 dout(10) << __func__
<< " " << fg
<< " accounted " << *lock
2319 << " scatter stat unchanged at v" << dir_accounted_version
<< dendl
;
2324 void CInode::_finish_frag_update(CDir
*dir
, MutationRef
& mut
)
2326 dout(10) << __func__
<< " on " << *dir
<< dendl
;
2328 mdcache
->mds
->locker
->drop_locks(mut
.get());
2334 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2335 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2336 * because the frag is auth and frozen, or that the replica couldn't for the same
2337 * reason. hopefully it will get updated the next time the lock cycles.
2339 * we have two dimensions of behavior:
2340 * - we may be (auth and !frozen), and able to update, or not.
2341 * - the frag may be stale, or not.
2343 * if the frag is non-stale, we want to assimilate the diff into the
2344 * inode, regardless of whether it's auth or updateable.
2346 * if we update the frag, we want to set accounted_fragstat = frag,
2347 * both if we took the diff or it was stale and we are making it
2350 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2351 void CInode::finish_scatter_gather_update(int type
)
2353 LogChannelRef clog
= mdcache
->mds
->clog
;
2355 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2356 ceph_assert(is_auth());
2359 case CEPH_LOCK_IFILE
:
2361 fragtree_t tmpdft
= dirfragtree
;
2362 struct frag_info_t dirstat
;
2363 bool dirstat_valid
= true;
2366 ceph_assert(is_auth());
2367 mempool_inode
*pi
= get_projected_inode();
2369 bool touched_mtime
= false, touched_chattr
= false;
2370 dout(20) << " orig dirstat " << pi
->dirstat
<< dendl
;
2371 pi
->dirstat
.version
++;
2372 for (const auto &p
: dirfrags
) {
2373 frag_t fg
= p
.first
;
2374 CDir
*dir
= p
.second
;
2375 dout(20) << fg
<< " " << *dir
<< dendl
;
2378 if (dir
->get_version() != 0) {
2379 update
= dir
->is_auth() && !dir
->is_frozen();
2382 dirstat_valid
= false;
2385 fnode_t
*pf
= dir
->get_projected_fnode();
2387 pf
= dir
->project_fnode();
2389 if (pf
->accounted_fragstat
.version
== pi
->dirstat
.version
- 1) {
2390 dout(20) << fg
<< " fragstat " << pf
->fragstat
<< dendl
;
2391 dout(20) << fg
<< " accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
2392 pi
->dirstat
.add_delta(pf
->fragstat
, pf
->accounted_fragstat
, &touched_mtime
, &touched_chattr
);
2394 dout(20) << fg
<< " skipping STALE accounted_fragstat " << pf
->accounted_fragstat
<< dendl
;
2397 if (pf
->fragstat
.nfiles
< 0 ||
2398 pf
->fragstat
.nsubdirs
< 0) {
2399 clog
->error() << "bad/negative dir size on "
2400 << dir
->dirfrag() << " " << pf
->fragstat
;
2401 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter
);
2403 if (pf
->fragstat
.nfiles
< 0)
2404 pf
->fragstat
.nfiles
= 0;
2405 if (pf
->fragstat
.nsubdirs
< 0)
2406 pf
->fragstat
.nsubdirs
= 0;
2410 pf
->accounted_fragstat
= pf
->fragstat
;
2411 pf
->fragstat
.version
= pf
->accounted_fragstat
.version
= pi
->dirstat
.version
;
2412 dout(10) << fg
<< " updated accounted_fragstat " << pf
->fragstat
<< " on " << *dir
<< dendl
;
2415 tmpdft
.force_to_leaf(g_ceph_context
, fg
);
2416 dirstat
.add(pf
->fragstat
);
2419 pi
->mtime
= pi
->ctime
= pi
->dirstat
.mtime
;
2421 pi
->change_attr
= pi
->dirstat
.change_attr
;
2422 dout(20) << " final dirstat " << pi
->dirstat
<< dendl
;
2424 if (dirstat_valid
&& !dirstat
.same_sums(pi
->dirstat
)) {
2426 tmpdft
.get_leaves_under(frag_t(), leaves
);
2427 for (const auto& leaf
: leaves
) {
2428 if (!dirfrags
.count(leaf
)) {
2429 dirstat_valid
= false;
2433 if (dirstat_valid
) {
2434 if (state_test(CInode::STATE_REPAIRSTATS
)) {
2435 dout(20) << " dirstat mismatch, fixing" << dendl
;
2437 clog
->error() << "unmatched fragstat on " << ino() << ", inode has "
2438 << pi
->dirstat
<< ", dirfrags have " << dirstat
;
2439 ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter
);
2441 // trust the dirfrags for now
2442 version_t v
= pi
->dirstat
.version
;
2443 if (pi
->dirstat
.mtime
> dirstat
.mtime
)
2444 dirstat
.mtime
= pi
->dirstat
.mtime
;
2445 if (pi
->dirstat
.change_attr
> dirstat
.change_attr
)
2446 dirstat
.change_attr
= pi
->dirstat
.change_attr
;
2447 pi
->dirstat
= dirstat
;
2448 pi
->dirstat
.version
= v
;
2452 if (pi
->dirstat
.nfiles
< 0 || pi
->dirstat
.nsubdirs
< 0)
2455 make_path_string(path
);
2456 clog
->error() << "Inconsistent statistics detected: fragstat on inode "
2457 << ino() << " (" << path
<< "), inode has " << pi
->dirstat
;
2458 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter
);
2460 if (pi
->dirstat
.nfiles
< 0)
2461 pi
->dirstat
.nfiles
= 0;
2462 if (pi
->dirstat
.nsubdirs
< 0)
2463 pi
->dirstat
.nsubdirs
= 0;
2468 case CEPH_LOCK_INEST
:
2471 ceph_assert(is_auth());
2473 fragtree_t tmpdft
= dirfragtree
;
2475 bool rstat_valid
= true;
2478 if (const sr_t
*srnode
= get_projected_srnode(); srnode
)
2479 rstat
.rsnaps
= srnode
->snaps
.size();
2481 mempool_inode
*pi
= get_projected_inode();
2482 dout(20) << " orig rstat " << pi
->rstat
<< dendl
;
2483 pi
->rstat
.version
++;
2484 for (const auto &p
: dirfrags
) {
2485 frag_t fg
= p
.first
;
2486 CDir
*dir
= p
.second
;
2487 dout(20) << fg
<< " " << *dir
<< dendl
;
2490 if (dir
->get_version() != 0) {
2491 update
= dir
->is_auth() && !dir
->is_frozen();
2494 rstat_valid
= false;
2497 fnode_t
*pf
= dir
->get_projected_fnode();
2499 pf
= dir
->project_fnode();
2501 if (pf
->accounted_rstat
.version
== pi
->rstat
.version
-1) {
2502 // only pull this frag's dirty rstat inodes into the frag if
2503 // the frag is non-stale and updateable. if it's stale,
2504 // that info will just get thrown out!
2506 dir
->assimilate_dirty_rstat_inodes();
2508 dout(20) << fg
<< " rstat " << pf
->rstat
<< dendl
;
2509 dout(20) << fg
<< " accounted_rstat " << pf
->accounted_rstat
<< dendl
;
2510 dout(20) << fg
<< " dirty_old_rstat " << dir
->dirty_old_rstat
<< dendl
;
2511 mdcache
->project_rstat_frag_to_inode(pf
->rstat
, pf
->accounted_rstat
,
2512 dir
->first
, CEPH_NOSNAP
, this, true);
2513 for (auto &p
: dir
->dirty_old_rstat
) {
2514 mdcache
->project_rstat_frag_to_inode(p
.second
.rstat
, p
.second
.accounted_rstat
,
2515 p
.second
.first
, p
.first
, this, true);
2517 if (update
) // dir contents not valid if frozen or non-auth
2518 dir
->check_rstats();
2520 dout(20) << fg
<< " skipping STALE accounted_rstat " << pf
->accounted_rstat
<< dendl
;
2523 pf
->accounted_rstat
= pf
->rstat
;
2524 dir
->dirty_old_rstat
.clear();
2525 pf
->rstat
.version
= pf
->accounted_rstat
.version
= pi
->rstat
.version
;
2526 dir
->check_rstats();
2527 dout(10) << fg
<< " updated accounted_rstat " << pf
->rstat
<< " on " << *dir
<< dendl
;
2530 tmpdft
.force_to_leaf(g_ceph_context
, fg
);
2531 rstat
.add(pf
->rstat
);
2533 dout(20) << " final rstat " << pi
->rstat
<< dendl
;
2535 if (rstat_valid
&& !rstat
.same_sums(pi
->rstat
)) {
2537 tmpdft
.get_leaves_under(frag_t(), leaves
);
2538 for (const auto& leaf
: leaves
) {
2539 if (!dirfrags
.count(leaf
)) {
2540 rstat_valid
= false;
2545 if (state_test(CInode::STATE_REPAIRSTATS
)) {
2546 dout(20) << " rstat mismatch, fixing" << dendl
;
2548 clog
->error() << "inconsistent rstat on inode " << ino()
2549 << ", inode has " << pi
->rstat
2550 << ", directory fragments have " << rstat
;
2551 ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter
);
2553 // trust the dirfrag for now
2554 version_t v
= pi
->rstat
.version
;
2555 if (pi
->rstat
.rctime
> rstat
.rctime
)
2556 rstat
.rctime
= pi
->rstat
.rctime
;
2558 pi
->rstat
.version
= v
;
2562 mdcache
->broadcast_quota_to_client(this);
2566 case CEPH_LOCK_IDFT
:
2574 void CInode::finish_scatter_gather_update_accounted(int type
, MutationRef
& mut
, EMetaBlob
*metablob
)
2576 dout(10) << __func__
<< " " << type
<< " on " << *this << dendl
;
2577 ceph_assert(is_auth());
2579 for (const auto &p
: dirfrags
) {
2580 CDir
*dir
= p
.second
;
2581 if (!dir
->is_auth() || dir
->get_version() == 0 || dir
->is_frozen())
2584 if (type
== CEPH_LOCK_IDFT
)
2585 continue; // nothing to do.
2587 dout(10) << " journaling updated frag accounted_ on " << *dir
<< dendl
;
2588 ceph_assert(dir
->is_projected());
2589 fnode_t
*pf
= dir
->get_projected_fnode();
2590 pf
->version
= dir
->pre_dirty();
2591 mut
->add_projected_fnode(dir
);
2592 metablob
->add_dir(dir
, true);
2595 if (type
== CEPH_LOCK_INEST
)
2596 dir
->assimilate_dirty_rstat_inodes_finish(mut
, metablob
);
2602 bool CInode::is_frozen() const
2604 if (is_frozen_inode()) return true;
2605 if (parent
&& parent
->dir
->is_frozen()) return true;
2609 bool CInode::is_frozen_dir() const
2611 if (parent
&& parent
->dir
->is_frozen_dir()) return true;
2615 bool CInode::is_freezing() const
2617 if (is_freezing_inode()) return true;
2618 if (parent
&& parent
->dir
->is_freezing()) return true;
2622 void CInode::add_dir_waiter(frag_t fg
, MDSContext
*c
)
2624 if (waiting_on_dir
.empty())
2626 waiting_on_dir
[fg
].push_back(c
);
2627 dout(10) << __func__
<< " frag " << fg
<< " " << c
<< " on " << *this << dendl
;
2630 void CInode::take_dir_waiting(frag_t fg
, MDSContext::vec
& ls
)
2632 if (waiting_on_dir
.empty())
2635 auto it
= waiting_on_dir
.find(fg
);
2636 if (it
!= waiting_on_dir
.end()) {
2637 dout(10) << __func__
<< " frag " << fg
<< " on " << *this << dendl
;
2638 auto& waiting
= it
->second
;
2639 ls
.insert(ls
.end(), waiting
.begin(), waiting
.end());
2640 waiting_on_dir
.erase(it
);
2642 if (waiting_on_dir
.empty())
2647 void CInode::add_waiter(uint64_t tag
, MDSContext
*c
)
2649 dout(10) << __func__
<< " tag " << std::hex
<< tag
<< std::dec
<< " " << c
2650 << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH
)
2651 << " !frozen " << !is_frozen_inode()
2652 << " !freezing " << !is_freezing_inode()
2654 // wait on the directory?
2655 // make sure its not the inode that is explicitly ambiguous|freezing|frozen
2656 if (((tag
& WAIT_SINGLEAUTH
) && !state_test(STATE_AMBIGUOUSAUTH
)) ||
2657 ((tag
& WAIT_UNFREEZE
) &&
2658 !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
2659 dout(15) << "passing waiter up tree" << dendl
;
2660 parent
->dir
->add_waiter(tag
, c
);
2663 dout(15) << "taking waiter here" << dendl
;
2664 MDSCacheObject::add_waiter(tag
, c
);
2667 void CInode::take_waiting(uint64_t mask
, MDSContext::vec
& ls
)
2669 if ((mask
& WAIT_DIR
) && !waiting_on_dir
.empty()) {
2670 // take all dentry waiters
2671 while (!waiting_on_dir
.empty()) {
2672 auto it
= waiting_on_dir
.begin();
2673 dout(10) << __func__
<< " dirfrag " << it
->first
<< " on " << *this << dendl
;
2674 auto& waiting
= it
->second
;
2675 ls
.insert(ls
.end(), waiting
.begin(), waiting
.end());
2676 waiting_on_dir
.erase(it
);
2682 MDSCacheObject::take_waiting(mask
, ls
);
2685 void CInode::maybe_finish_freeze_inode()
2687 CDir
*dir
= get_parent_dir();
2688 if (auth_pins
> auth_pin_freeze_allowance
|| dir
->frozen_inode_suppressed
)
2691 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl
;
2692 ceph_assert(auth_pins
== auth_pin_freeze_allowance
);
2695 state_clear(STATE_FREEZING
);
2696 state_set(STATE_FROZEN
);
2698 item_freezing_inode
.remove_myself();
2699 dir
->num_frozen_inodes
++;
2701 finish_waiting(WAIT_FROZEN
);
2704 bool CInode::freeze_inode(int auth_pin_allowance
)
2706 CDir
*dir
= get_parent_dir();
2709 ceph_assert(auth_pin_allowance
> 0); // otherwise we need to adjust parent's nested_auth_pins
2710 ceph_assert(auth_pins
>= auth_pin_allowance
);
2711 if (auth_pins
== auth_pin_allowance
&& !dir
->frozen_inode_suppressed
) {
2712 dout(10) << "freeze_inode - frozen" << dendl
;
2713 if (!state_test(STATE_FROZEN
)) {
2715 state_set(STATE_FROZEN
);
2716 dir
->num_frozen_inodes
++;
2721 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance
<< dendl
;
2722 auth_pin_freeze_allowance
= auth_pin_allowance
;
2723 dir
->freezing_inodes
.push_back(&item_freezing_inode
);
2726 state_set(STATE_FREEZING
);
2728 if (!dir
->lock_caches_with_auth_pins
.empty())
2729 mdcache
->mds
->locker
->invalidate_lock_caches(dir
);
2731 const static int lock_types
[] = {
2732 CEPH_LOCK_IVERSION
, CEPH_LOCK_IFILE
, CEPH_LOCK_IAUTH
, CEPH_LOCK_ILINK
, CEPH_LOCK_IDFT
,
2733 CEPH_LOCK_IXATTR
, CEPH_LOCK_ISNAP
, CEPH_LOCK_INEST
, CEPH_LOCK_IFLOCK
, CEPH_LOCK_IPOLICY
, 0
2735 for (int i
= 0; lock_types
[i
]; ++i
) {
2736 auto lock
= get_lock(lock_types
[i
]);
2737 if (lock
->is_cached())
2738 mdcache
->mds
->locker
->invalidate_lock_caches(lock
);
2740 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2741 // and finish freezing the inode
2742 return state_test(STATE_FROZEN
);
2745 void CInode::unfreeze_inode(MDSContext::vec
& finished
)
2747 dout(10) << __func__
<< dendl
;
2748 if (state_test(STATE_FREEZING
)) {
2749 state_clear(STATE_FREEZING
);
2751 item_freezing_inode
.remove_myself();
2752 } else if (state_test(STATE_FROZEN
)) {
2753 state_clear(STATE_FROZEN
);
2755 get_parent_dir()->num_frozen_inodes
--;
2758 take_waiting(WAIT_UNFREEZE
, finished
);
2761 void CInode::unfreeze_inode()
2763 MDSContext::vec finished
;
2764 unfreeze_inode(finished
);
2765 mdcache
->mds
->queue_waiters(finished
);
2768 void CInode::freeze_auth_pin()
2770 ceph_assert(state_test(CInode::STATE_FROZEN
));
2771 state_set(CInode::STATE_FROZENAUTHPIN
);
2772 get_parent_dir()->num_frozen_inodes
++;
2775 void CInode::unfreeze_auth_pin()
2777 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN
));
2778 state_clear(CInode::STATE_FROZENAUTHPIN
);
2779 get_parent_dir()->num_frozen_inodes
--;
2780 if (!state_test(STATE_FREEZING
|STATE_FROZEN
)) {
2781 MDSContext::vec finished
;
2782 take_waiting(WAIT_UNFREEZE
, finished
);
2783 mdcache
->mds
->queue_waiters(finished
);
2787 void CInode::clear_ambiguous_auth(MDSContext::vec
& finished
)
2789 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH
));
2790 state_clear(CInode::STATE_AMBIGUOUSAUTH
);
2791 take_waiting(CInode::WAIT_SINGLEAUTH
, finished
);
2794 void CInode::clear_ambiguous_auth()
2796 MDSContext::vec finished
;
2797 clear_ambiguous_auth(finished
);
2798 mdcache
->mds
->queue_waiters(finished
);
2802 bool CInode::can_auth_pin(int *err_ret
) const {
2806 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2807 err
= ERR_EXPORTING_INODE
;
2810 return parent
->can_auth_pin(err_ret
);
2818 void CInode::auth_pin(void *by
)
2824 #ifdef MDS_AUTHPIN_SET
2825 auth_pin_set
.insert(by
);
2828 dout(10) << "auth_pin by " << by
<< " on " << *this << " now " << auth_pins
<< dendl
;
2831 parent
->adjust_nested_auth_pins(1, this);
2834 void CInode::auth_unpin(void *by
)
2838 #ifdef MDS_AUTHPIN_SET
2840 auto it
= auth_pin_set
.find(by
);
2841 ceph_assert(it
!= auth_pin_set
.end());
2842 auth_pin_set
.erase(it
);
2849 dout(10) << "auth_unpin by " << by
<< " on " << *this << " now " << auth_pins
<< dendl
;
2851 ceph_assert(auth_pins
>= 0);
2854 parent
->adjust_nested_auth_pins(-1, by
);
2856 if (is_freezing_inode())
2857 maybe_finish_freeze_inode();
2862 mds_authority_t
CInode::authority() const
2864 if (inode_auth
.first
>= 0)
2868 return parent
->dir
->authority();
2870 // new items that are not yet linked in (in the committed plane) belong
2871 // to their first parent.
2872 if (!projected_parent
.empty())
2873 return projected_parent
.front()->dir
->authority();
2875 return CDIR_AUTH_UNDEF
;
2881 snapid_t
CInode::get_oldest_snap()
2884 if (!old_inodes
.empty())
2885 t
= old_inodes
.begin()->second
.first
;
2886 return std::min(t
, oldest_snap
);
2889 CInode::mempool_old_inode
& CInode::cow_old_inode(snapid_t follows
, bool cow_head
)
2891 ceph_assert(follows
>= first
);
2893 mempool_inode
*pi
= cow_head
? get_projected_inode() : get_previous_projected_inode();
2894 mempool_xattr_map
*px
= cow_head
? get_projected_xattrs() : get_previous_projected_xattrs();
2896 mempool_old_inode
&old
= old_inodes
[follows
];
2901 if (first
< oldest_snap
)
2902 oldest_snap
= first
;
2904 dout(10) << " " << px
->size() << " xattrs cowed, " << *px
<< dendl
;
2906 old
.inode
.trim_client_ranges(follows
);
2908 if (g_conf()->mds_snap_rstat
&&
2909 !(old
.inode
.rstat
== old
.inode
.accounted_rstat
))
2910 dirty_old_rstats
.insert(follows
);
2914 dout(10) << __func__
<< " " << (cow_head
? "head" : "previous_head" )
2915 << " to [" << old
.first
<< "," << follows
<< "] on "
2921 void CInode::split_old_inode(snapid_t snap
)
2923 auto it
= old_inodes
.lower_bound(snap
);
2924 ceph_assert(it
!= old_inodes
.end() && it
->second
.first
< snap
);
2926 mempool_old_inode
&old
= old_inodes
[snap
- 1];
2929 it
->second
.first
= snap
;
2930 dout(10) << __func__
<< " " << "[" << old
.first
<< "," << it
->first
2931 << "] to [" << snap
<< "," << it
->first
<< "] on " << *this << dendl
;
2934 void CInode::pre_cow_old_inode()
2936 snapid_t follows
= mdcache
->get_global_snaprealm()->get_newest_seq();
2937 if (first
<= follows
)
2938 cow_old_inode(follows
, true);
2941 bool CInode::has_snap_data(snapid_t snapid
)
2943 bool found
= snapid
>= first
&& snapid
<= last
;
2944 if (!found
&& is_multiversion()) {
2945 auto p
= old_inodes
.lower_bound(snapid
);
2946 if (p
!= old_inodes
.end()) {
2947 if (p
->second
.first
> snapid
) {
2948 if (p
!= old_inodes
.begin())
2951 if (p
->second
.first
<= snapid
&& snapid
<= p
->first
) {
2959 void CInode::purge_stale_snap_data(const set
<snapid_t
>& snaps
)
2961 dout(10) << __func__
<< " " << snaps
<< dendl
;
2963 for (auto it
= old_inodes
.begin(); it
!= old_inodes
.end(); ) {
2964 const snapid_t
&id
= it
->first
;
2965 const auto &s
= snaps
.lower_bound(it
->second
.first
);
2966 if (s
== snaps
.end() || *s
> id
) {
2967 dout(10) << " purging old_inode [" << it
->second
.first
<< "," << id
<< "]" << dendl
;
2968 it
= old_inodes
.erase(it
);
2976 * pick/create an old_inode
2978 CInode::mempool_old_inode
* CInode::pick_old_inode(snapid_t snap
)
2980 auto it
= old_inodes
.lower_bound(snap
); // p is first key >= to snap
2981 if (it
!= old_inodes
.end() && it
->second
.first
<= snap
) {
2982 dout(10) << __func__
<< " snap " << snap
<< " -> [" << it
->second
.first
<< "," << it
->first
<< "]" << dendl
;
2985 dout(10) << __func__
<< " snap " << snap
<< " -> nothing" << dendl
;
2989 void CInode::open_snaprealm(bool nosplit
)
2992 SnapRealm
*parent
= find_snaprealm();
2993 snaprealm
= new SnapRealm(mdcache
, this);
2995 dout(10) << __func__
<< " " << snaprealm
2996 << " parent is " << parent
2998 dout(30) << " siblings are " << parent
->open_children
<< dendl
;
2999 snaprealm
->parent
= parent
;
3001 parent
->split_at(snaprealm
);
3002 parent
->open_children
.insert(snaprealm
);
3006 void CInode::close_snaprealm(bool nojoin
)
3009 dout(15) << __func__
<< " " << *snaprealm
<< dendl
;
3010 snaprealm
->close_parents();
3011 if (snaprealm
->parent
) {
3012 snaprealm
->parent
->open_children
.erase(snaprealm
);
3014 //snaprealm->parent->join(snaprealm);
3021 SnapRealm
*CInode::find_snaprealm() const
3023 const CInode
*cur
= this;
3024 while (!cur
->snaprealm
) {
3025 const CDentry
*pdn
= cur
->get_oldest_parent_dn();
3028 cur
= pdn
->get_dir()->get_inode();
3030 return cur
->snaprealm
;
3033 void CInode::encode_snap_blob(bufferlist
&snapbl
)
3037 encode(snaprealm
->srnode
, snapbl
);
3038 dout(20) << __func__
<< " " << *snaprealm
<< dendl
;
3041 void CInode::decode_snap_blob(const bufferlist
& snapbl
)
3044 if (snapbl
.length()) {
3046 auto old_flags
= snaprealm
->srnode
.flags
;
3047 auto p
= snapbl
.cbegin();
3048 decode(snaprealm
->srnode
, p
);
3050 bool ok
= snaprealm
->_open_parents(NULL
);
3053 if ((snaprealm
->srnode
.flags
^ old_flags
) & sr_t::PARENT_GLOBAL
) {
3054 snaprealm
->close_parents();
3055 snaprealm
->adjust_parent();
3058 dout(20) << __func__
<< " " << *snaprealm
<< dendl
;
3059 } else if (snaprealm
&&
3060 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
3061 ceph_assert(mdcache
->mds
->is_any_replay());
3062 snaprealm
->merge_to(NULL
);
3066 void CInode::encode_snap(bufferlist
& bl
)
3068 ENCODE_START(1, 1, bl
);
3070 encode_snap_blob(snapbl
);
3072 encode(oldest_snap
, bl
);
3076 void CInode::decode_snap(bufferlist::const_iterator
& p
)
3081 decode(oldest_snap
, p
);
3082 decode_snap_blob(snapbl
);
3086 // =============================================
3088 client_t
CInode::calc_ideal_loner()
3090 if (mdcache
->is_readonly())
3092 if (!get_mds_caps_wanted().empty())
3096 client_t loner
= -1;
3097 for (const auto &p
: client_caps
) {
3098 if (!p
.second
.is_stale() &&
3100 !has_subtree_or_exporting_dirfrag() :
3101 (p
.second
.wanted() & (CEPH_CAP_ANY_WR
|CEPH_CAP_FILE_RD
)))) {
3111 bool CInode::choose_ideal_loner()
3113 want_loner_cap
= calc_ideal_loner();
3114 int changed
= false;
3115 if (loner_cap
>= 0 && loner_cap
!= want_loner_cap
) {
3116 if (!try_drop_loner())
3121 if (want_loner_cap
>= 0) {
3122 if (loner_cap
< 0) {
3123 set_loner_cap(want_loner_cap
);
3126 ceph_assert(loner_cap
== want_loner_cap
);
3131 bool CInode::try_set_loner()
3133 ceph_assert(want_loner_cap
>= 0);
3134 if (loner_cap
>= 0 && loner_cap
!= want_loner_cap
)
3136 set_loner_cap(want_loner_cap
);
3140 void CInode::set_loner_cap(client_t l
)
3143 authlock
.set_excl_client(loner_cap
);
3144 filelock
.set_excl_client(loner_cap
);
3145 linklock
.set_excl_client(loner_cap
);
3146 xattrlock
.set_excl_client(loner_cap
);
3149 bool CInode::try_drop_loner()
3154 int other_allowed
= get_caps_allowed_by_type(CAP_ANY
);
3155 Capability
*cap
= get_client_cap(loner_cap
);
3157 (cap
->issued() & ~other_allowed
) == 0) {
3165 // choose new lock state during recovery, based on issued caps
3166 void CInode::choose_lock_state(SimpleLock
*lock
, int allissued
)
3168 int shift
= lock
->get_cap_shift();
3169 int issued
= (allissued
>> shift
) & lock
->get_cap_mask();
3171 if (lock
->is_xlocked()) {
3173 } else if (lock
->get_state() != LOCK_MIX
) {
3174 if (issued
& (CEPH_CAP_GEXCL
| CEPH_CAP_GBUFFER
))
3175 lock
->set_state(LOCK_EXCL
);
3176 else if (issued
& CEPH_CAP_GWR
)
3177 lock
->set_state(LOCK_MIX
);
3178 else if (lock
->is_dirty()) {
3179 if (is_replicated())
3180 lock
->set_state(LOCK_MIX
);
3182 lock
->set_state(LOCK_LOCK
);
3184 lock
->set_state(LOCK_SYNC
);
3187 // our states have already been chosen during rejoin.
3188 if (lock
->is_xlocked())
3189 ceph_assert(lock
->get_state() == LOCK_LOCK
);
3193 void CInode::choose_lock_states(int dirty_caps
)
3195 int issued
= get_caps_issued() | dirty_caps
;
3196 if (is_auth() && (issued
& (CEPH_CAP_ANY_EXCL
|CEPH_CAP_ANY_WR
)))
3197 choose_ideal_loner();
3198 choose_lock_state(&filelock
, issued
);
3199 choose_lock_state(&nestlock
, issued
);
3200 choose_lock_state(&dirfragtreelock
, issued
);
3201 choose_lock_state(&authlock
, issued
);
3202 choose_lock_state(&xattrlock
, issued
);
3203 choose_lock_state(&linklock
, issued
);
3206 int CInode::count_nonstale_caps()
3209 for (const auto &p
: client_caps
) {
3210 if (!p
.second
.is_stale())
3216 bool CInode::multiple_nonstale_caps()
3219 for (const auto &p
: client_caps
) {
3220 if (!p
.second
.is_stale()) {
3229 void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map
<int32_t,int32_t>& m
)
3231 bool old_empty
= mds_caps_wanted
.empty();
3232 mds_caps_wanted
.swap(m
);
3233 if (old_empty
!= (bool)mds_caps_wanted
.empty()) {
3235 adjust_num_caps_wanted(1);
3237 adjust_num_caps_wanted(-1);
3241 void CInode::set_mds_caps_wanted(mds_rank_t mds
, int32_t wanted
)
3243 bool old_empty
= mds_caps_wanted
.empty();
3245 mds_caps_wanted
[mds
] = wanted
;
3247 adjust_num_caps_wanted(1);
3248 } else if (!old_empty
) {
3249 mds_caps_wanted
.erase(mds
);
3250 if (mds_caps_wanted
.empty())
3251 adjust_num_caps_wanted(-1);
3255 void CInode::adjust_num_caps_wanted(int d
)
3257 if (!num_caps_wanted
&& d
> 0)
3258 mdcache
->open_file_table
.add_inode(this);
3259 else if (num_caps_wanted
> 0 && num_caps_wanted
== -d
)
3260 mdcache
->open_file_table
.remove_inode(this);
3262 num_caps_wanted
+=d
;
3263 ceph_assert(num_caps_wanted
>= 0);
3266 Capability
*CInode::add_client_cap(client_t client
, Session
*session
,
3267 SnapRealm
*conrealm
, bool new_inode
)
3269 ceph_assert(last
== CEPH_NOSNAP
);
3270 if (client_caps
.empty()) {
3273 containing_realm
= conrealm
;
3275 containing_realm
= find_snaprealm();
3276 containing_realm
->inodes_with_caps
.push_back(&item_caps
);
3277 dout(10) << __func__
<< " first cap, joining realm " << *containing_realm
<< dendl
;
3279 mdcache
->num_inodes_with_caps
++;
3281 parent
->dir
->adjust_num_inodes_with_caps(1);
3284 uint64_t cap_id
= new_inode
? 1 : ++mdcache
->last_cap_id
;
3285 auto ret
= client_caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(client
),
3286 std::forward_as_tuple(this, session
, cap_id
));
3287 ceph_assert(ret
.second
== true);
3288 Capability
*cap
= &ret
.first
->second
;
3290 cap
->client_follows
= first
-1;
3291 containing_realm
->add_cap(client
, cap
);
3296 void CInode::remove_client_cap(client_t client
)
3298 auto it
= client_caps
.find(client
);
3299 ceph_assert(it
!= client_caps
.end());
3300 Capability
*cap
= &it
->second
;
3302 cap
->item_session_caps
.remove_myself();
3303 cap
->item_revoking_caps
.remove_myself();
3304 cap
->item_client_revoking_caps
.remove_myself();
3305 containing_realm
->remove_cap(client
, cap
);
3307 if (client
== loner_cap
)
3311 adjust_num_caps_wanted(-1);
3313 client_caps
.erase(it
);
3314 if (client_caps
.empty()) {
3315 dout(10) << __func__
<< " last cap, leaving realm " << *containing_realm
<< dendl
;
3317 item_caps
.remove_myself();
3318 containing_realm
= NULL
;
3319 mdcache
->num_inodes_with_caps
--;
3321 parent
->dir
->adjust_num_inodes_with_caps(-1);
3324 //clean up advisory locks
3325 bool fcntl_removed
= fcntl_locks
? fcntl_locks
->remove_all_from(client
) : false;
3326 bool flock_removed
= flock_locks
? flock_locks
->remove_all_from(client
) : false;
3327 if (fcntl_removed
|| flock_removed
) {
3328 MDSContext::vec waiters
;
3329 take_waiting(CInode::WAIT_FLOCK
, waiters
);
3330 mdcache
->mds
->queue_waiters(waiters
);
3334 void CInode::move_to_realm(SnapRealm
*realm
)
3336 dout(10) << __func__
<< " joining realm " << *realm
3337 << ", leaving realm " << *containing_realm
<< dendl
;
3338 for (auto& p
: client_caps
) {
3339 containing_realm
->remove_cap(p
.first
, &p
.second
);
3340 realm
->add_cap(p
.first
, &p
.second
);
3342 item_caps
.remove_myself();
3343 realm
->inodes_with_caps
.push_back(&item_caps
);
3344 containing_realm
= realm
;
3347 Capability
*CInode::reconnect_cap(client_t client
, const cap_reconnect_t
& icr
, Session
*session
)
3349 Capability
*cap
= get_client_cap(client
);
3352 cap
->merge(icr
.capinfo
.wanted
, icr
.capinfo
.issued
);
3354 cap
= add_client_cap(client
, session
);
3355 cap
->set_cap_id(icr
.capinfo
.cap_id
);
3356 cap
->set_wanted(icr
.capinfo
.wanted
);
3357 cap
->issue_norevoke(icr
.capinfo
.issued
);
3360 cap
->set_last_issue_stamp(ceph_clock_now());
3364 void CInode::clear_client_caps_after_export()
3366 while (!client_caps
.empty())
3367 remove_client_cap(client_caps
.begin()->first
);
3369 want_loner_cap
= -1;
3370 if (!get_mds_caps_wanted().empty()) {
3371 mempool::mds_co::compact_map
<int32_t,int32_t> empty
;
3372 set_mds_caps_wanted(empty
);
3376 void CInode::export_client_caps(map
<client_t
,Capability::Export
>& cl
)
3378 for (const auto &p
: client_caps
) {
3379 cl
[p
.first
] = p
.second
.make_export();
3384 int CInode::get_caps_liked() const
3387 return CEPH_CAP_PIN
| CEPH_CAP_ANY_EXCL
| CEPH_CAP_ANY_SHARED
; // but not, say, FILE_RD|WR|WRBUFFER
3389 return CEPH_CAP_ANY
& ~CEPH_CAP_FILE_LAZYIO
;
3392 int CInode::get_caps_allowed_ever() const
3396 allowed
= CEPH_CAP_PIN
| CEPH_CAP_ANY_EXCL
| CEPH_CAP_ANY_SHARED
;
3398 allowed
= CEPH_CAP_ANY
;
3401 (filelock
.gcaps_allowed_ever() << filelock
.get_cap_shift()) |
3402 (authlock
.gcaps_allowed_ever() << authlock
.get_cap_shift()) |
3403 (xattrlock
.gcaps_allowed_ever() << xattrlock
.get_cap_shift()) |
3404 (linklock
.gcaps_allowed_ever() << linklock
.get_cap_shift()));
3407 int CInode::get_caps_allowed_by_type(int type
) const
3411 (filelock
.gcaps_allowed(type
) << filelock
.get_cap_shift()) |
3412 (authlock
.gcaps_allowed(type
) << authlock
.get_cap_shift()) |
3413 (xattrlock
.gcaps_allowed(type
) << xattrlock
.get_cap_shift()) |
3414 (linklock
.gcaps_allowed(type
) << linklock
.get_cap_shift());
3417 int CInode::get_caps_careful() const
3420 (filelock
.gcaps_careful() << filelock
.get_cap_shift()) |
3421 (authlock
.gcaps_careful() << authlock
.get_cap_shift()) |
3422 (xattrlock
.gcaps_careful() << xattrlock
.get_cap_shift()) |
3423 (linklock
.gcaps_careful() << linklock
.get_cap_shift());
3426 int CInode::get_xlocker_mask(client_t client
) const
3429 (filelock
.gcaps_xlocker_mask(client
) << filelock
.get_cap_shift()) |
3430 (authlock
.gcaps_xlocker_mask(client
) << authlock
.get_cap_shift()) |
3431 (xattrlock
.gcaps_xlocker_mask(client
) << xattrlock
.get_cap_shift()) |
3432 (linklock
.gcaps_xlocker_mask(client
) << linklock
.get_cap_shift());
3435 int CInode::get_caps_allowed_for_client(Session
*session
, Capability
*cap
,
3436 mempool_inode
*file_i
) const
3438 client_t client
= session
->get_client();
3440 if (client
== get_loner()) {
3441 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3443 get_caps_allowed_by_type(CAP_LONER
) |
3444 (get_caps_allowed_by_type(CAP_XLOCKER
) & get_xlocker_mask(client
));
3446 allowed
= get_caps_allowed_by_type(CAP_ANY
);
3450 allowed
&= ~CEPH_CAP_ANY_DIR_OPS
;
3451 if (cap
&& (allowed
& CEPH_CAP_FILE_EXCL
))
3452 allowed
|= cap
->get_lock_cache_allowed();
3454 if (file_i
->inline_data
.version
== CEPH_INLINE_NONE
&&
3455 file_i
->layout
.pool_ns
.empty()) {
3458 if ((file_i
->inline_data
.version
!= CEPH_INLINE_NONE
&&
3459 cap
->is_noinline()) ||
3460 (!file_i
->layout
.pool_ns
.empty() &&
3461 cap
->is_nopoolns()))
3462 allowed
&= ~(CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
);
3464 auto& conn
= session
->get_connection();
3465 if ((file_i
->inline_data
.version
!= CEPH_INLINE_NONE
&&
3466 !conn
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) ||
3467 (!file_i
->layout
.pool_ns
.empty() &&
3468 !conn
->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
)))
3469 allowed
&= ~(CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
);
3475 // caps issued, wanted
3476 int CInode::get_caps_issued(int *ploner
, int *pother
, int *pxlocker
,
3477 int shift
, int mask
)
3480 int loner
= 0, other
= 0, xlocker
= 0;
3485 for (const auto &p
: client_caps
) {
3486 int i
= p
.second
.issued();
3488 if (p
.first
== loner_cap
)
3492 xlocker
|= get_xlocker_mask(p
.first
) & i
;
3494 if (ploner
) *ploner
= (loner
>> shift
) & mask
;
3495 if (pother
) *pother
= (other
>> shift
) & mask
;
3496 if (pxlocker
) *pxlocker
= (xlocker
>> shift
) & mask
;
3497 return (c
>> shift
) & mask
;
3500 bool CInode::is_any_caps_wanted() const
3502 for (const auto &p
: client_caps
) {
3503 if (p
.second
.wanted())
3509 int CInode::get_caps_wanted(int *ploner
, int *pother
, int shift
, int mask
) const
3512 int loner
= 0, other
= 0;
3513 for (const auto &p
: client_caps
) {
3514 if (!p
.second
.is_stale()) {
3515 int t
= p
.second
.wanted();
3517 if (p
.first
== loner_cap
)
3522 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3525 for (const auto &p
: mds_caps_wanted
) {
3528 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3530 if (ploner
) *ploner
= (loner
>> shift
) & mask
;
3531 if (pother
) *pother
= (other
>> shift
) & mask
;
3532 return (w
>> shift
) & mask
;
3535 bool CInode::issued_caps_need_gather(SimpleLock
*lock
)
3537 int loner_issued
, other_issued
, xlocker_issued
;
3538 get_caps_issued(&loner_issued
, &other_issued
, &xlocker_issued
,
3539 lock
->get_cap_shift(), lock
->get_cap_mask());
3540 if ((loner_issued
& ~lock
->gcaps_allowed(CAP_LONER
)) ||
3541 (other_issued
& ~lock
->gcaps_allowed(CAP_ANY
)) ||
3542 (xlocker_issued
& ~lock
->gcaps_allowed(CAP_XLOCKER
)))
3548 // =============================================
3550 int CInode::encode_inodestat(bufferlist
& bl
, Session
*session
,
3551 SnapRealm
*dir_realm
,
3556 client_t client
= session
->get_client();
3557 ceph_assert(snapid
);
3562 mempool_inode
*oi
= &inode
;
3563 mempool_inode
*pi
= get_projected_inode();
3565 CInode::mempool_xattr_map
*pxattrs
= nullptr;
3567 if (snapid
!= CEPH_NOSNAP
) {
3569 // for now at least, old_inodes is only defined/valid on the auth
3573 if (is_multiversion()) {
3574 auto it
= old_inodes
.lower_bound(snapid
);
3575 if (it
!= old_inodes
.end()) {
3576 if (it
->second
.first
> snapid
) {
3577 if (it
!= old_inodes
.begin())
3580 if (it
->second
.first
<= snapid
&& snapid
<= it
->first
) {
3581 dout(15) << __func__
<< " snapid " << snapid
3582 << " to old_inode [" << it
->second
.first
<< "," << it
->first
<< "]"
3583 << " " << it
->second
.inode
.rstat
3585 auto &p
= it
->second
;
3587 pxattrs
= &p
.xattrs
;
3589 // snapshoted remote dentry can result this
3590 dout(0) << __func__
<< " old_inode for snapid " << snapid
3591 << " not found" << dendl
;
3594 } else if (snapid
< first
|| snapid
> last
) {
3595 // snapshoted remote dentry can result this
3596 dout(0) << __func__
<< " [" << first
<< "," << last
<< "]"
3597 << " not match snapid " << snapid
<< dendl
;
3602 SnapRealm
*realm
= find_snaprealm();
3603 if (snapid
!= CEPH_NOSNAP
&& realm
) {
3604 // add snapshot timestamp vxattr
3605 map
<snapid_t
,const SnapInfo
*> infomap
;
3606 realm
->get_snap_info(infomap
,
3609 if (!infomap
.empty()) {
3610 ceph_assert(infomap
.size() == 1);
3611 const SnapInfo
*si
= infomap
.begin()->second
;
3612 snap_btime
= si
->stamp
;
3617 bool no_caps
= !valid
||
3618 session
->is_stale() ||
3619 (dir_realm
&& realm
!= dir_realm
) ||
3621 state_test(CInode::STATE_EXPORTINGCAPS
);
3623 dout(20) << __func__
<< " no caps"
3624 << (!valid
?", !valid":"")
3625 << (session
->is_stale()?", session stale ":"")
3626 << ((dir_realm
&& realm
!= dir_realm
)?", snaprealm differs ":"")
3627 << (is_frozen()?", frozen inode":"")
3628 << (state_test(CInode::STATE_EXPORTINGCAPS
)?", exporting caps":"")
3632 // "fake" a version that is old (stable) version, +1 if projected.
3633 version_t version
= (oi
->version
* 2) + is_projected();
3635 Capability
*cap
= get_client_cap(client
);
3636 bool pfile
= filelock
.is_xlocked_by_client(client
) || get_loner() == client
;
3637 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3638 bool pauth
= authlock
.is_xlocked_by_client(client
) || get_loner() == client
;
3639 bool plink
= linklock
.is_xlocked_by_client(client
) || get_loner() == client
;
3640 bool pxattr
= xattrlock
.is_xlocked_by_client(client
) || get_loner() == client
;
3642 bool plocal
= versionlock
.get_last_wrlock_client() == client
;
3643 bool ppolicy
= policylock
.is_xlocked_by_client(client
) || get_loner()==client
;
3645 mempool_inode
*any_i
= (pfile
|pauth
|plink
|pxattr
|plocal
) ? pi
: oi
;
3647 dout(20) << " pfile " << pfile
<< " pauth " << pauth
3648 << " plink " << plink
<< " pxattr " << pxattr
3649 << " plocal " << plocal
3650 << " ctime " << any_i
->ctime
3651 << " valid=" << valid
<< dendl
;
3654 mempool_inode
*file_i
= pfile
? pi
:oi
;
3655 file_layout_t layout
;
3657 layout
= (ppolicy
? pi
: oi
)->layout
;
3659 layout
= file_i
->layout
;
3662 // max_size is min of projected, actual
3664 std::min(oi
->client_ranges
.count(client
) ?
3665 oi
->client_ranges
[client
].range
.last
: 0,
3666 pi
->client_ranges
.count(client
) ?
3667 pi
->client_ranges
[client
].range
.last
: 0);
3670 version_t inline_version
= 0;
3671 bufferlist inline_data
;
3672 if (file_i
->inline_data
.version
== CEPH_INLINE_NONE
) {
3673 inline_version
= CEPH_INLINE_NONE
;
3674 } else if ((!cap
&& !no_caps
) ||
3675 (cap
&& cap
->client_inline_version
< file_i
->inline_data
.version
) ||
3676 (getattr_caps
& CEPH_CAP_FILE_RD
)) { // client requests inline data
3677 inline_version
= file_i
->inline_data
.version
;
3678 if (file_i
->inline_data
.length() > 0)
3679 inline_data
= file_i
->inline_data
.get_data();
3682 // nest (do same as file... :/)
3684 cap
->last_rbytes
= file_i
->rstat
.rbytes
;
3685 cap
->last_rsize
= file_i
->rstat
.rsize();
3689 mempool_inode
*auth_i
= pauth
? pi
:oi
;
3692 mempool_inode
*link_i
= plink
? pi
:oi
;
3695 mempool_inode
*xattr_i
= pxattr
? pi
:oi
;
3699 version_t xattr_version
;
3700 if ((!cap
&& !no_caps
) ||
3701 (cap
&& cap
->client_xattr_version
< xattr_i
->xattr_version
) ||
3702 (getattr_caps
& CEPH_CAP_XATTR_SHARED
)) { // client requests xattrs
3704 pxattrs
= pxattr
? get_projected_xattrs() : &xattrs
;
3705 xattr_version
= xattr_i
->xattr_version
;
3713 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap
) +
3714 sizeof(struct ceph_file_layout
) +
3715 sizeof(struct ceph_timespec
) * 3 + 4 + // ctime ~ time_warp_seq
3716 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3717 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec
) + // dirstat.nfiles ~ rstat.rctime
3718 sizeof(__u32
) + sizeof(__u32
) * 2 * dirfragtree
._splits
.size() + // dirfragtree
3719 sizeof(__u32
) + symlink
.length() + // symlink
3720 sizeof(struct ceph_dir_layout
); // dir_layout
3722 if (xattr_version
) {
3723 bytes
+= sizeof(__u32
) + sizeof(__u32
); // xattr buffer len + number entries
3725 for (const auto &p
: *pxattrs
)
3726 bytes
+= sizeof(__u32
) * 2 + p
.first
.length() + p
.second
.length();
3729 bytes
+= sizeof(__u32
); // xattr buffer len
3732 sizeof(version_t
) + sizeof(__u32
) + inline_data
.length() + // inline data
3733 1 + 1 + 8 + 8 + 4 + // quota
3734 4 + layout
.pool_ns
.size() + // pool ns
3735 sizeof(struct ceph_timespec
) + 8; // btime + change_attr
3737 if (bytes
> max_bytes
)
3743 struct ceph_mds_reply_cap ecap
;
3744 if (snapid
!= CEPH_NOSNAP
) {
3746 * snapped inodes (files or dirs) only get read-only caps. always
3747 * issue everything possible, since it is read only.
3749 * if a snapped inode has caps, limit issued caps based on the
3752 * if it is a live inode, limit issued caps based on the lock
3755 * do NOT adjust cap issued state, because the client always
3756 * tracks caps per-snap and the mds does either per-interval or
3759 ecap
.caps
= valid
? get_caps_allowed_by_type(CAP_ANY
) : CEPH_STAT_CAP_INODE
;
3760 if (last
== CEPH_NOSNAP
|| is_any_caps())
3761 ecap
.caps
= ecap
.caps
& get_caps_allowed_for_client(session
, nullptr, file_i
);
3766 if (!no_caps
&& !cap
) {
3768 cap
= add_client_cap(client
, session
, realm
);
3770 choose_ideal_loner();
3774 if (!no_caps
&& cap
) {
3775 int likes
= get_caps_liked();
3776 int allowed
= get_caps_allowed_for_client(session
, cap
, file_i
);
3777 issue
= (cap
->wanted() | likes
) & allowed
;
3778 cap
->issue_norevoke(issue
, true);
3779 issue
= cap
->pending();
3780 dout(10) << "encode_inodestat issuing " << ccap_string(issue
)
3781 << " seq " << cap
->get_last_seq() << dendl
;
3782 } else if (cap
&& cap
->is_new() && !dir_realm
) {
3783 // alway issue new caps to client, otherwise the caps get lost
3784 ceph_assert(cap
->is_stale());
3785 ceph_assert(!cap
->pending());
3786 issue
= CEPH_CAP_PIN
;
3787 cap
->issue_norevoke(issue
, true);
3788 dout(10) << "encode_inodestat issuing " << ccap_string(issue
)
3789 << " seq " << cap
->get_last_seq()
3790 << "(stale&new caps)" << dendl
;
3794 cap
->set_last_issue();
3795 cap
->set_last_issue_stamp(ceph_clock_now());
3797 ecap
.wanted
= cap
->wanted();
3798 ecap
.cap_id
= cap
->get_cap_id();
3799 ecap
.seq
= cap
->get_last_seq();
3800 ecap
.mseq
= cap
->get_mseq();
3801 ecap
.realm
= realm
->inode
->ino();
3811 ecap
.flags
= is_auth() ? CEPH_CAP_FLAG_AUTH
: 0;
3812 dout(10) << "encode_inodestat caps " << ccap_string(ecap
.caps
)
3813 << " seq " << ecap
.seq
<< " mseq " << ecap
.mseq
3814 << " xattrv " << xattr_version
<< dendl
;
3816 if (inline_data
.length() && cap
) {
3817 if ((cap
->pending() | getattr_caps
) & CEPH_CAP_FILE_SHARED
) {
3818 dout(10) << "including inline version " << inline_version
<< dendl
;
3819 cap
->client_inline_version
= inline_version
;
3821 dout(10) << "dropping inline version " << inline_version
<< dendl
;
3823 inline_data
.clear();
3827 // include those xattrs?
3828 if (xattr_version
&& cap
) {
3829 if ((cap
->pending() | getattr_caps
) & CEPH_CAP_XATTR_SHARED
) {
3830 dout(10) << "including xattrs version " << xattr_version
<< dendl
;
3831 cap
->client_xattr_version
= xattr_version
;
3833 dout(10) << "dropping xattrs version " << xattr_version
<< dendl
;
3838 // The end result of encode_xattrs() is equivalent to:
3841 // if (xattr_version) {
3843 // encode(*pxattrs, bl);
3845 // encode((__u32)0, bl);
3850 // But encoding xattrs into the 'xbl' requires a memory allocation.
3851 // The 'bl' should have enough pre-allocated memory in most cases.
3852 // Encoding xattrs directly into it can avoid the extra allocation.
3853 auto encode_xattrs
= [xattr_version
, pxattrs
, &bl
]() {
3855 if (xattr_version
) {
3857 auto filler
= bl
.append_hole(sizeof(xbl_len
));
3858 const auto starting_bl_len
= bl
.length();
3860 encode(*pxattrs
, bl
);
3862 encode((__u32
)0, bl
);
3863 xbl_len
= bl
.length() - starting_bl_len
;
3864 filler
.copy_in(sizeof(xbl_len
), (char *)&xbl_len
);
3866 encode((__u32
)0, bl
);
3871 * note: encoding matches MClientReply::InodeStat
3873 if (session
->info
.has_feature(CEPHFS_FEATURE_REPLY_ENCODING
)) {
3874 ENCODE_START(3, 1, bl
);
3875 encode(oi
->ino
, bl
);
3877 encode(oi
->rdev
, bl
);
3878 encode(version
, bl
);
3879 encode(xattr_version
, bl
);
3882 ceph_file_layout legacy_layout
;
3883 layout
.to_legacy(&legacy_layout
);
3884 encode(legacy_layout
, bl
);
3886 encode(any_i
->ctime
, bl
);
3887 encode(file_i
->mtime
, bl
);
3888 encode(file_i
->atime
, bl
);
3889 encode(file_i
->time_warp_seq
, bl
);
3890 encode(file_i
->size
, bl
);
3891 encode(max_size
, bl
);
3892 encode(file_i
->truncate_size
, bl
);
3893 encode(file_i
->truncate_seq
, bl
);
3894 encode(auth_i
->mode
, bl
);
3895 encode((uint32_t)auth_i
->uid
, bl
);
3896 encode((uint32_t)auth_i
->gid
, bl
);
3897 encode(link_i
->nlink
, bl
);
3898 encode(file_i
->dirstat
.nfiles
, bl
);
3899 encode(file_i
->dirstat
.nsubdirs
, bl
);
3900 encode(file_i
->rstat
.rbytes
, bl
);
3901 encode(file_i
->rstat
.rfiles
, bl
);
3902 encode(file_i
->rstat
.rsubdirs
, bl
);
3903 encode(file_i
->rstat
.rctime
, bl
);
3904 dirfragtree
.encode(bl
);
3905 encode(symlink
, bl
);
3906 encode(file_i
->dir_layout
, bl
);
3908 encode(inline_version
, bl
);
3909 encode(inline_data
, bl
);
3910 mempool_inode
*policy_i
= ppolicy
? pi
: oi
;
3911 encode(policy_i
->quota
, bl
);
3912 encode(layout
.pool_ns
, bl
);
3913 encode(any_i
->btime
, bl
);
3914 encode(any_i
->change_attr
, bl
);
3915 encode(file_i
->export_pin
, bl
);
3916 encode(snap_btime
, bl
);
3920 ceph_assert(session
->get_connection());
3922 encode(oi
->ino
, bl
);
3924 encode(oi
->rdev
, bl
);
3925 encode(version
, bl
);
3926 encode(xattr_version
, bl
);
3929 ceph_file_layout legacy_layout
;
3930 layout
.to_legacy(&legacy_layout
);
3931 encode(legacy_layout
, bl
);
3933 encode(any_i
->ctime
, bl
);
3934 encode(file_i
->mtime
, bl
);
3935 encode(file_i
->atime
, bl
);
3936 encode(file_i
->time_warp_seq
, bl
);
3937 encode(file_i
->size
, bl
);
3938 encode(max_size
, bl
);
3939 encode(file_i
->truncate_size
, bl
);
3940 encode(file_i
->truncate_seq
, bl
);
3941 encode(auth_i
->mode
, bl
);
3942 encode((uint32_t)auth_i
->uid
, bl
);
3943 encode((uint32_t)auth_i
->gid
, bl
);
3944 encode(link_i
->nlink
, bl
);
3945 encode(file_i
->dirstat
.nfiles
, bl
);
3946 encode(file_i
->dirstat
.nsubdirs
, bl
);
3947 encode(file_i
->rstat
.rbytes
, bl
);
3948 encode(file_i
->rstat
.rfiles
, bl
);
3949 encode(file_i
->rstat
.rsubdirs
, bl
);
3950 encode(file_i
->rstat
.rctime
, bl
);
3951 dirfragtree
.encode(bl
);
3952 encode(symlink
, bl
);
3953 auto& conn
= session
->get_connection();
3954 if (conn
->has_feature(CEPH_FEATURE_DIRLAYOUTHASH
)) {
3955 encode(file_i
->dir_layout
, bl
);
3958 if (conn
->has_feature(CEPH_FEATURE_MDS_INLINE_DATA
)) {
3959 encode(inline_version
, bl
);
3960 encode(inline_data
, bl
);
3962 if (conn
->has_feature(CEPH_FEATURE_MDS_QUOTA
)) {
3963 mempool_inode
*policy_i
= ppolicy
? pi
: oi
;
3964 encode(policy_i
->quota
, bl
);
3966 if (conn
->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2
)) {
3967 encode(layout
.pool_ns
, bl
);
3969 if (conn
->has_feature(CEPH_FEATURE_FS_BTIME
)) {
3970 encode(any_i
->btime
, bl
);
3971 encode(any_i
->change_attr
, bl
);
3978 void CInode::encode_cap_message(const ref_t
<MClientCaps
> &m
, Capability
*cap
)
3982 client_t client
= cap
->get_client();
3984 bool pfile
= filelock
.is_xlocked_by_client(client
) || (cap
->issued() & CEPH_CAP_FILE_EXCL
);
3985 bool pauth
= authlock
.is_xlocked_by_client(client
);
3986 bool plink
= linklock
.is_xlocked_by_client(client
);
3987 bool pxattr
= xattrlock
.is_xlocked_by_client(client
);
3989 mempool_inode
*oi
= &inode
;
3990 mempool_inode
*pi
= get_projected_inode();
3991 mempool_inode
*i
= (pfile
|pauth
|plink
|pxattr
) ? pi
: oi
;
3993 dout(20) << __func__
<< " pfile " << pfile
3994 << " pauth " << pauth
<< " plink " << plink
<< " pxattr " << pxattr
3995 << " ctime " << i
->ctime
<< dendl
;
3998 m
->set_layout(i
->layout
);
4000 m
->truncate_seq
= i
->truncate_seq
;
4001 m
->truncate_size
= i
->truncate_size
;
4002 m
->mtime
= i
->mtime
;
4003 m
->atime
= i
->atime
;
4004 m
->ctime
= i
->ctime
;
4005 m
->change_attr
= i
->change_attr
;
4006 m
->time_warp_seq
= i
->time_warp_seq
;
4007 m
->nfiles
= i
->dirstat
.nfiles
;
4008 m
->nsubdirs
= i
->dirstat
.nsubdirs
;
4010 if (cap
->client_inline_version
< i
->inline_data
.version
) {
4011 m
->inline_version
= cap
->client_inline_version
= i
->inline_data
.version
;
4012 if (i
->inline_data
.length() > 0)
4013 m
->inline_data
= i
->inline_data
.get_data();
4015 m
->inline_version
= 0;
4018 // max_size is min of projected, actual.
4019 uint64_t oldms
= oi
->client_ranges
.count(client
) ? oi
->client_ranges
[client
].range
.last
: 0;
4020 uint64_t newms
= pi
->client_ranges
.count(client
) ? pi
->client_ranges
[client
].range
.last
: 0;
4021 m
->max_size
= std::min(oldms
, newms
);
4024 m
->head
.mode
= i
->mode
;
4025 m
->head
.uid
= i
->uid
;
4026 m
->head
.gid
= i
->gid
;
4029 m
->head
.nlink
= i
->nlink
;
4033 auto ix
= pxattr
? get_projected_xattrs() : &xattrs
;
4034 if ((cap
->pending() & CEPH_CAP_XATTR_SHARED
) &&
4035 i
->xattr_version
> cap
->client_xattr_version
) {
4036 dout(10) << " including xattrs v " << i
->xattr_version
<< dendl
;
4037 encode(*ix
, m
->xattrbl
);
4038 m
->head
.xattr_version
= i
->xattr_version
;
4039 cap
->client_xattr_version
= i
->xattr_version
;
4045 void CInode::_encode_base(bufferlist
& bl
, uint64_t features
)
4047 ENCODE_START(1, 1, bl
);
4049 encode(inode
, bl
, features
);
4050 encode(symlink
, bl
);
4051 encode(dirfragtree
, bl
);
4053 encode(old_inodes
, bl
, features
);
4054 encode(damage_flags
, bl
);
4058 void CInode::_decode_base(bufferlist::const_iterator
& p
)
4066 symlink
= std::string_view(tmp
);
4068 decode(dirfragtree
, p
);
4070 decode(old_inodes
, p
);
4071 decode(damage_flags
, p
);
4076 void CInode::_encode_locks_full(bufferlist
& bl
)
4079 encode(authlock
, bl
);
4080 encode(linklock
, bl
);
4081 encode(dirfragtreelock
, bl
);
4082 encode(filelock
, bl
);
4083 encode(xattrlock
, bl
);
4084 encode(snaplock
, bl
);
4085 encode(nestlock
, bl
);
4086 encode(flocklock
, bl
);
4087 encode(policylock
, bl
);
4089 encode(loner_cap
, bl
);
4091 void CInode::_decode_locks_full(bufferlist::const_iterator
& p
)
4094 decode(authlock
, p
);
4095 decode(linklock
, p
);
4096 decode(dirfragtreelock
, p
);
4097 decode(filelock
, p
);
4098 decode(xattrlock
, p
);
4099 decode(snaplock
, p
);
4100 decode(nestlock
, p
);
4101 decode(flocklock
, p
);
4102 decode(policylock
, p
);
4104 decode(loner_cap
, p
);
4105 set_loner_cap(loner_cap
);
4106 want_loner_cap
= loner_cap
; // for now, we'll eval() shortly.
4109 void CInode::_encode_locks_state_for_replica(bufferlist
& bl
, bool need_recover
)
4111 ENCODE_START(1, 1, bl
);
4112 authlock
.encode_state_for_replica(bl
);
4113 linklock
.encode_state_for_replica(bl
);
4114 dirfragtreelock
.encode_state_for_replica(bl
);
4115 filelock
.encode_state_for_replica(bl
);
4116 nestlock
.encode_state_for_replica(bl
);
4117 xattrlock
.encode_state_for_replica(bl
);
4118 snaplock
.encode_state_for_replica(bl
);
4119 flocklock
.encode_state_for_replica(bl
);
4120 policylock
.encode_state_for_replica(bl
);
4121 encode(need_recover
, bl
);
4125 void CInode::_encode_locks_state_for_rejoin(bufferlist
& bl
, int rep
)
4127 authlock
.encode_state_for_replica(bl
);
4128 linklock
.encode_state_for_replica(bl
);
4129 dirfragtreelock
.encode_state_for_rejoin(bl
, rep
);
4130 filelock
.encode_state_for_rejoin(bl
, rep
);
4131 nestlock
.encode_state_for_rejoin(bl
, rep
);
4132 xattrlock
.encode_state_for_replica(bl
);
4133 snaplock
.encode_state_for_replica(bl
);
4134 flocklock
.encode_state_for_replica(bl
);
4135 policylock
.encode_state_for_replica(bl
);
4138 void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator
& p
, bool is_new
)
4141 authlock
.decode_state(p
, is_new
);
4142 linklock
.decode_state(p
, is_new
);
4143 dirfragtreelock
.decode_state(p
, is_new
);
4144 filelock
.decode_state(p
, is_new
);
4145 nestlock
.decode_state(p
, is_new
);
4146 xattrlock
.decode_state(p
, is_new
);
4147 snaplock
.decode_state(p
, is_new
);
4148 flocklock
.decode_state(p
, is_new
);
4149 policylock
.decode_state(p
, is_new
);
4152 decode(need_recover
, p
);
4153 if (need_recover
&& is_new
) {
4154 // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
4155 // and change the object when replaying unsafe requests.
4156 authlock
.mark_need_recover();
4157 linklock
.mark_need_recover();
4158 dirfragtreelock
.mark_need_recover();
4159 filelock
.mark_need_recover();
4160 nestlock
.mark_need_recover();
4161 xattrlock
.mark_need_recover();
4162 snaplock
.mark_need_recover();
4163 flocklock
.mark_need_recover();
4164 policylock
.mark_need_recover();
4168 void CInode::_decode_locks_rejoin(bufferlist::const_iterator
& p
, MDSContext::vec
& waiters
,
4169 list
<SimpleLock
*>& eval_locks
, bool survivor
)
4171 authlock
.decode_state_rejoin(p
, waiters
, survivor
);
4172 linklock
.decode_state_rejoin(p
, waiters
, survivor
);
4173 dirfragtreelock
.decode_state_rejoin(p
, waiters
, survivor
);
4174 filelock
.decode_state_rejoin(p
, waiters
, survivor
);
4175 nestlock
.decode_state_rejoin(p
, waiters
, survivor
);
4176 xattrlock
.decode_state_rejoin(p
, waiters
, survivor
);
4177 snaplock
.decode_state_rejoin(p
, waiters
, survivor
);
4178 flocklock
.decode_state_rejoin(p
, waiters
, survivor
);
4179 policylock
.decode_state_rejoin(p
, waiters
, survivor
);
4181 if (!dirfragtreelock
.is_stable() && !dirfragtreelock
.is_wrlocked())
4182 eval_locks
.push_back(&dirfragtreelock
);
4183 if (!filelock
.is_stable() && !filelock
.is_wrlocked())
4184 eval_locks
.push_back(&filelock
);
4185 if (!nestlock
.is_stable() && !nestlock
.is_wrlocked())
4186 eval_locks
.push_back(&nestlock
);
4192 void CInode::encode_export(bufferlist
& bl
)
4194 ENCODE_START(5, 4, bl
);
4195 _encode_base(bl
, mdcache
->mds
->mdsmap
->get_up_features());
4201 encode(get_replicas(), bl
);
4203 // include scatterlock info for any bounding CDirs
4204 bufferlist bounding
;
4206 for (const auto &p
: dirfrags
) {
4207 CDir
*dir
= p
.second
;
4208 if (dir
->state_test(CDir::STATE_EXPORTBOUND
)) {
4209 encode(p
.first
, bounding
);
4210 encode(dir
->fnode
.fragstat
, bounding
);
4211 encode(dir
->fnode
.accounted_fragstat
, bounding
);
4212 encode(dir
->fnode
.rstat
, bounding
);
4213 encode(dir
->fnode
.accounted_rstat
, bounding
);
4214 dout(10) << " encoded fragstat/rstat info for " << *dir
<< dendl
;
4217 encode(bounding
, bl
);
4219 _encode_locks_full(bl
);
4221 _encode_file_locks(bl
);
4225 get(PIN_TEMPEXPORTING
);
4228 void CInode::finish_export()
4230 state
&= MASK_STATE_EXPORT_KEPT
;
4235 //dirlock.clear_updated();
4239 put(PIN_TEMPEXPORTING
);
4242 void CInode::decode_import(bufferlist::const_iterator
& p
,
4251 state_set(STATE_AUTH
| (s
& MASK_STATE_EXPORTED
));
4257 if (is_dirty_parent()) {
4258 get(PIN_DIRTYPARENT
);
4259 mark_dirty_parent(ls
);
4264 decode(get_replicas(), p
);
4265 if (is_replicated())
4266 get(PIN_REPLICATED
);
4269 // decode fragstat info on bounding cdirs
4270 bufferlist bounding
;
4271 decode(bounding
, p
);
4272 auto q
= bounding
.cbegin();
4276 CDir
*dir
= get_dirfrag(fg
);
4277 ceph_assert(dir
); // we should have all bounds open
4279 // Only take the remote's fragstat/rstat if we are non-auth for
4280 // this dirfrag AND the lock is NOT in a scattered (MIX) state.
4281 // We know lock is stable, and MIX is the only state in which
4282 // the inode auth (who sent us this data) may not have the best
4285 // HMM: Are there cases where dir->is_auth() is an insufficient
4286 // check because the dirfrag is under migration? That implies
4287 // it is frozen (and in a SYNC or LOCK state). FIXME.
4289 if (dir
->is_auth() ||
4290 filelock
.get_state() == LOCK_MIX
) {
4291 dout(10) << " skipped fragstat info for " << *dir
<< dendl
;
4296 decode(dir
->fnode
.fragstat
, q
);
4297 decode(dir
->fnode
.accounted_fragstat
, q
);
4298 dout(10) << " took fragstat info for " << *dir
<< dendl
;
4300 if (dir
->is_auth() ||
4301 nestlock
.get_state() == LOCK_MIX
) {
4302 dout(10) << " skipped rstat info for " << *dir
<< dendl
;
4307 decode(dir
->fnode
.rstat
, q
);
4308 decode(dir
->fnode
.accounted_rstat
, q
);
4309 dout(10) << " took rstat info for " << *dir
<< dendl
;
4313 _decode_locks_full(p
);
4315 _decode_file_locks(p
);
4321 void InodeStoreBase::dump(Formatter
*f
) const
4324 f
->dump_string("symlink", symlink
);
4326 f
->open_array_section("xattrs");
4327 for (const auto& [key
, val
] : xattrs
) {
4328 f
->open_object_section("xattr");
4329 f
->dump_string("key", key
);
4330 std::string
v(val
.c_str(), val
.length());
4331 f
->dump_string("val", v
);
4335 f
->open_object_section("dirfragtree");
4336 dirfragtree
.dump(f
);
4337 f
->close_section(); // dirfragtree
4339 f
->open_array_section("old_inodes");
4340 for (const auto &p
: old_inodes
) {
4341 f
->open_object_section("old_inode");
4342 // The key is the last snapid, the first is in the mempool_old_inode
4343 f
->dump_int("last", p
.first
);
4345 f
->close_section(); // old_inode
4347 f
->close_section(); // old_inodes
4349 f
->dump_unsigned("oldest_snap", oldest_snap
);
4350 f
->dump_unsigned("damage_flags", damage_flags
);
4354 void InodeStore::generate_test_instances(std::list
<InodeStore
*> &ls
)
4356 InodeStore
*populated
= new InodeStore
;
4357 populated
->inode
.ino
= 0xdeadbeef;
4358 populated
->symlink
= "rhubarb";
4359 ls
.push_back(populated
);
4362 void InodeStoreBare::generate_test_instances(std::list
<InodeStoreBare
*> &ls
)
4364 InodeStoreBare
*populated
= new InodeStoreBare
;
4365 populated
->inode
.ino
= 0xdeadbeef;
4366 populated
->symlink
= "rhubarb";
4367 ls
.push_back(populated
);
4370 void CInode::validate_disk_state(CInode::validated_data
*results
,
4373 class ValidationContinuation
: public MDSContinuation
{
4377 CInode::validated_data
*results
;
4389 ValidationContinuation(CInode
*i
,
4390 CInode::validated_data
*data_r
,
4392 MDSContinuation(i
->mdcache
->mds
->server
),
4397 set_callback(START
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_start
));
4398 set_callback(BACKTRACE
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_backtrace
));
4399 set_callback(INODE
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_inode_disk
));
4400 set_callback(DIRFRAGS
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_dirfrags
));
4401 set_callback(SNAPREALM
, static_cast<Continuation::stagePtr
>(&ValidationContinuation::_snaprealm
));
4404 ~ValidationContinuation() override
{
4407 in
->mdcache
->num_shadow_inodes
--;
4412 * Fetch backtrace and set tag if tag is non-empty
4414 void fetch_backtrace_and_tag(CInode
*in
,
4415 std::string_view tag
, bool is_internal
,
4416 Context
*fin
, int *bt_r
, bufferlist
*bt
)
4418 const int64_t pool
= in
->get_backtrace_pool();
4419 object_t oid
= CInode::get_object_name(in
->ino(), frag_t(), "");
4421 ObjectOperation fetch
;
4422 fetch
.getxattr("parent", bt
, bt_r
);
4423 in
->mdcache
->mds
->objecter
->read(oid
, object_locator_t(pool
), fetch
, CEPH_NOSNAP
,
4427 ObjectOperation scrub_tag
;
4429 encode(tag
, tag_bl
);
4430 scrub_tag
.setxattr("scrub_tag", tag_bl
);
4432 in
->mdcache
->mds
->objecter
->mutate(oid
, object_locator_t(pool
), scrub_tag
, snapc
,
4433 ceph::real_clock::now(),
4438 bool _start(int rval
) {
4439 if (in
->is_dirty()) {
4440 MDCache
*mdcache
= in
->mdcache
;
4441 mempool_inode
& inode
= in
->inode
;
4442 dout(20) << "validating a dirty CInode; results will be inconclusive"
4445 if (in
->is_symlink()) {
4446 // there's nothing to do for symlinks!
4450 // prefetch snaprealm's past parents
4451 if (in
->snaprealm
&& !in
->snaprealm
->have_past_parents_open())
4452 in
->snaprealm
->open_parents(nullptr);
4454 C_OnFinisher
*conf
= new C_OnFinisher(get_io_callback(BACKTRACE
),
4455 in
->mdcache
->mds
->finisher
);
4457 std::string_view tag
= in
->scrub_infop
->header
->get_tag();
4458 bool is_internal
= in
->scrub_infop
->header
->is_internal_tag();
4459 // Rather than using the usual CInode::fetch_backtrace,
4460 // use a special variant that optionally writes a tag in the same
4462 fetch_backtrace_and_tag(in
, tag
, is_internal
, conf
, &results
->backtrace
.ondisk_read_retval
, &bl
);
4466 bool _backtrace(int rval
) {
4467 // set up basic result reporting and make sure we got the data
4468 results
->performed_validation
= true; // at least, some of it!
4469 results
->backtrace
.checked
= true;
4471 const int64_t pool
= in
->get_backtrace_pool();
4472 inode_backtrace_t
& memory_backtrace
= results
->backtrace
.memory_value
;
4473 in
->build_backtrace(pool
, memory_backtrace
);
4474 bool equivalent
, divergent
;
4477 MDCache
*mdcache
= in
->mdcache
; // For the benefit of dout
4478 const mempool_inode
& inode
= in
->inode
; // For the benefit of dout
4480 // Ignore rval because it's the result of a FAILOK operation
4481 // from fetch_backtrace_and_tag: the real result is in
4482 // backtrace.ondisk_read_retval
4483 dout(20) << "ondisk_read_retval: " << results
->backtrace
.ondisk_read_retval
<< dendl
;
4484 if (results
->backtrace
.ondisk_read_retval
!= 0) {
4485 results
->backtrace
.error_str
<< "failed to read off disk; see retval";
4489 // extract the backtrace, and compare it to a newly-constructed one
4491 auto p
= bl
.cbegin();
4493 decode(results
->backtrace
.ondisk_value
, p
);
4494 dout(10) << "decoded " << bl
.length() << " bytes of backtrace successfully" << dendl
;
4495 } catch (buffer::error
&) {
4496 if (results
->backtrace
.ondisk_read_retval
== 0 && rval
!= 0) {
4497 // Cases where something has clearly gone wrong with the overall
4498 // fetch op, though we didn't get a nonzero rc from the getxattr
4499 // operation. e.g. object missing.
4500 results
->backtrace
.ondisk_read_retval
= rval
;
4502 results
->backtrace
.error_str
<< "failed to decode on-disk backtrace ("
4503 << bl
.length() << " bytes)!";
4507 memory_newer
= memory_backtrace
.compare(results
->backtrace
.ondisk_value
,
4508 &equivalent
, &divergent
);
4510 if (divergent
|| memory_newer
< 0) {
4511 // we're divergent, or on-disk version is newer
4512 results
->backtrace
.error_str
<< "On-disk backtrace is divergent or newer";
4514 results
->backtrace
.passed
= true;
4518 if (!results
->backtrace
.passed
&& in
->scrub_infop
->header
->get_repair()) {
4520 in
->make_path_string(path
);
4521 in
->mdcache
->mds
->clog
->warn() << "bad backtrace on inode " << in
->ino()
4522 << "(" << path
<< "), rewriting it";
4523 in
->mark_dirty_parent(in
->mdcache
->mds
->mdlog
->get_current_segment(),
4525 // Flag that we repaired this BT so that it won't go into damagetable
4526 results
->backtrace
.repaired
= true;
4529 // If the inode's number was free in the InoTable, fix that
4532 InoTable
*inotable
= mdcache
->mds
->inotable
;
4534 dout(10) << "scrub: inotable ino = " << inode
.ino
<< dendl
;
4535 dout(10) << "scrub: inotable free says "
4536 << inotable
->is_marked_free(inode
.ino
) << dendl
;
4538 if (inotable
->is_marked_free(inode
.ino
)) {
4539 LogChannelRef clog
= in
->mdcache
->mds
->clog
;
4540 clog
->error() << "scrub: inode wrongly marked free: " << inode
.ino
;
4542 if (in
->scrub_infop
->header
->get_repair()) {
4543 bool repaired
= inotable
->repair(inode
.ino
);
4545 clog
->error() << "inode table repaired for inode: " << inode
.ino
;
4549 clog
->error() << "Cannot repair inotable while other operations"
4558 return validate_directory_data();
4560 // TODO: validate on-disk inode for normal files
4561 return check_inode_snaprealm();
4565 bool validate_directory_data() {
4566 ceph_assert(in
->is_dir());
4568 if (in
->is_base()) {
4570 shadow_in
= new CInode(in
->mdcache
);
4571 in
->mdcache
->create_unlinked_system_inode(shadow_in
, in
->inode
.ino
, in
->inode
.mode
);
4572 in
->mdcache
->num_shadow_inodes
++;
4574 shadow_in
->fetch(get_internal_callback(INODE
));
4577 // TODO: validate on-disk inode for non-base directories
4578 results
->inode
.passed
= true;
4579 return check_dirfrag_rstats();
4583 bool _inode_disk(int rval
) {
4584 results
->inode
.checked
= true;
4585 results
->inode
.ondisk_read_retval
= rval
;
4586 results
->inode
.ondisk_value
= shadow_in
->inode
;
4587 results
->inode
.memory_value
= in
->inode
;
4589 mempool_inode
& si
= shadow_in
->inode
;
4590 mempool_inode
& i
= in
->inode
;
4591 if (si
.version
> i
.version
) {
4593 results
->inode
.error_str
<< "On-disk inode is newer than in-memory one; ";
4596 bool divergent
= false;
4597 int r
= i
.compare(si
, &divergent
);
4598 results
->inode
.passed
= !divergent
&& r
>= 0;
4599 if (!results
->inode
.passed
) {
4600 results
->inode
.error_str
<<
4601 "On-disk inode is divergent or newer than in-memory one; ";
4606 return check_dirfrag_rstats();
4609 bool check_dirfrag_rstats() {
4610 MDSGatherBuilder
gather(g_ceph_context
);
4612 in
->dirfragtree
.get_leaves(leaves
);
4613 for (const auto& leaf
: leaves
) {
4614 CDir
*dir
= in
->get_or_open_dirfrag(in
->mdcache
, leaf
);
4616 if (!dir
->scrub_infop
->header
)
4617 dir
->scrub_infop
->header
= in
->scrub_infop
->header
;
4618 if (dir
->is_complete()) {
4621 dir
->scrub_infop
->need_scrub_local
= true;
4622 dir
->fetch(gather
.new_sub(), false);
4625 if (gather
.has_subs()) {
4626 gather
.set_finisher(get_internal_callback(DIRFRAGS
));
4630 return immediate(DIRFRAGS
, 0);
4634 bool _dirfrags(int rval
) {
4635 int frags_errors
= 0;
4636 // basic reporting setup
4637 results
->raw_stats
.checked
= true;
4638 results
->raw_stats
.ondisk_read_retval
= rval
;
4640 results
->raw_stats
.memory_value
.dirstat
= in
->inode
.dirstat
;
4641 results
->raw_stats
.memory_value
.rstat
= in
->inode
.rstat
;
4642 frag_info_t
& dir_info
= results
->raw_stats
.ondisk_value
.dirstat
;
4643 nest_info_t
& nest_info
= results
->raw_stats
.ondisk_value
.rstat
;
4646 results
->raw_stats
.error_str
<< "Failed to read dirfrags off disk";
4650 // check each dirfrag...
4651 for (const auto &p
: in
->dirfrags
) {
4652 CDir
*dir
= p
.second
;
4653 ceph_assert(dir
->get_version() > 0);
4654 nest_info
.add(dir
->fnode
.accounted_rstat
);
4655 dir_info
.add(dir
->fnode
.accounted_fragstat
);
4656 if (dir
->scrub_infop
->pending_scrub_error
) {
4657 dir
->scrub_infop
->pending_scrub_error
= false;
4658 if (dir
->scrub_infop
->header
->get_repair()) {
4659 results
->raw_stats
.repaired
= true;
4660 results
->raw_stats
.error_str
4661 << "dirfrag(" << p
.first
<< ") has bad stats (will be fixed); ";
4663 results
->raw_stats
.error_str
4664 << "dirfrag(" << p
.first
<< ") has bad stats; ";
4669 nest_info
.rsubdirs
++; // it gets one to account for self
4670 if (const sr_t
*srnode
= in
->get_projected_srnode(); srnode
)
4671 nest_info
.rsnaps
+= srnode
->snaps
.size();
4673 // ...and that their sum matches our inode settings
4674 if (!dir_info
.same_sums(in
->inode
.dirstat
) ||
4675 !nest_info
.same_sums(in
->inode
.rstat
)) {
4676 if (in
->scrub_infop
->header
->get_repair()) {
4677 results
->raw_stats
.error_str
4678 << "freshly-calculated rstats don't match existing ones (will be fixed)";
4679 in
->mdcache
->repair_inode_stats(in
);
4680 results
->raw_stats
.repaired
= true;
4682 results
->raw_stats
.error_str
4683 << "freshly-calculated rstats don't match existing ones";
4687 if (frags_errors
> 0)
4690 results
->raw_stats
.passed
= true;
4693 return check_inode_snaprealm();
4696 bool check_inode_snaprealm() {
4700 if (!in
->snaprealm
->have_past_parents_open()) {
4701 in
->snaprealm
->open_parents(get_internal_callback(SNAPREALM
));
4704 return immediate(SNAPREALM
, 0);
4708 bool _snaprealm(int rval
) {
4710 if (in
->snaprealm
->past_parents_dirty
||
4711 !in
->get_projected_srnode()->past_parents
.empty()) {
4712 // temporarily store error in field of on-disk inode validation temporarily
4713 results
->inode
.checked
= true;
4714 results
->inode
.passed
= false;
4715 if (in
->scrub_infop
->header
->get_repair()) {
4716 results
->inode
.error_str
<< "Inode has old format snaprealm (will upgrade)";
4717 results
->inode
.repaired
= true;
4718 in
->mdcache
->upgrade_inode_snaprealm(in
);
4720 results
->inode
.error_str
<< "Inode has old format snaprealm";
4726 void _done() override
{
4727 if ((!results
->raw_stats
.checked
|| results
->raw_stats
.passed
) &&
4728 (!results
->backtrace
.checked
|| results
->backtrace
.passed
) &&
4729 (!results
->inode
.checked
|| results
->inode
.passed
))
4730 results
->passed_validation
= true;
4732 // Flag that we did some repair work so that our repair operation
4733 // can be flushed at end of scrub
4734 if (results
->backtrace
.repaired
||
4735 results
->inode
.repaired
||
4736 results
->raw_stats
.repaired
)
4737 in
->scrub_infop
->header
->set_repaired();
4739 fin
->complete(get_rval());
4744 dout(10) << "scrub starting validate_disk_state on " << *this << dendl
;
4745 ValidationContinuation
*vc
= new ValidationContinuation(this,
4751 void CInode::validated_data::dump(Formatter
*f
) const
4753 f
->open_object_section("results");
4755 f
->dump_bool("performed_validation", performed_validation
);
4756 f
->dump_bool("passed_validation", passed_validation
);
4757 f
->open_object_section("backtrace");
4759 f
->dump_bool("checked", backtrace
.checked
);
4760 f
->dump_bool("passed", backtrace
.passed
);
4761 f
->dump_int("read_ret_val", backtrace
.ondisk_read_retval
);
4762 f
->dump_stream("ondisk_value") << backtrace
.ondisk_value
;
4763 f
->dump_stream("memoryvalue") << backtrace
.memory_value
;
4764 f
->dump_string("error_str", backtrace
.error_str
.str());
4766 f
->close_section(); // backtrace
4767 f
->open_object_section("raw_stats");
4769 f
->dump_bool("checked", raw_stats
.checked
);
4770 f
->dump_bool("passed", raw_stats
.passed
);
4771 f
->dump_int("read_ret_val", raw_stats
.ondisk_read_retval
);
4772 f
->dump_stream("ondisk_value.dirstat") << raw_stats
.ondisk_value
.dirstat
;
4773 f
->dump_stream("ondisk_value.rstat") << raw_stats
.ondisk_value
.rstat
;
4774 f
->dump_stream("memory_value.dirrstat") << raw_stats
.memory_value
.dirstat
;
4775 f
->dump_stream("memory_value.rstat") << raw_stats
.memory_value
.rstat
;
4776 f
->dump_string("error_str", raw_stats
.error_str
.str());
4778 f
->close_section(); // raw_stats
4779 // dump failure return code
4781 if (backtrace
.checked
&& backtrace
.ondisk_read_retval
)
4782 rc
= backtrace
.ondisk_read_retval
;
4783 if (inode
.checked
&& inode
.ondisk_read_retval
)
4784 rc
= inode
.ondisk_read_retval
;
4785 if (raw_stats
.checked
&& raw_stats
.ondisk_read_retval
)
4786 rc
= raw_stats
.ondisk_read_retval
;
4787 f
->dump_int("return_code", rc
);
4789 f
->close_section(); // results
4792 bool CInode::validated_data::all_damage_repaired() const
4795 (raw_stats
.checked
&& !raw_stats
.passed
&& !raw_stats
.repaired
)
4797 (backtrace
.checked
&& !backtrace
.passed
&& !backtrace
.repaired
)
4799 (inode
.checked
&& !inode
.passed
&& !inode
.repaired
);
4804 void CInode::dump(Formatter
*f
, int flags
) const
4806 if (flags
& DUMP_PATH
) {
4808 make_path_string(path
, true);
4811 f
->dump_string("path", path
);
4814 if (flags
& DUMP_INODE_STORE_BASE
)
4815 InodeStoreBase::dump(f
);
4817 if (flags
& DUMP_MDS_CACHE_OBJECT
)
4818 MDSCacheObject::dump(f
);
4820 if (flags
& DUMP_LOCKS
) {
4821 f
->open_object_section("versionlock");
4822 versionlock
.dump(f
);
4825 f
->open_object_section("authlock");
4829 f
->open_object_section("linklock");
4833 f
->open_object_section("dirfragtreelock");
4834 dirfragtreelock
.dump(f
);
4837 f
->open_object_section("filelock");
4841 f
->open_object_section("xattrlock");
4845 f
->open_object_section("snaplock");
4849 f
->open_object_section("nestlock");
4853 f
->open_object_section("flocklock");
4857 f
->open_object_section("policylock");
4862 if (flags
& DUMP_STATE
) {
4863 f
->open_array_section("states");
4864 MDSCacheObject::dump_states(f
);
4865 if (state_test(STATE_EXPORTING
))
4866 f
->dump_string("state", "exporting");
4867 if (state_test(STATE_OPENINGDIR
))
4868 f
->dump_string("state", "openingdir");
4869 if (state_test(STATE_FREEZING
))
4870 f
->dump_string("state", "freezing");
4871 if (state_test(STATE_FROZEN
))
4872 f
->dump_string("state", "frozen");
4873 if (state_test(STATE_AMBIGUOUSAUTH
))
4874 f
->dump_string("state", "ambiguousauth");
4875 if (state_test(STATE_EXPORTINGCAPS
))
4876 f
->dump_string("state", "exportingcaps");
4877 if (state_test(STATE_NEEDSRECOVER
))
4878 f
->dump_string("state", "needsrecover");
4879 if (state_test(STATE_PURGING
))
4880 f
->dump_string("state", "purging");
4881 if (state_test(STATE_DIRTYPARENT
))
4882 f
->dump_string("state", "dirtyparent");
4883 if (state_test(STATE_DIRTYRSTAT
))
4884 f
->dump_string("state", "dirtyrstat");
4885 if (state_test(STATE_STRAYPINNED
))
4886 f
->dump_string("state", "straypinned");
4887 if (state_test(STATE_FROZENAUTHPIN
))
4888 f
->dump_string("state", "frozenauthpin");
4889 if (state_test(STATE_DIRTYPOOL
))
4890 f
->dump_string("state", "dirtypool");
4891 if (state_test(STATE_ORPHAN
))
4892 f
->dump_string("state", "orphan");
4893 if (state_test(STATE_MISSINGOBJS
))
4894 f
->dump_string("state", "missingobjs");
4898 if (flags
& DUMP_CAPS
) {
4899 f
->open_array_section("client_caps");
4900 for (const auto &p
: client_caps
) {
4901 auto &client
= p
.first
;
4902 auto cap
= &p
.second
;
4903 f
->open_object_section("client_cap");
4904 f
->dump_int("client_id", client
.v
);
4905 f
->dump_string("pending", ccap_string(cap
->pending()));
4906 f
->dump_string("issued", ccap_string(cap
->issued()));
4907 f
->dump_string("wanted", ccap_string(cap
->wanted()));
4908 f
->dump_int("last_sent", cap
->get_last_seq());
4913 f
->dump_int("loner", loner_cap
.v
);
4914 f
->dump_int("want_loner", want_loner_cap
.v
);
4916 f
->open_array_section("mds_caps_wanted");
4917 for (const auto &p
: mds_caps_wanted
) {
4918 f
->open_object_section("mds_cap_wanted");
4919 f
->dump_int("rank", p
.first
);
4920 f
->dump_string("cap", ccap_string(p
.second
));
4926 if (flags
& DUMP_DIRFRAGS
) {
4927 f
->open_array_section("dirfrags");
4928 auto&& dfs
= get_dirfrags();
4929 for(const auto &dir
: dfs
) {
4930 f
->open_object_section("dir");
4931 dir
->dump(f
, CDir::DUMP_DEFAULT
| CDir::DUMP_ITEMS
);
4932 dir
->check_rstats();
4939 /****** Scrub Stuff *****/
4940 void CInode::scrub_info_create() const
4942 dout(25) << __func__
<< dendl
;
4943 ceph_assert(!scrub_infop
);
4945 // break out of const-land to set up implicit initial state
4946 CInode
*me
= const_cast<CInode
*>(this);
4947 mempool_inode
*in
= me
->get_projected_inode();
4949 scrub_info_t
*si
= new scrub_info_t();
4950 si
->scrub_start_stamp
= si
->last_scrub_stamp
= in
->last_scrub_stamp
;
4951 si
->scrub_start_version
= si
->last_scrub_version
= in
->last_scrub_version
;
4953 me
->scrub_infop
= si
;
4956 void CInode::scrub_maybe_delete_info()
4959 !scrub_infop
->scrub_in_progress
&&
4960 !scrub_infop
->last_scrub_dirty
) {
4966 void CInode::scrub_initialize(CDentry
*scrub_parent
,
4967 ScrubHeaderRef
& header
,
4970 dout(20) << __func__
<< " with scrub_version " << get_version() << dendl
;
4971 if (scrub_is_in_progress()) {
4972 dout(20) << __func__
<< " inode moved during scrub, reinitializing "
4974 ceph_assert(scrub_infop
->scrub_parent
);
4975 CDentry
*dn
= scrub_infop
->scrub_parent
;
4976 CDir
*dir
= dn
->dir
;
4977 dn
->put(CDentry::PIN_SCRUBPARENT
);
4978 ceph_assert(dir
->scrub_infop
&& dir
->scrub_infop
->directory_scrubbing
);
4979 dir
->scrub_infop
->directories_scrubbing
.erase(dn
->key());
4980 dir
->scrub_infop
->others_scrubbing
.erase(dn
->key());
4984 scrub_infop
= new scrub_info_t();
4986 if (get_projected_inode()->is_dir()) {
4987 // fill in dirfrag_stamps with initial state
4989 dirfragtree
.get_leaves(leaves
);
4990 for (const auto& leaf
: leaves
) {
4991 if (header
->get_force())
4992 scrub_infop
->dirfrag_stamps
[leaf
].reset();
4994 scrub_infop
->dirfrag_stamps
[leaf
];
4999 scrub_parent
->get(CDentry::PIN_SCRUBPARENT
);
5000 scrub_infop
->scrub_parent
= scrub_parent
;
5001 scrub_infop
->on_finish
= f
;
5002 scrub_infop
->scrub_in_progress
= true;
5003 scrub_infop
->children_scrubbed
= false;
5004 scrub_infop
->header
= header
;
5006 scrub_infop
->scrub_start_version
= get_version();
5007 scrub_infop
->scrub_start_stamp
= ceph_clock_now();
5008 // right now we don't handle remote inodes
5011 int CInode::scrub_dirfrag_next(frag_t
* out_dirfrag
)
5013 dout(20) << __func__
<< dendl
;
5014 ceph_assert(scrub_is_in_progress());
5020 std::map
<frag_t
, scrub_stamp_info_t
>::iterator i
=
5021 scrub_infop
->dirfrag_stamps
.begin();
5023 while (i
!= scrub_infop
->dirfrag_stamps
.end()) {
5024 if (i
->second
.scrub_start_version
< scrub_infop
->scrub_start_version
) {
5025 i
->second
.scrub_start_version
= get_projected_version();
5026 i
->second
.scrub_start_stamp
= ceph_clock_now();
5027 *out_dirfrag
= i
->first
;
5028 dout(20) << " return frag " << *out_dirfrag
<< dendl
;
5034 dout(20) << " no frags left, ENOENT " << dendl
;
5038 void CInode::scrub_dirfrags_scrubbing(frag_vec_t
* out_dirfrags
)
5040 ceph_assert(out_dirfrags
!= NULL
);
5041 ceph_assert(scrub_infop
!= NULL
);
5043 out_dirfrags
->clear();
5044 std::map
<frag_t
, scrub_stamp_info_t
>::iterator i
=
5045 scrub_infop
->dirfrag_stamps
.begin();
5047 while (i
!= scrub_infop
->dirfrag_stamps
.end()) {
5048 if (i
->second
.scrub_start_version
>= scrub_infop
->scrub_start_version
) {
5049 if (i
->second
.last_scrub_version
< scrub_infop
->scrub_start_version
)
5050 out_dirfrags
->push_back(i
->first
);
5059 void CInode::scrub_dirfrag_finished(frag_t dirfrag
)
5061 dout(20) << __func__
<< " on frag " << dirfrag
<< dendl
;
5062 ceph_assert(scrub_is_in_progress());
5064 std::map
<frag_t
, scrub_stamp_info_t
>::iterator i
=
5065 scrub_infop
->dirfrag_stamps
.find(dirfrag
);
5066 ceph_assert(i
!= scrub_infop
->dirfrag_stamps
.end());
5068 scrub_stamp_info_t
&si
= i
->second
;
5069 si
.last_scrub_stamp
= si
.scrub_start_stamp
;
5070 si
.last_scrub_version
= si
.scrub_start_version
;
5073 void CInode::scrub_aborted(MDSContext
**c
) {
5074 dout(20) << __func__
<< dendl
;
5075 ceph_assert(scrub_is_in_progress());
5078 std::swap(*c
, scrub_infop
->on_finish
);
5080 if (scrub_infop
->scrub_parent
) {
5081 CDentry
*dn
= scrub_infop
->scrub_parent
;
5082 scrub_infop
->scrub_parent
= NULL
;
5083 dn
->dir
->scrub_dentry_finished(dn
);
5084 dn
->put(CDentry::PIN_SCRUBPARENT
);
5088 scrub_infop
= nullptr;
5091 void CInode::scrub_finished(MDSContext
**c
) {
5092 dout(20) << __func__
<< dendl
;
5093 ceph_assert(scrub_is_in_progress());
5094 for (std::map
<frag_t
, scrub_stamp_info_t
>::iterator i
=
5095 scrub_infop
->dirfrag_stamps
.begin();
5096 i
!= scrub_infop
->dirfrag_stamps
.end();
5098 if(i
->second
.last_scrub_version
!= i
->second
.scrub_start_version
) {
5099 derr
<< i
->second
.last_scrub_version
<< " != "
5100 << i
->second
.scrub_start_version
<< dendl
;
5102 ceph_assert(i
->second
.last_scrub_version
== i
->second
.scrub_start_version
);
5105 scrub_infop
->last_scrub_version
= scrub_infop
->scrub_start_version
;
5106 scrub_infop
->last_scrub_stamp
= scrub_infop
->scrub_start_stamp
;
5107 scrub_infop
->last_scrub_dirty
= true;
5108 scrub_infop
->scrub_in_progress
= false;
5110 if (scrub_infop
->scrub_parent
) {
5111 CDentry
*dn
= scrub_infop
->scrub_parent
;
5112 scrub_infop
->scrub_parent
= NULL
;
5113 dn
->dir
->scrub_dentry_finished(dn
);
5114 dn
->put(CDentry::PIN_SCRUBPARENT
);
5117 *c
= scrub_infop
->on_finish
;
5118 scrub_infop
->on_finish
= NULL
;
5120 if (scrub_infop
->header
->get_origin() == this) {
5121 // We are at the point that a tagging scrub was initiated
5122 LogChannelRef clog
= mdcache
->mds
->clog
;
5123 clog
->info() << "scrub complete with tag '"
5124 << scrub_infop
->header
->get_tag() << "'";
5128 int64_t CInode::get_backtrace_pool() const
5131 return mdcache
->mds
->mdsmap
->get_metadata_pool();
5133 // Files are required to have an explicit layout that specifies
5135 ceph_assert(inode
.layout
.pool_id
!= -1);
5136 return inode
.layout
.pool_id
;
5140 void CInode::maybe_export_pin(bool update
)
5142 if (!g_conf()->mds_bal_export_pin
)
5144 if (!is_dir() || !is_normal())
5147 mds_rank_t export_pin
= get_export_pin(false);
5148 if (export_pin
== MDS_RANK_NONE
&& !update
)
5151 if (state_test(CInode::STATE_QUEUEDEXPORTPIN
))
5155 for (auto p
= dirfrags
.begin(); p
!= dirfrags
.end(); p
++) {
5156 CDir
*dir
= p
->second
;
5157 if (!dir
->is_auth())
5159 if (export_pin
!= MDS_RANK_NONE
) {
5160 if (dir
->is_subtree_root()) {
5161 // set auxsubtree bit or export it
5162 if (!dir
->state_test(CDir::STATE_AUXSUBTREE
) ||
5163 export_pin
!= dir
->get_dir_auth().first
)
5166 // create aux subtree or export it
5170 // clear aux subtrees ?
5171 queue
= dir
->state_test(CDir::STATE_AUXSUBTREE
);
5174 state_set(CInode::STATE_QUEUEDEXPORTPIN
);
5175 mdcache
->export_pin_queue
.insert(this);
5181 void CInode::set_export_pin(mds_rank_t rank
)
5183 ceph_assert(is_dir());
5184 ceph_assert(is_projected());
5185 get_projected_inode()->export_pin
= rank
;
5188 mds_rank_t
CInode::get_export_pin(bool inherit
) const
5190 /* An inode that is export pinned may not necessarily be a subtree root, we
5191 * need to traverse the parents. A base or system inode cannot be pinned.
5192 * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5193 * have a parent yet.
5195 const CInode
*in
= this;
5197 if (in
->is_system())
5199 const CDentry
*pdn
= in
->get_parent_dn();
5202 // ignore export pin for unlinked directory
5203 if (in
->get_inode().nlink
== 0)
5205 if (in
->get_inode().export_pin
>= 0)
5206 return in
->get_inode().export_pin
;
5210 in
= pdn
->get_dir()->inode
;
5212 return MDS_RANK_NONE
;
5215 bool CInode::is_exportable(mds_rank_t dest
) const
5217 mds_rank_t pin
= get_export_pin();
5220 } else if (pin
>= 0) {
5227 void CInode::get_nested_dirfrags(std::vector
<CDir
*>& v
) const
5229 for (const auto &p
: dirfrags
) {
5230 const auto& dir
= p
.second
;
5231 if (!dir
->is_subtree_root())
5236 void CInode::get_subtree_dirfrags(std::vector
<CDir
*>& v
) const
5238 for (const auto &p
: dirfrags
) {
5239 const auto& dir
= p
.second
;
5240 if (dir
->is_subtree_root())
// Mempool object-factory registration for CInode (pool mds_co, type name
// co_inode), so CInode allocations are tracked by the mempool accounting.
MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);