1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2015 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #include "common/perf_counters.h"
18 #include "mds/MDSRank.h"
19 #include "mds/MDCache.h"
20 #include "mds/MDLog.h"
22 #include "mds/CDentry.h"
23 #include "events/EUpdate.h"
24 #include "messages/MClientRequest.h"
26 #include "StrayManager.h"
28 #define dout_context g_ceph_context
29 #define dout_subsys ceph_subsys_mds
31 #define dout_prefix _prefix(_dout, mds)
32 static ostream
& _prefix(std::ostream
*_dout
, MDSRank
*mds
) {
33 return *_dout
<< "mds." << mds
->get_nodeid() << ".cache.strays ";
36 class StrayManagerIOContext
: public virtual MDSIOContextBase
{
39 MDSRank
*get_mds() override
44 explicit StrayManagerIOContext(StrayManager
*sm_
) : sm(sm_
) {}
47 class StrayManagerLogContext
: public virtual MDSLogContextBase
{
50 MDSRank
*get_mds() override
55 explicit StrayManagerLogContext(StrayManager
*sm_
) : sm(sm_
) {}
58 class StrayManagerContext
: public virtual MDSInternalContextBase
{
61 MDSRank
*get_mds() override
66 explicit StrayManagerContext(StrayManager
*sm_
) : sm(sm_
) {}
71 * Context wrapper for _purge_stray_purged completion
73 class C_IO_PurgeStrayPurged
: public StrayManagerIOContext
{
76 // How many ops_in_flight were allocated to this purge?
77 uint32_t ops_allowance
;
79 C_IO_PurgeStrayPurged(StrayManager
*sm_
, CDentry
*d
, bool oh
) :
80 StrayManagerIOContext(sm_
), dn(d
), only_head(oh
) { }
81 void finish(int r
) override
{
82 assert(r
== 0 || r
== -ENOENT
);
83 sm
->_purge_stray_purged(dn
, ops_allowance
, only_head
);
88 void StrayManager::purge(CDentry
*dn
)
90 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
91 CInode
*in
= dnl
->get_inode();
92 dout(10) << __func__
<< " " << *dn
<< " " << *in
<< dendl
;
93 assert(!dn
->is_replicated());
95 // CHEAT. there's no real need to journal our intent to purge, since
96 // that is implicit in the dentry's presence and non-use in the stray
97 // dir. on recovery, we'll need to re-eval all strays anyway.
99 SnapContext nullsnapc
;
102 item
.ino
= in
->inode
.ino
;
104 item
.action
= PurgeItem::PURGE_DIR
;
105 item
.fragtree
= in
->dirfragtree
;
107 item
.action
= PurgeItem::PURGE_FILE
;
109 const SnapContext
*snapc
;
110 SnapRealm
*realm
= in
->find_snaprealm();
112 dout(10) << " realm " << *realm
<< dendl
;
113 snapc
= &realm
->get_snap_context();
115 dout(10) << " NO realm, using null context" << dendl
;
117 assert(in
->last
== CEPH_NOSNAP
);
122 to
= in
->inode
.get_max_size();
123 to
= MAX(in
->inode
.size
, to
);
124 // when truncating a file, the filer does not delete stripe objects that are
125 // truncated to zero. so we need to purge stripe objects up to the max size
126 // the file has ever been.
127 to
= MAX(in
->inode
.max_size_ever
, to
);
130 inode_t
*pi
= in
->get_projected_inode();
133 item
.layout
= pi
->layout
;
134 item
.old_pools
= pi
->old_pools
;
138 purge_queue
.push(item
, new C_IO_PurgeStrayPurged(
142 class C_PurgeStrayLogged
: public StrayManagerLogContext
{
147 C_PurgeStrayLogged(StrayManager
*sm_
, CDentry
*d
, version_t v
, LogSegment
*s
) :
148 StrayManagerLogContext(sm_
), dn(d
), pdv(v
), ls(s
) { }
149 void finish(int r
) override
{
150 sm
->_purge_stray_logged(dn
, pdv
, ls
);
154 class C_TruncateStrayLogged
: public StrayManagerLogContext
{
158 C_TruncateStrayLogged(StrayManager
*sm
, CDentry
*d
, LogSegment
*s
) :
159 StrayManagerLogContext(sm
), dn(d
), ls(s
) { }
160 void finish(int r
) override
{
161 sm
->_truncate_stray_logged(dn
, ls
);
165 void StrayManager::_purge_stray_purged(
166 CDentry
*dn
, uint32_t ops_allowance
, bool only_head
)
168 CInode
*in
= dn
->get_projected_linkage()->get_inode();
169 dout(10) << "_purge_stray_purged " << *dn
<< " " << *in
<< dendl
;
171 logger
->inc(l_mdc_strays_enqueued
);
172 num_strays_enqueuing
--;
173 logger
->set(l_mdc_num_strays_enqueuing
, num_strays_enqueuing
);
176 /* This was a ::truncate */
177 EUpdate
*le
= new EUpdate(mds
->mdlog
, "purge_stray truncate");
178 mds
->mdlog
->start_entry(le
);
180 inode_t
*pi
= in
->project_inode();
182 pi
->max_size_ever
= 0;
183 pi
->client_ranges
.clear();
184 pi
->truncate_size
= 0;
185 pi
->truncate_from
= 0;
186 pi
->version
= in
->pre_dirty();
188 le
->metablob
.add_dir_context(dn
->dir
);
189 le
->metablob
.add_primary_dentry(dn
, in
, true);
191 mds
->mdlog
->submit_entry(le
,
192 new C_TruncateStrayLogged(
193 this, dn
, mds
->mdlog
->get_current_segment()));
195 if (in
->get_num_ref() != (int)in
->is_dirty() ||
196 dn
->get_num_ref() != (int)dn
->is_dirty() + !!in
->get_num_ref() + 1/*PIN_PURGING*/) {
197 // Nobody should be taking new references to an inode when it
198 // is being purged (aside from the dirty pins and PIN_PURGING checked above).
200 derr
<< "Rogue reference after purge to " << *dn
<< dendl
;
201 assert(0 == "rogue reference to purging inode");
205 version_t pdv
= dn
->pre_dirty();
206 dn
->push_projected_linkage(); // NULL
208 EUpdate
*le
= new EUpdate(mds
->mdlog
, "purge_stray");
209 mds
->mdlog
->start_entry(le
);
211 // update dirfrag fragstat, rstat
212 CDir
*dir
= dn
->get_dir();
213 fnode_t
*pf
= dir
->project_fnode();
214 pf
->version
= dir
->pre_dirty();
216 pf
->fragstat
.nsubdirs
--;
218 pf
->fragstat
.nfiles
--;
219 pf
->rstat
.sub(in
->inode
.accounted_rstat
);
221 le
->metablob
.add_dir_context(dn
->dir
);
222 EMetaBlob::dirlump
& dl
= le
->metablob
.add_dir(dn
->dir
, true);
223 le
->metablob
.add_null_dentry(dl
, dn
, true);
224 le
->metablob
.add_destroyed_inode(in
->ino());
226 mds
->mdlog
->submit_entry(le
, new C_PurgeStrayLogged(this, dn
, pdv
,
227 mds
->mdlog
->get_current_segment()));
229 logger
->set(l_mdc_num_strays
, num_strays
);
233 void StrayManager::_purge_stray_logged(CDentry
*dn
, version_t pdv
, LogSegment
*ls
)
235 CInode
*in
= dn
->get_linkage()->get_inode();
236 dout(10) << "_purge_stray_logged " << *dn
<< " " << *in
<< dendl
;
238 assert(!in
->state_test(CInode::STATE_RECOVERING
));
240 bool new_dn
= dn
->is_new();
243 assert(dn
->get_projected_linkage()->is_null());
244 dn
->dir
->unlink_inode(dn
, !new_dn
);
245 dn
->pop_projected_linkage();
246 dn
->mark_dirty(pdv
, ls
);
248 dn
->dir
->pop_and_dirty_projected_fnode(ls
);
250 in
->state_clear(CInode::STATE_ORPHAN
);
251 dn
->state_clear(CDentry::STATE_PURGING
| CDentry::STATE_PURGINGPINNED
);
252 dn
->put(CDentry::PIN_PURGING
);
256 dout(20) << " dn is new, removing" << dendl
;
258 dn
->dir
->remove_dentry(dn
);
264 in
->mdcache
->remove_inode(in
);
267 void StrayManager::enqueue(CDentry
*dn
, bool trunc
)
269 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
271 CInode
*in
= dnl
->get_inode();
274 /* We consider a stray to be purging as soon as it is enqueued, to avoid
275 * enqueuing it twice */
276 dn
->state_set(CDentry::STATE_PURGING
);
277 in
->state_set(CInode::STATE_PURGING
);
279 /* We must clear this as soon as enqueuing it, to prevent the journal
280 * expiry code from seeing a dirty parent and trying to write a backtrace */
282 if (in
->is_dirty_parent()) {
283 in
->clear_dirty_parent();
287 dout(20) << __func__
<< ": purging dn: " << *dn
<< dendl
;
289 if (!dn
->state_test(CDentry::STATE_PURGINGPINNED
)) {
290 dn
->get(CDentry::PIN_PURGING
);
291 dn
->state_set(CDentry::STATE_PURGINGPINNED
);
294 ++num_strays_enqueuing
;
295 logger
->set(l_mdc_num_strays_enqueuing
, num_strays_enqueuing
);
297 // Resources are available, acquire them and execute the purge
300 dout(10) << __func__
<< ": purging this dentry immediately: "
304 class C_OpenSnapParents
: public StrayManagerContext
{
308 C_OpenSnapParents(StrayManager
*sm_
, CDentry
*dn_
, bool t
) :
309 StrayManagerContext(sm_
), dn(dn_
), trunc(t
) { }
310 void finish(int r
) override
{
311 sm
->_enqueue(dn
, trunc
);
315 void StrayManager::_enqueue(CDentry
*dn
, bool trunc
)
319 CInode
*in
= dn
->get_linkage()->get_inode();
321 !in
->snaprealm
->have_past_parents_open() &&
322 !in
->snaprealm
->open_parents(new C_OpenSnapParents(this, dn
, trunc
))) {
323 // this can happen if the dentry had been trimmed from cache.
335 void StrayManager::advance_delayed()
340 for (elist
<CDentry
*>::iterator p
= delayed_eval_stray
.begin(); !p
.end(); ) {
343 dn
->item_stray
.remove_myself();
344 num_strays_delayed
--;
346 if (dn
->get_projected_linkage()->is_null()) {
347 /* A special case: a stray dentry can go null if its inode is being
348 * re-linked into another MDS's stray dir during a shutdown migration. */
349 dout(4) << __func__
<< ": delayed dentry is now null: " << *dn
<< dendl
;
353 const bool purging
= eval_stray(dn
);
355 derr
<< "Dentry " << *dn
<< " was purgeable but no longer is!" << dendl
;
357 * This can happen if a stray is purgeable, but has gained an extra
358 * reference by virtue of having its backtrace updated.
359 * FIXME perhaps we could simplify this further by
360 * avoiding writing the backtrace of purge-ready strays, so
361 * that this code could be more rigid?
365 logger
->set(l_mdc_num_strays_delayed
, num_strays_delayed
);
368 void StrayManager::set_num_strays(uint64_t num
)
372 logger
->set(l_mdc_num_strays
, num_strays
);
375 void StrayManager::notify_stray_created()
378 logger
->set(l_mdc_num_strays
, num_strays
);
379 logger
->inc(l_mdc_strays_created
);
382 void StrayManager::notify_stray_removed()
385 logger
->set(l_mdc_num_strays
, num_strays
);
388 struct C_EvalStray
: public StrayManagerContext
{
390 C_EvalStray(StrayManager
*sm_
, CDentry
*d
) : StrayManagerContext(sm_
), dn(d
) {}
391 void finish(int r
) override
{
396 struct C_MDC_EvalStray
: public StrayManagerContext
{
398 C_MDC_EvalStray(StrayManager
*sm_
, CDentry
*d
) : StrayManagerContext(sm_
), dn(d
) {}
399 void finish(int r
) override
{
404 bool StrayManager::_eval_stray(CDentry
*dn
, bool delay
)
406 dout(10) << "eval_stray " << *dn
<< dendl
;
407 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
408 assert(dnl
->is_primary());
409 dout(10) << " inode is " << *dnl
->get_inode() << dendl
;
410 CInode
*in
= dnl
->get_inode();
412 assert(!in
->state_test(CInode::STATE_REJOINUNDEF
));
414 // The only dentries eligible for purging are those
415 // in the stray directories
416 assert(dn
->get_dir()->get_inode()->is_stray());
418 // Inode may not pass through this function if it
419 // was already identified for purging (i.e. cannot
420 // call eval_stray() after purge())
421 assert(!dn
->state_test(CDentry::STATE_PURGING
));
423 if (!dn
->is_auth()) {
430 if (dn
->item_stray
.is_on_list()) {
434 dn
->item_stray
.remove_myself();
435 num_strays_delayed
--;
436 logger
->set(l_mdc_num_strays_delayed
, num_strays_delayed
);
440 if (in
->inode
.nlink
== 0) {
441 // past snaprealm parents imply snapped dentry remote links.
442 // only important for directories. normal file data snaps are handled
443 // by the object store.
445 if (!in
->snaprealm
->have_past_parents_open() &&
446 !in
->snaprealm
->open_parents(new C_MDC_EvalStray(this, dn
))) {
449 in
->snaprealm
->prune_past_parents();
450 in
->purge_stale_snap_data(in
->snaprealm
->get_snaps());
453 if (in
->snaprealm
&& in
->snaprealm
->has_past_parents()) {
454 dout(20) << " directory has past parents "
455 << in
->snaprealm
->srnode
.past_parents
<< dendl
;
456 if (in
->state_test(CInode::STATE_MISSINGOBJS
)) {
457 mds
->clog
->error() << "previous attempt at committing dirfrag of ino "
458 << in
->ino() << " has failed, missing object";
459 mds
->handle_write_error(-ENOENT
);
461 return false; // not until some snaps are deleted.
464 in
->mdcache
->clear_dirty_bits_for_stray(in
);
466 if (!in
->remote_parents
.empty()) {
467 // unlink any stale remote snap dentry.
468 for (compact_set
<CDentry
*>::iterator p
= in
->remote_parents
.begin();
469 p
!= in
->remote_parents
.end(); ) {
470 CDentry
*remote_dn
= *p
;
472 assert(remote_dn
->last
!= CEPH_NOSNAP
);
473 remote_dn
->unlink_remote(remote_dn
->get_linkage());
477 if (dn
->is_replicated()) {
478 dout(20) << " replicated" << dendl
;
481 if (dn
->is_any_leases() || in
->is_any_caps()) {
482 dout(20) << " caps | leases" << dendl
;
483 return false; // wait
485 if (in
->state_test(CInode::STATE_NEEDSRECOVER
) ||
486 in
->state_test(CInode::STATE_RECOVERING
)) {
487 dout(20) << " pending recovery" << dendl
;
488 return false; // don't mess with file size probing
490 if (in
->get_num_ref() > (int)in
->is_dirty() + (int)in
->is_dirty_parent()) {
491 dout(20) << " too many inode refs" << dendl
;
494 if (dn
->get_num_ref() > (int)dn
->is_dirty() + !!in
->get_num_ref()) {
495 dout(20) << " too many dn refs" << dendl
;
499 if (!dn
->item_stray
.is_on_list()) {
500 delayed_eval_stray
.push_back(&dn
->item_stray
);
501 num_strays_delayed
++;
502 logger
->set(l_mdc_num_strays_delayed
, num_strays_delayed
);
504 // don't purge multiversion inode with snap data
505 } else if (in
->snaprealm
&& in
->snaprealm
->has_past_parents() &&
506 !in
->old_inodes
.empty()) {
507 // A file with snapshots: we will truncate the HEAD revision
508 // but leave the metadata intact.
509 assert(!in
->is_dir());
510 dout(20) << " file has past parents "
511 << in
->snaprealm
->srnode
.past_parents
<< dendl
;
512 if (in
->is_file() && in
->get_projected_inode()->size
> 0) {
513 enqueue(dn
, true); // truncate head objects
516 // A straightforward file, ready to be purged. Enqueue it.
518 in
->close_dirfrags();
527 * Where a stray has some links, they should be remotes, check
528 * if we can do anything with them if we happen to have them in
531 _eval_stray_remote(dn
, NULL
);
536 void StrayManager::activate()
538 dout(10) << __func__
<< dendl
;
540 purge_queue
.activate();
543 bool StrayManager::eval_stray(CDentry
*dn
, bool delay
)
545 // avoid nested eval_stray
546 if (dn
->state_test(CDentry::STATE_EVALUATINGSTRAY
))
549 dn
->state_set(CDentry::STATE_EVALUATINGSTRAY
);
550 bool ret
= _eval_stray(dn
, delay
);
551 dn
->state_clear(CDentry::STATE_EVALUATINGSTRAY
);
555 void StrayManager::eval_remote(CDentry
*remote_dn
)
557 dout(10) << __func__
<< " " << *remote_dn
<< dendl
;
559 CDentry::linkage_t
*dnl
= remote_dn
->get_projected_linkage();
560 assert(dnl
->is_remote());
561 CInode
*in
= dnl
->get_inode();
564 dout(20) << __func__
<< ": no inode, cannot evaluate" << dendl
;
568 if (remote_dn
->last
!= CEPH_NOSNAP
) {
569 dout(20) << __func__
<< ": snap dentry, cannot evaluate" << dendl
;
574 CDentry
*primary_dn
= in
->get_projected_parent_dn();
575 assert(primary_dn
!= NULL
);
576 if (primary_dn
->get_dir()->get_inode()->is_stray()) {
577 _eval_stray_remote(primary_dn
, remote_dn
);
579 dout(20) << __func__
<< ": inode's primary dn not stray" << dendl
;
583 class C_RetryEvalRemote
: public StrayManagerContext
{
586 C_RetryEvalRemote(StrayManager
*sm_
, CDentry
*dn_
) :
587 StrayManagerContext(sm_
), dn(dn_
) {
588 dn
->get(CDentry::PIN_PTRWAITER
);
590 void finish(int r
) override
{
591 if (dn
->get_projected_linkage()->is_remote())
593 dn
->put(CDentry::PIN_PTRWAITER
);
597 void StrayManager::_eval_stray_remote(CDentry
*stray_dn
, CDentry
*remote_dn
)
599 dout(20) << __func__
<< " " << *stray_dn
<< dendl
;
600 assert(stray_dn
!= NULL
);
601 assert(stray_dn
->get_dir()->get_inode()->is_stray());
602 CDentry::linkage_t
*stray_dnl
= stray_dn
->get_projected_linkage();
603 assert(stray_dnl
->is_primary());
604 CInode
*stray_in
= stray_dnl
->get_inode();
605 assert(stray_in
->inode
.nlink
>= 1);
606 assert(stray_in
->last
== CEPH_NOSNAP
);
608 /* If no remote_dn hinted, pick one arbitrarily */
609 if (remote_dn
== NULL
) {
610 if (!stray_in
->remote_parents
.empty()) {
611 for (compact_set
<CDentry
*>::iterator p
= stray_in
->remote_parents
.begin();
612 p
!= stray_in
->remote_parents
.end();
614 if ((*p
)->last
== CEPH_NOSNAP
&& !(*p
)->is_projected()) {
615 if ((*p
)->is_auth()) {
617 if (remote_dn
->dir
->can_auth_pin())
619 } else if (!remote_dn
) {
625 dout(20) << __func__
<< ": not reintegrating (no remote parents in cache)" << dendl
;
629 assert(remote_dn
->last
== CEPH_NOSNAP
);
630 // NOTE: we repeat this check in _rename(), since our submission path is racy.
631 if (!remote_dn
->is_projected()) {
632 if (remote_dn
->is_auth()) {
633 if (remote_dn
->dir
->can_auth_pin()) {
634 reintegrate_stray(stray_dn
, remote_dn
);
636 remote_dn
->dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_RetryEvalRemote(this, remote_dn
));
637 dout(20) << __func__
<< ": not reintegrating (can't authpin remote parent)" << dendl
;
640 } else if (!remote_dn
->is_auth() && stray_dn
->is_auth()) {
641 migrate_stray(stray_dn
, remote_dn
->authority().first
);
643 dout(20) << __func__
<< ": not reintegrating" << dendl
;
646 // don't do anything if the remote parent is projected, or we may
647 // break user-visible semantics!
648 dout(20) << __func__
<< ": not reintegrating (projected)" << dendl
;
652 void StrayManager::reintegrate_stray(CDentry
*straydn
, CDentry
*rdn
)
654 dout(10) << __func__
<< " " << *straydn
<< " into " << *rdn
<< dendl
;
656 logger
->inc(l_mdc_strays_reintegrated
);
658 // send a rename request for the stray to the auth mds of the remote dentry.
660 straydn
->make_path(src
);
664 MClientRequest
*req
= new MClientRequest(CEPH_MDS_OP_RENAME
);
665 req
->set_filepath(dst
);
666 req
->set_filepath2(src
);
667 req
->set_tid(mds
->issue_tid());
669 mds
->send_message_mds(req
, rdn
->authority().first
);
672 void StrayManager::migrate_stray(CDentry
*dn
, mds_rank_t to
)
674 CInode
*in
= dn
->get_projected_linkage()->get_inode();
676 CInode
*diri
= dn
->dir
->get_inode();
677 assert(diri
->is_stray());
678 dout(10) << "migrate_stray from mds." << MDS_INO_STRAY_OWNER(diri
->inode
.ino
)
680 << " " << *dn
<< " " << *in
<< dendl
;
682 logger
->inc(l_mdc_strays_migrated
);
684 // rename it to another mds.
687 assert(src
.depth() == 2);
689 filepath
dst(MDS_INO_MDSDIR(to
));
690 dst
.push_dentry(src
[0]);
691 dst
.push_dentry(src
[1]);
693 MClientRequest
*req
= new MClientRequest(CEPH_MDS_OP_RENAME
);
694 req
->set_filepath(dst
);
695 req
->set_filepath2(src
);
696 req
->set_tid(mds
->issue_tid());
698 mds
->send_message_mds(req
, to
);
701 StrayManager::StrayManager(MDSRank
*mds
, PurgeQueue
&purge_queue_
)
702 : delayed_eval_stray(member_offset(CDentry
, item_stray
)),
703 mds(mds
), logger(NULL
), started(false), num_strays(0),
704 num_strays_delayed(0), num_strays_enqueuing(0),
705 purge_queue(purge_queue_
)
710 void StrayManager::truncate(CDentry
*dn
)
712 const CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
713 const CInode
*in
= dnl
->get_inode();
715 dout(10) << __func__
<< ": " << *dn
<< " " << *in
<< dendl
;
716 assert(!dn
->is_replicated());
718 const SnapRealm
*realm
= in
->find_snaprealm();
720 dout(10) << " realm " << *realm
<< dendl
;
721 const SnapContext
*snapc
= &realm
->get_snap_context();
723 uint64_t to
= in
->inode
.get_max_size();
724 to
= MAX(in
->inode
.size
, to
);
725 // when truncating a file, the filer does not delete stripe objects that are
726 // truncated to zero. so we need to purge stripe objects up to the max size
727 // the file has ever been.
728 to
= MAX(in
->inode
.max_size_ever
, to
);
733 item
.ino
= in
->inode
.ino
;
734 item
.layout
= in
->inode
.layout
;
738 purge_queue
.push(item
, new C_IO_PurgeStrayPurged(
742 void StrayManager::_truncate_stray_logged(CDentry
*dn
, LogSegment
*ls
)
744 CInode
*in
= dn
->get_projected_linkage()->get_inode();
746 dout(10) << __func__
<< ": " << *dn
<< " " << *in
<< dendl
;
748 dn
->state_clear(CDentry::STATE_PURGING
| CDentry::STATE_PURGINGPINNED
);
749 dn
->put(CDentry::PIN_PURGING
);
751 in
->pop_and_dirty_projected_inode(ls
);