1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2015 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #include "common/perf_counters.h"
18 #include "mds/MDSRank.h"
19 #include "mds/MDCache.h"
20 #include "mds/MDLog.h"
22 #include "mds/CDentry.h"
23 #include "events/EUpdate.h"
24 #include "messages/MClientRequest.h"
26 #include "StrayManager.h"
28 #define dout_context g_ceph_context
29 #define dout_subsys ceph_subsys_mds
31 #define dout_prefix _prefix(_dout, mds)
32 static ostream
& _prefix(std::ostream
*_dout
, MDSRank
*mds
) {
33 return *_dout
<< "mds." << mds
->get_nodeid() << ".cache.strays ";
36 class StrayManagerIOContext
: public virtual MDSIOContextBase
{
39 MDSRank
*get_mds() override
44 explicit StrayManagerIOContext(StrayManager
*sm_
) : sm(sm_
) {}
47 class StrayManagerLogContext
: public virtual MDSLogContextBase
{
50 MDSRank
*get_mds() override
55 explicit StrayManagerLogContext(StrayManager
*sm_
) : sm(sm_
) {}
58 class StrayManagerContext
: public virtual MDSInternalContextBase
{
61 MDSRank
*get_mds() override
66 explicit StrayManagerContext(StrayManager
*sm_
) : sm(sm_
) {}
71 * Context wrapper for _purge_stray_purged completion
73 class C_IO_PurgeStrayPurged
: public StrayManagerIOContext
{
77 C_IO_PurgeStrayPurged(StrayManager
*sm_
, CDentry
*d
, bool oh
) :
78 StrayManagerIOContext(sm_
), dn(d
), only_head(oh
) { }
79 void finish(int r
) override
{
80 assert(r
== 0 || r
== -ENOENT
);
81 sm
->_purge_stray_purged(dn
, only_head
);
86 void StrayManager::purge(CDentry
*dn
)
88 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
89 CInode
*in
= dnl
->get_inode();
90 dout(10) << __func__
<< " " << *dn
<< " " << *in
<< dendl
;
91 assert(!dn
->is_replicated());
93 // CHEAT. there's no real need to journal our intent to purge, since
94 // that is implicit in the dentry's presence and non-use in the stray
95 // dir. on recovery, we'll need to re-eval all strays anyway.
97 SnapContext nullsnapc
;
100 item
.ino
= in
->inode
.ino
;
102 item
.action
= PurgeItem::PURGE_DIR
;
103 item
.fragtree
= in
->dirfragtree
;
105 item
.action
= PurgeItem::PURGE_FILE
;
107 const SnapContext
*snapc
;
108 SnapRealm
*realm
= in
->find_snaprealm();
110 dout(10) << " realm " << *realm
<< dendl
;
111 snapc
= &realm
->get_snap_context();
113 dout(10) << " NO realm, using null context" << dendl
;
115 assert(in
->last
== CEPH_NOSNAP
);
120 to
= in
->inode
.get_max_size();
121 to
= MAX(in
->inode
.size
, to
);
122 // when truncating a file, the filer does not delete stripe objects that are
123 // truncated to zero. so we need to purge stripe objects up to the max size
124 // the file has ever been.
125 to
= MAX(in
->inode
.max_size_ever
, to
);
128 inode_t
*pi
= in
->get_projected_inode();
131 item
.layout
= pi
->layout
;
132 item
.old_pools
= pi
->old_pools
;
136 purge_queue
.push(item
, new C_IO_PurgeStrayPurged(
140 class C_PurgeStrayLogged
: public StrayManagerLogContext
{
145 C_PurgeStrayLogged(StrayManager
*sm_
, CDentry
*d
, version_t v
, LogSegment
*s
) :
146 StrayManagerLogContext(sm_
), dn(d
), pdv(v
), ls(s
) { }
147 void finish(int r
) override
{
148 sm
->_purge_stray_logged(dn
, pdv
, ls
);
152 class C_TruncateStrayLogged
: public StrayManagerLogContext
{
156 C_TruncateStrayLogged(StrayManager
*sm
, CDentry
*d
, LogSegment
*s
) :
157 StrayManagerLogContext(sm
), dn(d
), ls(s
) { }
158 void finish(int r
) override
{
159 sm
->_truncate_stray_logged(dn
, ls
);
163 void StrayManager::_purge_stray_purged(
164 CDentry
*dn
, bool only_head
)
166 CInode
*in
= dn
->get_projected_linkage()->get_inode();
167 dout(10) << "_purge_stray_purged " << *dn
<< " " << *in
<< dendl
;
169 logger
->inc(l_mdc_strays_enqueued
);
170 num_strays_enqueuing
--;
171 logger
->set(l_mdc_num_strays_enqueuing
, num_strays_enqueuing
);
174 /* This was a ::truncate */
175 EUpdate
*le
= new EUpdate(mds
->mdlog
, "purge_stray truncate");
176 mds
->mdlog
->start_entry(le
);
178 inode_t
*pi
= in
->project_inode();
180 pi
->max_size_ever
= 0;
181 pi
->client_ranges
.clear();
182 pi
->truncate_size
= 0;
183 pi
->truncate_from
= 0;
184 pi
->version
= in
->pre_dirty();
186 le
->metablob
.add_dir_context(dn
->dir
);
187 le
->metablob
.add_primary_dentry(dn
, in
, true);
189 mds
->mdlog
->submit_entry(le
,
190 new C_TruncateStrayLogged(
191 this, dn
, mds
->mdlog
->get_current_segment()));
193 if (in
->get_num_ref() != (int)in
->is_dirty() ||
194 dn
->get_num_ref() != (int)dn
->is_dirty() + !!in
->get_num_ref() + 1/*PIN_PURGING*/) {
195 // Nobody should be taking new references to an inode when it
196 // is being purged; an unexpected reference here is treated as a bug.
198 derr
<< "Rogue reference after purge to " << *dn
<< dendl
;
199 assert(0 == "rogue reference to purging inode");
203 version_t pdv
= dn
->pre_dirty();
204 dn
->push_projected_linkage(); // NULL
206 EUpdate
*le
= new EUpdate(mds
->mdlog
, "purge_stray");
207 mds
->mdlog
->start_entry(le
);
209 // update dirfrag fragstat, rstat
210 CDir
*dir
= dn
->get_dir();
211 fnode_t
*pf
= dir
->project_fnode();
212 pf
->version
= dir
->pre_dirty();
214 pf
->fragstat
.nsubdirs
--;
216 pf
->fragstat
.nfiles
--;
217 pf
->rstat
.sub(in
->inode
.accounted_rstat
);
219 le
->metablob
.add_dir_context(dn
->dir
);
220 EMetaBlob::dirlump
& dl
= le
->metablob
.add_dir(dn
->dir
, true);
221 le
->metablob
.add_null_dentry(dl
, dn
, true);
222 le
->metablob
.add_destroyed_inode(in
->ino());
224 mds
->mdlog
->submit_entry(le
, new C_PurgeStrayLogged(this, dn
, pdv
,
225 mds
->mdlog
->get_current_segment()));
227 logger
->set(l_mdc_num_strays
, num_strays
);
231 void StrayManager::_purge_stray_logged(CDentry
*dn
, version_t pdv
, LogSegment
*ls
)
233 CInode
*in
= dn
->get_linkage()->get_inode();
234 dout(10) << "_purge_stray_logged " << *dn
<< " " << *in
<< dendl
;
236 assert(!in
->state_test(CInode::STATE_RECOVERING
));
238 bool new_dn
= dn
->is_new();
241 assert(dn
->get_projected_linkage()->is_null());
242 dn
->dir
->unlink_inode(dn
, !new_dn
);
243 dn
->pop_projected_linkage();
244 dn
->mark_dirty(pdv
, ls
);
246 dn
->dir
->pop_and_dirty_projected_fnode(ls
);
248 in
->state_clear(CInode::STATE_ORPHAN
);
249 dn
->state_clear(CDentry::STATE_PURGING
| CDentry::STATE_PURGINGPINNED
);
250 dn
->put(CDentry::PIN_PURGING
);
254 dout(20) << " dn is new, removing" << dendl
;
256 dn
->dir
->remove_dentry(dn
);
262 in
->mdcache
->remove_inode(in
);
265 void StrayManager::enqueue(CDentry
*dn
, bool trunc
)
267 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
269 CInode
*in
= dnl
->get_inode();
272 /* We consider a stray to be purging as soon as it is enqueued, to avoid
273 * enqueuing it twice */
274 dn
->state_set(CDentry::STATE_PURGING
);
275 in
->state_set(CInode::STATE_PURGING
);
277 /* We must clear this as soon as enqueuing it, to prevent the journal
278 * expiry code from seeing a dirty parent and trying to write a backtrace */
280 if (in
->is_dirty_parent()) {
281 in
->clear_dirty_parent();
285 dout(20) << __func__
<< ": purging dn: " << *dn
<< dendl
;
287 if (!dn
->state_test(CDentry::STATE_PURGINGPINNED
)) {
288 dn
->get(CDentry::PIN_PURGING
);
289 dn
->state_set(CDentry::STATE_PURGINGPINNED
);
292 ++num_strays_enqueuing
;
293 logger
->set(l_mdc_num_strays_enqueuing
, num_strays_enqueuing
);
295 // Resources are available, acquire them and execute the purge
298 dout(10) << __func__
<< ": purging this dentry immediately: "
302 class C_OpenSnapParents
: public StrayManagerContext
{
306 C_OpenSnapParents(StrayManager
*sm_
, CDentry
*dn_
, bool t
) :
307 StrayManagerContext(sm_
), dn(dn_
), trunc(t
) { }
308 void finish(int r
) override
{
309 sm
->_enqueue(dn
, trunc
);
313 void StrayManager::_enqueue(CDentry
*dn
, bool trunc
)
317 CInode
*in
= dn
->get_linkage()->get_inode();
319 !in
->snaprealm
->have_past_parents_open() &&
320 !in
->snaprealm
->open_parents(new C_OpenSnapParents(this, dn
, trunc
))) {
321 // this can happen if the dentry had been trimmed from cache.
333 void StrayManager::advance_delayed()
338 for (elist
<CDentry
*>::iterator p
= delayed_eval_stray
.begin(); !p
.end(); ) {
341 dn
->item_stray
.remove_myself();
342 num_strays_delayed
--;
344 if (dn
->get_projected_linkage()->is_null()) {
345 /* A special case: a stray dentry can go null if its inode is being
346 * re-linked into another MDS's stray dir during a shutdown migration. */
347 dout(4) << __func__
<< ": delayed dentry is now null: " << *dn
<< dendl
;
351 const bool purging
= eval_stray(dn
);
353 derr
<< "Dentry " << *dn
<< " was purgeable but no longer is!" << dendl
;
355 * This can happen if a stray is purgeable, but has gained an extra
356 * reference by virtue of having its backtrace updated.
357 * FIXME perhaps we could simplify this further by
358 * avoiding writing the backtrace of purge-ready strays, so
359 * that this code could be more rigid?
363 logger
->set(l_mdc_num_strays_delayed
, num_strays_delayed
);
366 void StrayManager::set_num_strays(uint64_t num
)
370 logger
->set(l_mdc_num_strays
, num_strays
);
373 void StrayManager::notify_stray_created()
376 logger
->set(l_mdc_num_strays
, num_strays
);
377 logger
->inc(l_mdc_strays_created
);
380 void StrayManager::notify_stray_removed()
383 logger
->set(l_mdc_num_strays
, num_strays
);
386 struct C_EvalStray
: public StrayManagerContext
{
388 C_EvalStray(StrayManager
*sm_
, CDentry
*d
) : StrayManagerContext(sm_
), dn(d
) {}
389 void finish(int r
) override
{
394 struct C_MDC_EvalStray
: public StrayManagerContext
{
396 C_MDC_EvalStray(StrayManager
*sm_
, CDentry
*d
) : StrayManagerContext(sm_
), dn(d
) {}
397 void finish(int r
) override
{
402 bool StrayManager::_eval_stray(CDentry
*dn
, bool delay
)
404 dout(10) << "eval_stray " << *dn
<< dendl
;
405 CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
406 assert(dnl
->is_primary());
407 dout(10) << " inode is " << *dnl
->get_inode() << dendl
;
408 CInode
*in
= dnl
->get_inode();
410 assert(!in
->state_test(CInode::STATE_REJOINUNDEF
));
412 // The only dentries eligible for purging are those
413 // in the stray directories
414 assert(dn
->get_dir()->get_inode()->is_stray());
416 // Inode may not pass through this function if it
417 // was already identified for purging (i.e. cannot
418 // call eval_stray() after purge()).
419 assert(!dn
->state_test(CDentry::STATE_PURGING
));
421 if (!dn
->is_auth()) {
428 if (dn
->item_stray
.is_on_list()) {
432 dn
->item_stray
.remove_myself();
433 num_strays_delayed
--;
434 logger
->set(l_mdc_num_strays_delayed
, num_strays_delayed
);
438 if (in
->inode
.nlink
== 0) {
439 // past snaprealm parents imply snapped dentry remote links.
440 // only important for directories. normal file data snaps are handled
441 // by the object store.
443 if (!in
->snaprealm
->have_past_parents_open() &&
444 !in
->snaprealm
->open_parents(new C_MDC_EvalStray(this, dn
))) {
447 in
->snaprealm
->prune_past_parents();
448 in
->purge_stale_snap_data(in
->snaprealm
->get_snaps());
451 if (in
->snaprealm
&& in
->snaprealm
->has_past_parents()) {
452 dout(20) << " directory has past parents "
453 << in
->snaprealm
->srnode
.past_parents
<< dendl
;
454 if (in
->state_test(CInode::STATE_MISSINGOBJS
)) {
455 mds
->clog
->error() << "previous attempt at committing dirfrag of ino "
456 << in
->ino() << " has failed, missing object";
457 mds
->handle_write_error(-ENOENT
);
459 return false; // not until some snaps are deleted.
462 in
->mdcache
->clear_dirty_bits_for_stray(in
);
464 if (!in
->remote_parents
.empty()) {
465 // unlink any stale remote snap dentry.
466 for (compact_set
<CDentry
*>::iterator p
= in
->remote_parents
.begin();
467 p
!= in
->remote_parents
.end(); ) {
468 CDentry
*remote_dn
= *p
;
470 assert(remote_dn
->last
!= CEPH_NOSNAP
);
471 remote_dn
->unlink_remote(remote_dn
->get_linkage());
475 if (dn
->is_replicated()) {
476 dout(20) << " replicated" << dendl
;
479 if (dn
->is_any_leases() || in
->is_any_caps()) {
480 dout(20) << " caps | leases" << dendl
;
481 return false; // wait
483 if (in
->state_test(CInode::STATE_NEEDSRECOVER
) ||
484 in
->state_test(CInode::STATE_RECOVERING
)) {
485 dout(20) << " pending recovery" << dendl
;
486 return false; // don't mess with file size probing
488 if (in
->get_num_ref() > (int)in
->is_dirty() + (int)in
->is_dirty_parent()) {
489 dout(20) << " too many inode refs" << dendl
;
492 if (dn
->get_num_ref() > (int)dn
->is_dirty() + !!in
->get_num_ref()) {
493 dout(20) << " too many dn refs" << dendl
;
497 if (!dn
->item_stray
.is_on_list()) {
498 delayed_eval_stray
.push_back(&dn
->item_stray
);
499 num_strays_delayed
++;
500 logger
->set(l_mdc_num_strays_delayed
, num_strays_delayed
);
502 // don't purge multiversion inode with snap data
503 } else if (in
->snaprealm
&& in
->snaprealm
->has_past_parents() &&
504 !in
->old_inodes
.empty()) {
505 // A file with snapshots: we will truncate the HEAD revision
506 // but leave the metadata intact.
507 assert(!in
->is_dir());
508 dout(20) << " file has past parents "
509 << in
->snaprealm
->srnode
.past_parents
<< dendl
;
510 if (in
->is_file() && in
->get_projected_inode()->size
> 0) {
511 enqueue(dn
, true); // truncate head objects
514 // A straightforward file, ready to be purged. Enqueue it.
516 in
->close_dirfrags();
525 * Where a stray has some links, they should be remotes, check
526 * if we can do anything with them if we happen to have them in
529 _eval_stray_remote(dn
, NULL
);
534 void StrayManager::activate()
536 dout(10) << __func__
<< dendl
;
538 purge_queue
.activate();
541 bool StrayManager::eval_stray(CDentry
*dn
, bool delay
)
543 // avoid nested eval_stray
544 if (dn
->state_test(CDentry::STATE_EVALUATINGSTRAY
))
547 dn
->state_set(CDentry::STATE_EVALUATINGSTRAY
);
548 bool ret
= _eval_stray(dn
, delay
);
549 dn
->state_clear(CDentry::STATE_EVALUATINGSTRAY
);
553 void StrayManager::eval_remote(CDentry
*remote_dn
)
555 dout(10) << __func__
<< " " << *remote_dn
<< dendl
;
557 CDentry::linkage_t
*dnl
= remote_dn
->get_projected_linkage();
558 assert(dnl
->is_remote());
559 CInode
*in
= dnl
->get_inode();
562 dout(20) << __func__
<< ": no inode, cannot evaluate" << dendl
;
566 if (remote_dn
->last
!= CEPH_NOSNAP
) {
567 dout(20) << __func__
<< ": snap dentry, cannot evaluate" << dendl
;
572 CDentry
*primary_dn
= in
->get_projected_parent_dn();
573 assert(primary_dn
!= NULL
);
574 if (primary_dn
->get_dir()->get_inode()->is_stray()) {
575 _eval_stray_remote(primary_dn
, remote_dn
);
577 dout(20) << __func__
<< ": inode's primary dn not stray" << dendl
;
581 class C_RetryEvalRemote
: public StrayManagerContext
{
584 C_RetryEvalRemote(StrayManager
*sm_
, CDentry
*dn_
) :
585 StrayManagerContext(sm_
), dn(dn_
) {
586 dn
->get(CDentry::PIN_PTRWAITER
);
588 void finish(int r
) override
{
589 if (dn
->get_projected_linkage()->is_remote())
591 dn
->put(CDentry::PIN_PTRWAITER
);
595 void StrayManager::_eval_stray_remote(CDentry
*stray_dn
, CDentry
*remote_dn
)
597 dout(20) << __func__
<< " " << *stray_dn
<< dendl
;
598 assert(stray_dn
!= NULL
);
599 assert(stray_dn
->get_dir()->get_inode()->is_stray());
600 CDentry::linkage_t
*stray_dnl
= stray_dn
->get_projected_linkage();
601 assert(stray_dnl
->is_primary());
602 CInode
*stray_in
= stray_dnl
->get_inode();
603 assert(stray_in
->inode
.nlink
>= 1);
604 assert(stray_in
->last
== CEPH_NOSNAP
);
606 /* If no remote_dn hinted, pick one arbitrarily */
607 if (remote_dn
== NULL
) {
608 if (!stray_in
->remote_parents
.empty()) {
609 for (compact_set
<CDentry
*>::iterator p
= stray_in
->remote_parents
.begin();
610 p
!= stray_in
->remote_parents
.end();
612 if ((*p
)->last
== CEPH_NOSNAP
&& !(*p
)->is_projected()) {
613 if ((*p
)->is_auth()) {
615 if (remote_dn
->dir
->can_auth_pin())
617 } else if (!remote_dn
) {
623 dout(20) << __func__
<< ": not reintegrating (no remote parents in cache)" << dendl
;
627 assert(remote_dn
->last
== CEPH_NOSNAP
);
628 // NOTE: we repeat this check in _rename(), since our submission path is racy.
629 if (!remote_dn
->is_projected()) {
630 if (remote_dn
->is_auth()) {
631 if (remote_dn
->dir
->can_auth_pin()) {
632 reintegrate_stray(stray_dn
, remote_dn
);
634 remote_dn
->dir
->add_waiter(CDir::WAIT_UNFREEZE
, new C_RetryEvalRemote(this, remote_dn
));
635 dout(20) << __func__
<< ": not reintegrating (can't authpin remote parent)" << dendl
;
638 } else if (!remote_dn
->is_auth() && stray_dn
->is_auth()) {
639 migrate_stray(stray_dn
, remote_dn
->authority().first
);
641 dout(20) << __func__
<< ": not reintegrating" << dendl
;
644 // don't do anything if the remote parent is projected, or we may
645 // break user-visible semantics!
646 dout(20) << __func__
<< ": not reintegrating (projected)" << dendl
;
650 void StrayManager::reintegrate_stray(CDentry
*straydn
, CDentry
*rdn
)
652 dout(10) << __func__
<< " " << *straydn
<< " into " << *rdn
<< dendl
;
654 logger
->inc(l_mdc_strays_reintegrated
);
656 // rename it to another mds.
658 straydn
->make_path(src
);
662 MClientRequest
*req
= new MClientRequest(CEPH_MDS_OP_RENAME
);
663 req
->set_filepath(dst
);
664 req
->set_filepath2(src
);
665 req
->set_tid(mds
->issue_tid());
667 mds
->send_message_mds(req
, rdn
->authority().first
);
670 void StrayManager::migrate_stray(CDentry
*dn
, mds_rank_t to
)
672 CInode
*in
= dn
->get_projected_linkage()->get_inode();
674 CInode
*diri
= dn
->dir
->get_inode();
675 assert(diri
->is_stray());
676 dout(10) << "migrate_stray from mds." << MDS_INO_STRAY_OWNER(diri
->inode
.ino
)
678 << " " << *dn
<< " " << *in
<< dendl
;
680 logger
->inc(l_mdc_strays_migrated
);
682 // rename it to another mds.
685 assert(src
.depth() == 2);
687 filepath
dst(MDS_INO_MDSDIR(to
));
688 dst
.push_dentry(src
[0]);
689 dst
.push_dentry(src
[1]);
691 MClientRequest
*req
= new MClientRequest(CEPH_MDS_OP_RENAME
);
692 req
->set_filepath(dst
);
693 req
->set_filepath2(src
);
694 req
->set_tid(mds
->issue_tid());
696 mds
->send_message_mds(req
, to
);
699 StrayManager::StrayManager(MDSRank
*mds
, PurgeQueue
&purge_queue_
)
700 : delayed_eval_stray(member_offset(CDentry
, item_stray
)),
701 mds(mds
), logger(NULL
), started(false), num_strays(0),
702 num_strays_delayed(0), num_strays_enqueuing(0),
703 purge_queue(purge_queue_
)
708 void StrayManager::truncate(CDentry
*dn
)
710 const CDentry::linkage_t
*dnl
= dn
->get_projected_linkage();
711 const CInode
*in
= dnl
->get_inode();
713 dout(10) << __func__
<< ": " << *dn
<< " " << *in
<< dendl
;
714 assert(!dn
->is_replicated());
716 const SnapRealm
*realm
= in
->find_snaprealm();
718 dout(10) << " realm " << *realm
<< dendl
;
719 const SnapContext
*snapc
= &realm
->get_snap_context();
721 uint64_t to
= in
->inode
.get_max_size();
722 to
= MAX(in
->inode
.size
, to
);
723 // when truncating a file, the filer does not delete stripe objects that are
724 // truncated to zero. so we need to purge stripe objects up to the max size
725 // the file has ever been.
726 to
= MAX(in
->inode
.max_size_ever
, to
);
731 item
.ino
= in
->inode
.ino
;
732 item
.layout
= in
->inode
.layout
;
736 purge_queue
.push(item
, new C_IO_PurgeStrayPurged(
740 void StrayManager::_truncate_stray_logged(CDentry
*dn
, LogSegment
*ls
)
742 CInode
*in
= dn
->get_projected_linkage()->get_inode();
744 dout(10) << __func__
<< ": " << *dn
<< " " << *in
<< dendl
;
746 dn
->state_clear(CDentry::STATE_PURGING
| CDentry::STATE_PURGINGPINNED
);
747 dn
->put(CDentry::PIN_PURGING
);
749 in
->pop_and_dirty_projected_inode(ls
);