1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
25 #include "events/EUpdate.h"
26 #include "events/EPeerUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
29 #include "events/EPurged.h"
31 #include "events/EExport.h"
32 #include "events/EImportStart.h"
33 #include "events/EImportFinish.h"
34 #include "events/EFragment.h"
36 #include "events/ETableClient.h"
37 #include "events/ETableServer.h"
39 #include "include/stringify.h"
41 #include "LogSegment.h"
51 #include "MDSTableClient.h"
52 #include "MDSTableServer.h"
56 #define dout_context g_ceph_context
57 #define dout_subsys ceph_subsys_mds
59 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
69 // -----------------------
// MDSIOContext that is installed by BatchCommitBacktrace as the finisher of
// its gather. The visible part of finish() forwards the result code to
// _stored_backtrace() on every inode recorded in ops_vec.
// NOTE(review): the declaration of `fin` and the tail of finish() are not
// visible in this listing; `fin` is initialised from the constructor
// argument `f`.
72 struct BatchStoredBacktrace
: public MDSIOContext
{
// Per-inode operations; each entry supplies `in` and `version` to finish().
74 std::vector
<CInodeCommitOperations
> ops_vec
;
// m is passed to the MDSIOContext base, f is kept as fin, ops is moved in.
76 BatchStoredBacktrace(MDSRank
*m
, MDSContext
*f
,
77 std::vector
<CInodeCommitOperations
>&& ops
) :
78 MDSIOContext(m
), fin(f
), ops_vec(std::move(ops
)) {}
// r is the result code handed on to every inode's _stored_backtrace().
79 void finish(int r
) override
{
80 for (auto& op
: ops_vec
) {
81 op
.in
->_stored_backtrace(r
, op
.version
, nullptr);
// Short description of this context written to the given stream.
85 void print(ostream
& out
) const override
{
86 out
<< "batch backtrace_store";
// Context that issues the commit operations for a batch of inodes.
// finish() calls _commit_ops() on every entry using one shared
// C_GatherBuilder, asserts that at least one sub-operation was created, and
// installs a BatchStoredBacktrace (wrapped in C_OnFinisher) as the gather's
// finisher, moving ops_vec into it.
// NOTE(review): the `mds` and `fin` member declarations and the end of
// finish() are not visible in this listing.
90 struct BatchCommitBacktrace
: public Context
{
// Per-inode operations; each entry supplies `in`, `ops_vec` and `bt`.
93 std::vector
<CInodeCommitOperations
> ops_vec
;
// m and f are stored as mds and fin; ops is moved into ops_vec.
95 BatchCommitBacktrace(MDSRank
*m
, MDSContext
*f
,
96 std::vector
<CInodeCommitOperations
>&& ops
) :
97 mds(m
), fin(f
), ops_vec(std::move(ops
)) {}
// r is forwarded unchanged to every _commit_ops() call.
98 void finish(int r
) override
{
99 C_GatherBuilder
gather(g_ceph_context
);
101 for (auto &op
: ops_vec
) {
102 op
.in
->_commit_ops(r
, gather
, op
.ops_vec
, op
.bt
);
// The batch is only queued when it is non-empty, so subs must exist here.
106 ceph_assert(gather
.has_subs());
// ops_vec is moved out: this object must not use it afterwards.
107 gather
.set_finisher(new C_OnFinisher(
108 new BatchStoredBacktrace(mds
, fin
, std::move(ops_vec
)),
// Register everything that still has to complete before this log segment
// can be expired. Each outstanding piece of work gets a sub-context from
// gather_bld.new_sub(); nothing is waited on inside this function itself.
//
//   mds        - rank whose mdcache, locker, mdlog, finisher, inotable,
//                sessionmap and table clients/servers are used below
//   gather_bld - collects the sub-contexts for all outstanding work
//   op_prio    - passed through to CDir::commit() and
//                CInode::store_backtrace()
//
// NOTE(review): several loop headers, else branches and closing braces are
// not visible in this listing, so the comments below describe only the
// statements that can be seen.
114 void LogSegment::try_to_expire(MDSRank
*mds
, MDSGatherBuilder
&gather_bld
, int op_prio
)
118 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire" << dendl
;
// Numbered checkpoints (1..5): each assert fails when the config value
// mds_kill_journal_expire_at equals that number.
120 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 1);
// Log and sanity-check (must be auth) every new / dirty item of the segment.
123 for (elist
<CDir
*>::iterator p
= new_dirfrags
.begin(); !p
.end(); ++p
) {
124 dout(20) << " new_dirfrag " << **p
<< dendl
;
125 ceph_assert((*p
)->is_auth());
128 for (elist
<CDir
*>::iterator p
= dirty_dirfrags
.begin(); !p
.end(); ++p
) {
129 dout(20) << " dirty_dirfrag " << **p
<< dendl
;
130 ceph_assert((*p
)->is_auth());
// Dirty dentries are committed through their containing dirfrag.
133 for (elist
<CDentry
*>::iterator p
= dirty_dentries
.begin(); !p
.end(); ++p
) {
134 dout(20) << " dirty_dentry " << **p
<< dendl
;
135 ceph_assert((*p
)->is_auth());
136 commit
.insert((*p
)->get_dir());
// Base inodes are stored directly; other dirty inodes are committed via the
// dirfrag of their parent dentry.
138 for (elist
<CInode
*>::iterator p
= dirty_inodes
.begin(); !p
.end(); ++p
) {
139 dout(20) << " dirty_inode " << **p
<< dendl
;
140 ceph_assert((*p
)->is_auth());
141 if ((*p
)->is_base()) {
142 (*p
)->store(gather_bld
.new_sub());
144 commit
.insert((*p
)->get_parent_dn()->get_dir());
// Commit each collected dirfrag when it can be auth-pinned; the second pair
// of statements registers a WAIT_UNFREEZE waiter instead.
147 if (!commit
.empty()) {
148 for (set
<CDir
*>::iterator p
= commit
.begin();
152 ceph_assert(dir
->is_auth());
153 if (dir
->can_auth_pin()) {
154 dout(15) << "try_to_expire committing " << *dir
<< dendl
;
155 dir
->commit(0, gather_bld
.new_sub(), false, op_prio
);
157 dout(15) << "try_to_expire waiting for unfreeze on " << *dir
<< dendl
;
158 dir
->add_waiter(CDir::WAIT_UNFREEZE
, gather_bld
.new_sub());
163 // leader ops with possibly uncommitted peers
164 for (set
<metareqid_t
>::iterator p
= uncommitted_leaders
.begin();
165 p
!= uncommitted_leaders
.end();
167 dout(10) << "try_to_expire waiting for peers to ack commit on " << *p
<< dendl
;
168 mds
->mdcache
->wait_for_uncommitted_leader(*p
, gather_bld
.new_sub());
171 // peer ops that haven't been committed
172 for (set
<metareqid_t
>::iterator p
= uncommitted_peers
.begin();
173 p
!= uncommitted_peers
.end();
175 dout(10) << "try_to_expire waiting for leader to ack OP_FINISH on " << *p
<< dendl
;
176 mds
->mdcache
->wait_for_uncommitted_peer(*p
, gather_bld
.new_sub());
179 // uncommitted fragments
180 for (set
<dirfrag_t
>::iterator p
= uncommitted_fragments
.begin();
181 p
!= uncommitted_fragments
.end();
183 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p
<< dendl
;
184 mds
->mdcache
->wait_for_uncommitted_fragment(*p
, gather_bld
.new_sub());
187 // nudge scatterlocks
// One loop per lock type: filelock, dirfragtreelock, nestlock.
188 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dir
.begin(); !p
.end(); ++p
) {
190 dout(10) << "try_to_expire waiting for dirlock flush on " << *in
<< dendl
;
191 mds
->locker
->scatter_nudge(&in
->filelock
, gather_bld
.new_sub());
193 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dirfragtree
.begin(); !p
.end(); ++p
) {
195 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in
<< dendl
;
196 mds
->locker
->scatter_nudge(&in
->dirfragtreelock
, gather_bld
.new_sub());
198 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_nest
.begin(); !p
.end(); ++p
) {
200 dout(10) << "try_to_expire waiting for nest flush on " << *in
<< dendl
;
201 mds
->locker
->scatter_nudge(&in
->nestlock
, gather_bld
.new_sub());
204 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 2);
206 // open files and snap inodes
207 if (!open_files
.empty()) {
208 ceph_assert(!mds
->mdlog
->is_capped()); // hmm FIXME
// Inodes that must stay journaled are moved to the current segment `ls`,
// which must differ from this one.
210 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
211 ceph_assert(ls
!= this);
212 elist
<CInode
*>::iterator p
= open_files
.begin(member_offset(CInode
, item_open_file
));
216 if (in
->last
!= CEPH_NOSNAP
&& in
->is_auth() && !in
->client_snap_caps
.empty()) {
217 // journal snap inodes that need flush. This simplifies the MDS failover handling
218 dout(20) << "try_to_expire requeueing snap needflush inode " << *in
<< dendl
;
220 le
= new EOpen(mds
->mdlog
);
221 mds
->mdlog
->start_entry(le
);
223 le
->add_clean_inode(in
);
224 ls
->open_files
.push_back(&in
->item_open_file
);
226 // open files are tracked by open file table, no need to journal them again
227 in
->item_open_file
.remove_myself();
231 mds
->mdlog
->submit_entry(le
);
232 mds
->mdlog
->wait_for_safe(gather_bld
.new_sub());
233 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl
;
237 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 3);
// First pass over dirty_parent_inodes; its body is not visible here --
// presumably it computes `count`, which sizes ops_vec below (TODO confirm).
240 for (elist
<CInode
*>::iterator it
= dirty_parent_inodes
.begin(); !it
.end(); ++it
)
243 std::vector
<CInodeCommitOperations
> ops_vec
;
244 ops_vec
.reserve(count
);
245 // backtraces to be stored/updated
246 for (elist
<CInode
*>::iterator p
= dirty_parent_inodes
.begin(); !p
.end(); ++p
) {
248 ceph_assert(in
->is_auth());
249 if (in
->can_auth_pin()) {
250 dout(15) << "try_to_expire waiting for storing backtrace on " << *in
<< dendl
;
// Append an empty slot and let store_backtrace() fill it in.
251 ops_vec
.resize(ops_vec
.size() + 1);
252 in
->store_backtrace(ops_vec
.back(), op_prio
);
254 dout(15) << "try_to_expire waiting for unfreeze on " << *in
<< dendl
;
255 in
->add_waiter(CInode::WAIT_UNFREEZE
, gather_bld
.new_sub());
// All collected backtrace operations are queued on the finisher as one batch.
258 if (!ops_vec
.empty())
259 mds
->finisher
->queue(new BatchCommitBacktrace(mds
, gather_bld
.new_sub(), std::move(ops_vec
)));
261 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 4);
// Save the inotable when this segment needs a newer version than the
// committed one.
264 if (inotablev
> mds
->inotable
->get_committed_version()) {
265 dout(10) << "try_to_expire saving inotable table, need " << inotablev
266 << ", committed is " << mds
->inotable
->get_committed_version()
267 << " (" << mds
->inotable
->get_committing_version() << ")"
269 mds
->inotable
->save(gather_bld
.new_sub(), inotablev
);
// Same check for the sessionmap.
273 if (sessionmapv
> mds
->sessionmap
.get_committed()) {
274 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
275 << ", committed is " << mds
->sessionmap
.get_committed()
276 << " (" << mds
->sessionmap
.get_committing() << ")"
278 mds
->sessionmap
.save(gather_bld
.new_sub(), sessionmapv
);
281 // updates to sessions for completed_requests
282 mds
->sessionmap
.save_if_dirty(touched_sessions
, &gather_bld
);
283 touched_sessions
.clear();
285 // pending commit atids
286 for (map
<int, ceph::unordered_set
<version_t
> >::iterator p
= pending_commit_tids
.begin();
287 p
!= pending_commit_tids
.end();
289 MDSTableClient
*client
= mds
->get_table_client(p
->first
);
291 for (ceph::unordered_set
<version_t
>::iterator q
= p
->second
.begin();
292 q
!= p
->second
.end();
294 dout(10) << "try_to_expire " << get_mdstable_name(p
->first
) << " transaction " << *q
295 << " pending commit (not yet acked), waiting" << dendl
;
296 ceph_assert(!client
->has_committed(*q
));
297 client
->wait_for_ack(*q
, gather_bld
.new_sub());
// Table servers: save any table whose committed version is older than the
// version recorded in tablev.
302 for (map
<int, version_t
>::iterator p
= tablev
.begin();
305 MDSTableServer
*server
= mds
->get_table_server(p
->first
);
307 if (p
->second
> server
->get_committed_version()) {
308 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p
->first
)
309 << " to save, need " << p
->second
<< dendl
;
310 server
->save(gather_bld
.new_sub());
// Wait for truncates that are still in progress.
315 for (set
<CInode
*>::iterator p
= truncating_inodes
.begin();
316 p
!= truncating_inodes
.end();
318 dout(10) << "try_to_expire waiting for truncate of " << **p
<< dendl
;
319 (*p
)->add_waiter(CInode::WAIT_TRUNC
, gather_bld
.new_sub());
322 dout(10) << "try_to_expire waiting for purge of " << purging_inodes
<< dendl
;
323 if (purging_inodes
.size())
324 set_purged_cb(gather_bld
.new_sub());
// Final report: "waiting" when any sub-context was registered.
// NOTE(review): the branch structure around the "success" message is not
// visible here.
326 if (gather_bld
.has_subs()) {
327 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire waiting" << dendl
;
330 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 5);
331 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire success" << dendl
;
335 // -----------------------
// Add the ancestor dentries that give `dir` its context in this blob.
// Starting at `dir`, the code follows each directory inode's projected
// parent dentry upwards (dir = parent->get_dir()), collecting dentries in
// `parents` (definitely needed) or `maybe` (possibly redundant). The final
// list is added in order with add_dentry(dentry, false).
//
//   dir  - directory fragment whose ancestry is recorded
//   mode - compared against TO_AUTH_SUBTREE_ROOT to decide whether the walk
//          may stop at a subtree root
//
// NOTE(review): the loop header, the returns/breaks and several else
// branches are not visible in this listing.
338 void EMetaBlob::add_dir_context(CDir
*dir
, int mode
)
340 MDSRank
*mds
= dir
->mdcache
->mds
;
// Dentries that will be added, outermost ancestor first (push_front below).
342 list
<CDentry
*> parents
;
344 // it may be okay not to include the maybe items, if
345 // - we journaled the maybe child inode in this segment
346 // - that subtree turns out to be unambiguously auth
347 list
<CDentry
*> maybe
;
// Set once an inode journaled since the last subtree map has been seen.
348 bool maybenot
= false;
351 // already have this dir? (we must always add in order)
352 if (lump_map
.count(dir
->dirfrag())) {
353 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") have lump " << dir
->dirfrag() << dendl
;
357 // stop at root/stray
358 CInode
*diri
= dir
->get_inode();
359 CDentry
*parent
= diri
->get_projected_parent_dn();
361 if (mode
== TO_AUTH_SUBTREE_ROOT
) {
363 if (dir
->is_subtree_root()) {
364 // match logic in MDCache::create_subtree_map()
365 if (dir
->get_dir_auth().first
== mds
->get_nodeid()) {
// A missing parent dentry is treated as CDIR_AUTH_UNDEF.
366 mds_authority_t parent_auth
= parent
? parent
->authority() : CDIR_AUTH_UNDEF
;
367 if (parent_auth
.first
== dir
->get_dir_auth().first
) {
368 if (parent_auth
.second
== CDIR_AUTH_UNKNOWN
&&
369 !dir
->is_ambiguous_dir_auth() &&
370 !dir
->state_test(CDir::STATE_EXPORTBOUND
) &&
371 !dir
->state_test(CDir::STATE_AUXSUBTREE
) &&
372 !diri
->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
373 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir
<< dendl
;
376 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") ambiguous or transient subtree " << dendl
;
378 // it's an auth subtree, we don't need maybe (if any), and we're done.
379 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached unambig auth subtree, don't need " << maybe
380 << " at " << *dir
<< dendl
;
385 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached ambig or !auth subtree, need " << maybe
386 << " at " << *dir
<< dendl
;
387 // we need the maybe list after all!
388 parents
.splice(parents
.begin(), maybe
);
393 // was the inode journaled in this blob?
394 if (event_seq
&& diri
->last_journaled
== event_seq
) {
395 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri this blob " << *diri
<< dendl
;
399 // have we journaled this inode since the last subtree map?
400 if (!maybenot
&& last_subtree_map
&& diri
->last_journaled
>= last_subtree_map
) {
401 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri in this segment ("
402 << diri
->last_journaled
<< " >= " << last_subtree_map
<< "), setting maybenot flag "
// The parent dentry goes on either the tentative or the definite list.
412 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") maybe " << *parent
<< dendl
;
413 maybe
.push_front(parent
);
415 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") definitely " << *parent
<< dendl
;
416 parents
.push_front(parent
);
// Step up one level.
419 dir
= parent
->get_dir();
422 parents
.splice(parents
.begin(), maybe
);
424 dout(20) << "EMetaBlob::add_dir_context final: " << parents
<< dendl
;
// Every collected dentry must have a primary projected linkage.
425 for (const auto& dentry
: parents
) {
426 ceph_assert(dentry
->get_projected_linkage()->is_primary());
427 add_dentry(dentry
, false);
// Copy this blob's table versions onto the log segment `ls`: the visible
// statements assign ls->inotablev and ls->sessionmapv.
// NOTE(review): any conditions guarding these assignments are not visible
// in this listing.
431 void EMetaBlob::update_segment(LogSegment
*ls
)
433 // dirty inode mtimes
434 // -> handled directly by Server.cc, replay()
436 // alloc table update?
438 ls
->inotablev
= inotablev
;
440 ls
->sessionmapv
= sessionmapv
;
443 // -> handled directly by Server.cc
446 // note the newest request per client
447 //if (!client_reqs.empty())
448 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
451 // EMetaBlob::fullbit
// Serialise this fullbit into `bl`. `features` is passed to the encoders of
// the inode and of old_inodes. The visible statements encode the inode, a
// zero __u32, dirfragtree (directories only), old_inodes (when present and
// non-empty), oldest_snap and alternate_name.
// NOTE(review): ENCODE_START(9, 5, bl) presumably means struct version 9,
// compat version 5 -- confirm against the encoding macros. Several encode
// calls and the closing macro are not visible in this listing.
453 void EMetaBlob::fullbit::encode(bufferlist
& bl
, uint64_t features
) const {
454 ENCODE_START(9, 5, bl
);
459 encode(*inode
, bl
, features
);
463 encode((__u32
)0, bl
);
465 if (inode
->is_symlink())
467 if (inode
->is_dir()) {
468 encode(dirfragtree
, bl
);
// A null or empty old_inodes map takes the first branch.
472 if (!old_inodes
|| old_inodes
->empty()) {
476 encode(*old_inodes
, bl
, features
);
478 if (!inode
->is_dir())
480 encode(oldest_snap
, bl
);
481 encode(alternate_name
, bl
);
// Deserialise a fullbit from `bl`. Freshly allocated inode, xattr map and
// (when flagged present) old-inode map objects are filled in and then moved
// into the members.
// NOTE(review): the DECODE_START line, version checks and several decode
// calls are not visible in this listing.
485 void EMetaBlob::fullbit::decode(bufferlist::const_iterator
&bl
) {
492 auto _inode
= CInode::allocate_inode();
494 inode
= std::move(_inode
);
// xattrs are decoded into a temporary map, then moved into a new allocation.
497 CInode::mempool_xattr_map tmp
;
498 decode_noshare(tmp
, bl
);
500 xattrs
= CInode::allocate_xattr_map(std::move(tmp
));
502 if (inode
->is_symlink())
504 if (inode
->is_dir()) {
505 decode(dirfragtree
, bl
);
// A boolean flag precedes the optional old_inodes map.
509 bool old_inodes_present
;
510 decode(old_inodes_present
, bl
);
511 if (old_inodes_present
) {
512 auto _old_inodes
= CInode::allocate_old_inode_map();
513 decode(*_old_inodes
, bl
);
514 old_inodes
= std::move(_old_inodes
);
516 if (!inode
->is_dir()) {
519 decode(oldest_snap
, bl
);
521 decode(alternate_name
, bl
);
// Write a structured description of this fullbit to the formatter `f`:
// dentry name, snapid range, dentry version, then "inode" and "xattrs"
// sections, symlink target or directory details, the state string, any old
// inodes and the alternate name.
// NOTE(review): the statements that fill the "inode" sections are not
// visible in this listing.
526 void EMetaBlob::fullbit::dump(Formatter
*f
) const
528 f
->dump_string("dentry", dn
);
529 f
->dump_stream("snapid.first") << dnfirst
;
530 f
->dump_stream("snapid.last") << dnlast
;
531 f
->dump_int("dentry version", dnv
);
532 f
->open_object_section("inode");
534 f
->close_section(); // inode
535 f
->open_object_section("xattrs");
// Each xattr value is copied into a std::string using its explicit length.
537 for (const auto &p
: *xattrs
) {
538 std::string
s(p
.second
.c_str(), p
.second
.length());
539 f
->dump_string(p
.first
.c_str(), s
);
542 f
->close_section(); // xattrs
543 if (inode
->is_symlink()) {
544 f
->dump_string("symlink", symlink
);
546 if (inode
->is_dir()) {
547 f
->dump_stream("frag tree") << dirfragtree
;
548 f
->dump_string("has_snapbl", snapbl
.length() ? "true" : "false");
549 if (inode
->has_layout()) {
550 f
->open_object_section("file layout policy");
// Only a fixed placeholder string is emitted; the layout itself is not dumped.
552 f
->dump_string("layout", "the layout exists");
553 f
->close_section(); // file layout policy
556 f
->dump_string("state", state_string());
557 if (old_inodes
&& !old_inodes
->empty()) {
558 f
->open_array_section("old inodes");
559 for (const auto &p
: *old_inodes
) {
560 f
->open_object_section("inode");
561 f
->dump_int("snapid", p
.first
);
563 f
->close_section(); // inode
565 f
->close_section(); // old inodes
567 f
->dump_string("alternate_name", alternate_name
);
// Append one sample fullbit, built from a freshly allocated inode, a fresh
// xattr map and an empty snap blob, to `ls`. The list receives a raw
// pointer to the heap-allocated sample.
// NOTE(review): the declaration of `fragtree` and the end of the
// constructor call are not visible in this listing.
570 void EMetaBlob::fullbit::generate_test_instances(std::list
<EMetaBlob::fullbit
*>& ls
)
572 auto _inode
= CInode::allocate_inode();
574 auto _xattrs
= CInode::allocate_xattr_map();
575 bufferlist empty_snapbl
;
576 fullbit
*sample
= new fullbit("/testdn", "", 0, 0, 0,
577 _inode
, fragtree
, _xattrs
, "", 0, empty_snapbl
,
579 ls
.push_back(sample
);
// Apply the state carried by this fullbit to the in-memory inode `in`.
// The inode, xattrs and old_inodes members are moved into `in`, so this
// fullbit no longer holds them afterwards. `mds` is used for the cache
// lookup when trimming dirfrags and for cluster-log error reporting.
// NOTE(review): several conditions and else branches are not visible in
// this listing.
582 void EMetaBlob::fullbit::update_inode(MDSRank
*mds
, CInode
*in
)
584 in
->reset_inode(std::move(inode
));
585 in
->reset_xattrs(std::move(xattrs
));
587 if (is_export_ephemeral_random()) {
588 dout(15) << "random ephemeral pin on " << *in
<< dendl
;
589 in
->set_ephemeral_pin(false, true);
591 in
->maybe_export_pin();
// Only replace the dirfragtree (and force dirfrags) when it actually differs.
592 if (!(in
->dirfragtree
== dirfragtree
)) {
593 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in
->dirfragtree
<< " -> "
594 << dirfragtree
<< " on " << *in
<< dendl
;
595 in
->dirfragtree
= std::move(dirfragtree
);
596 in
->force_dirfrags();
// With undefined authority, close nested dirfrags that are empty and that
// the cache reports as trimmable.
597 if (in
->get_num_dirfrags() && in
->authority() == CDIR_AUTH_UNDEF
) {
598 auto&& ls
= in
->get_nested_dirfrags();
599 for (const auto& dir
: ls
) {
600 if (dir
->get_num_any() == 0 &&
601 mds
->mdcache
->can_trim_non_auth_dirfrag(dir
)) {
602 dout(10) << " closing empty non-auth dirfrag " << *dir
<< dendl
;
603 in
->close_dirfrag(dir
->get_frag());
608 } else if (in
->is_symlink()) {
609 in
->symlink
= symlink
;
611 in
->reset_old_inodes(std::move(old_inodes
));
// Raise in->first so that it lies past the key of the last old inode.
612 if (in
->is_any_old_inodes()) {
613 snapid_t min_first
= in
->get_old_inodes()->rbegin()->first
+ 1;
614 if (min_first
> in
->first
)
615 in
->first
= min_first
;
619 * we can do this before linking hte inode bc the split_at would
620 * be a no-op.. we have no children (namely open snaprealms) to
623 in
->oldest_snap
= oldest_snap
;
624 in
->decode_snap_blob(snapbl
);
627 * In case there was anything malformed in the journal that we are
628 * replaying, do sanity checks on the inodes we're replaying and
629 * go damaged instead of letting any trash into a live cache
632 // Files must have valid layouts with a pool set
633 if (in
->get_inode()->layout
.pool_id
== -1 ||
634 !in
->get_inode()->layout
.is_valid()) {
635 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
636 << ": " << in
->get_inode()->layout
<< dendl
;
637 CachedStackStringStream css
;
638 *css
<< "Invalid layout for inode " << in
->ino() << " in journal";
639 mds
->clog
->error() << css
->strv();
641 ceph_abort(); // Should be unreachable because damaged() calls respawn()
646 // EMetaBlob::remotebit
// Serialise this remotebit into `bl`; the last visible field written is
// alternate_name.
// NOTE(review): ENCODE_START(3, 2, bl) presumably means struct version 3,
// compat version 2 -- confirm. The other encode calls are not visible in
// this listing.
648 void EMetaBlob::remotebit::encode(bufferlist
& bl
) const
650 ENCODE_START(3, 2, bl
);
658 encode(alternate_name
, bl
);
// Deserialise a remotebit from `bl`; the last visible field read is
// alternate_name.
// NOTE(review): the other decode calls, and any version guard around
// alternate_name, are not visible in this listing.
662 void EMetaBlob::remotebit::decode(bufferlist::const_iterator
&bl
)
664 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
673 decode(alternate_name
, bl
);
// Write a structured description of this remotebit to the formatter `f`:
// dentry name, snapid range, dentry version, inode number, a textual
// d_type, the dirty flag and the alternate name.
// NOTE(review): the switch statement and its case labels are not visible
// in this listing; only the assignments to type_string are.
677 void EMetaBlob::remotebit::dump(Formatter
*f
) const
679 f
->dump_string("dentry", dn
);
680 f
->dump_int("snapid.first", dnfirst
);
681 f
->dump_int("snapid.last", dnlast
);
682 f
->dump_int("dentry version", dnv
);
683 f
->dump_int("inodeno", ino
);
684 uint32_t type
= DTTOIF(d_type
) & S_IFMT
; // convert to type entries
688 type_string
= "file"; break;
690 type_string
= "symlink"; break;
692 type_string
= "directory"; break;
694 type_string
= "fifo"; break;
696 type_string
= "chr"; break;
698 type_string
= "blk"; break;
700 type_string
= "sock"; break;
// NOTE(review): plain assert() here, while the rest of the file uses
// ceph_assert(); confirm whether that difference is intentional.
702 assert (0 == "unknown d_type!");
704 f
->dump_string("d_type", type_string
);
705 f
->dump_string("dirty", dirty
? "true" : "false");
706 f
->dump_string("alternate_name", alternate_name
);
709 void EMetaBlob::remotebit::
710 generate_test_instances(std::list
<EMetaBlob::remotebit
*>& ls
)
712 remotebit
*remote
= new remotebit("/test/dn", "", 0, 10, 15, 1, IFTODT(S_IFREG
), false);
713 ls
.push_back(remote
);
714 remote
= new remotebit("/test/dn2", "foo", 0, 10, 15, 1, IFTODT(S_IFREG
), false);
715 ls
.push_back(remote
);
718 // EMetaBlob::nullbit
// Serialise this nullbit into `bl`.
// NOTE(review): ENCODE_START(2, 2, bl) presumably means struct version 2,
// compat version 2 -- confirm. The field encodes are not visible in this
// listing.
720 void EMetaBlob::nullbit::encode(bufferlist
& bl
) const
722 ENCODE_START(2, 2, bl
);
// Deserialise a nullbit from `bl`.
// NOTE(review): the field decodes are not visible in this listing.
731 void EMetaBlob::nullbit::decode(bufferlist::const_iterator
&bl
)
733 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
742 void EMetaBlob::nullbit::dump(Formatter
*f
) const
744 f
->dump_string("dentry", dn
);
745 f
->dump_int("snapid.first", dnfirst
);
746 f
->dump_int("snapid.last", dnlast
);
747 f
->dump_int("dentry version", dnv
);
748 f
->dump_string("dirty", dirty
? "true" : "false");
751 void EMetaBlob::nullbit::generate_test_instances(std::list
<nullbit
*>& ls
)
753 nullbit
*sample
= new nullbit("/test/dentry", 0, 10, 15, false);
754 nullbit
*sample2
= new nullbit("/test/dirty", 10, 20, 25, true);
755 ls
.push_back(sample
);
756 ls
.push_back(sample2
);
759 // EMetaBlob::dirlump
// Serialise this dirlump into `bl`. `features` is handed to
// _encode_bits(), which is called before the (not visible) remainder of the
// encoding.
// NOTE(review): the field encodes and the closing macro are not visible in
// this listing.
761 void EMetaBlob::dirlump::encode(bufferlist
& bl
, uint64_t features
) const
763 ENCODE_START(2, 2, bl
);
769 _encode_bits(features
);
// Deserialise a dirlump from `bl`. A freshly allocated fnode is moved into
// the member, and dn_decoded is reset so that the dentry bits are decoded
// only on demand.
// NOTE(review): the individual decode calls are not visible in this
// listing.
774 void EMetaBlob::dirlump::decode(bufferlist::const_iterator
&bl
)
776 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
)
778 auto _fnode
= CDir::allocate_fnode();
780 fnode
= std::move(_fnode
);
787 dn_decoded
= false; // don't decode bits unless we need them.
// Write a structured description of this dirlump to the formatter `f`: an
// "fnode" section, the state string, the nfull / nremote / nnull counters,
// and one array each for the full, remote and null bits.
// NOTE(review): the statements that dump the fnode and the individual bits
// are not visible in this listing.
791 void EMetaBlob::dirlump::dump(Formatter
*f
) const
// Constness is cast away to get a mutable pointer to this object; the use
// of `me` is not visible here -- presumably it triggers decoding of the
// bits before they are dumped (TODO confirm).
794 dirlump
*me
= const_cast<dirlump
*>(this);
797 f
->open_object_section("fnode");
799 f
->close_section(); // fnode
800 f
->dump_string("state", state_string());
801 f
->dump_int("nfull", nfull
);
802 f
->dump_int("nremote", nremote
);
803 f
->dump_int("nnull", nnull
);
805 f
->open_array_section("full bits");
806 for (const auto& iter
: dfull
) {
807 f
->open_object_section("fullbit");
809 f
->close_section(); // fullbit
811 f
->close_section(); // full bits
812 f
->open_array_section("remote bits");
813 for (const auto& iter
: dremote
) {
814 f
->open_object_section("remotebit");
816 f
->close_section(); // remotebit
818 f
->close_section(); // remote bits
819 f
->open_array_section("null bits");
820 for (const auto& iter
: dnull
) {
821 f
->open_object_section("null bit");
823 f
->close_section(); // null bit
825 f
->close_section(); // null bits
// Build a sample dirlump with a freshly allocated fnode.
// NOTE(review): the statement that appends `dl` to `ls` is not visible in
// this listing.
828 void EMetaBlob::dirlump::generate_test_instances(std::list
<dirlump
*>& ls
)
830 auto dl
= new dirlump();
831 dl
->fnode
= CDir::allocate_fnode();
// Serialise the whole metablob into `bl`. `features` is forwarded to the
// encoders of lump_map and roots; every other member shown is encoded
// without it. The order of the encode calls defines the wire layout and
// must stay in step with EMetaBlob::decode().
// NOTE(review): ENCODE_START(8, 5, bl) presumably means struct version 8,
// compat version 5 -- confirm. The statements between renamed_dir_frags and
// client_flushes are not visible in this listing.
838 void EMetaBlob::encode(bufferlist
& bl
, uint64_t features
) const
840 ENCODE_START(8, 5, bl
);
841 encode(lump_order
, bl
);
842 encode(lump_map
, bl
, features
);
843 encode(roots
, bl
, features
);
844 encode(table_tids
, bl
);
845 encode(opened_ino
, bl
);
846 encode(allocated_ino
, bl
);
847 encode(used_preallocated_ino
, bl
);
848 encode(preallocated_inos
, bl
);
849 encode(client_name
, bl
);
850 encode(inotablev
, bl
);
851 encode(sessionmapv
, bl
);
852 encode(truncate_start
, bl
);
853 encode(truncate_finish
, bl
);
854 encode(destroyed_inodes
, bl
);
855 encode(client_reqs
, bl
);
856 encode(renamed_dirino
, bl
);
857 encode(renamed_dir_frags
, bl
);
859 // make MDSRank use v6 format happy
865 encode(client_flushes
, bl
);
// Deserialise a metablob from `bl`, reading the members in the same order
// that EMetaBlob::encode() writes them.
// NOTE(review): the version checks that select between the alternative
// decode paths (for roots and client_reqs) are not visible in this listing.
868 void EMetaBlob::decode(bufferlist::const_iterator
&bl
)
870 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl
);
871 decode(lump_order
, bl
);
872 decode(lump_map
, bl
);
// Alternative path for roots: a non-empty rootbl buffer is turned into a
// single root entry constructed from an iterator over it.
878 if (rootbl
.length()) {
879 auto p
= rootbl
.cbegin();
880 roots
.emplace_back(p
);
883 decode(table_tids
, bl
);
884 decode(opened_ino
, bl
);
885 decode(allocated_ino
, bl
);
886 decode(used_preallocated_ino
, bl
);
887 decode(preallocated_inos
, bl
);
888 decode(client_name
, bl
);
889 decode(inotablev
, bl
);
890 decode(sessionmapv
, bl
);
891 decode(truncate_start
, bl
);
892 decode(truncate_finish
, bl
);
893 decode(destroyed_inodes
, bl
);
895 decode(client_reqs
, bl
);
// Alternative path for client_reqs: entries taken from `r` are paired with
// 0 as the second member.
900 client_reqs
.push_back(pair
<metareqid_t
,uint64_t>(r
.front(), 0));
905 decode(renamed_dirino
, bl
);
906 decode(renamed_dir_frags
, bl
);
916 decode(client_flushes
, bl
);
923 * Get all inodes touched by this metablob. Includes the 'bits' within
924 * dirlumps, and the inodes of the dirs themselves.
// Insert into `inodes` the inode number of every dirlump's directory, plus
// the inode numbers carried by that dirlump's fullbits and remotebits.
// Existing contents of `inodes` are kept.
// NOTE(review): the statement under "Decode dirlump bits" is not visible
// in this listing.
926 void EMetaBlob::get_inodes(
927 std::set
<inodeno_t
> &inodes
) const
929 // For all dirlumps in this metablob
930 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
931 // Record inode of dirlump
932 inodeno_t
const dir_ino
= i
->first
.ino
;
933 inodes
.insert(dir_ino
);
935 // Decode dirlump bits
936 dirlump
const &dl
= i
->second
;
939 // Record inodes of fullbits
940 for (const auto& iter
: dl
.get_dfull()) {
941 inodes
.insert(iter
.inode
->ino
);
944 // Record inodes of remotebits
945 for (const auto& iter
: dl
.get_dremote()) {
946 inodes
.insert(iter
.ino
);
953 * Get a map of dirfrag to set of dentries in that dirfrag which are
954 * touched in this operation.
// For every dirlump, insert the dentry names of its full, remote and null
// bits into dentries[dirfrag]. Existing contents of `dentries` are kept.
// NOTE(review): the statements between the `df` declaration and the first
// loop are not visible in this listing.
956 void EMetaBlob::get_dentries(std::map
<dirfrag_t
, std::set
<std::string
> > &dentries
) const
958 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
959 dirlump
const &dl
= i
->second
;
960 dirfrag_t
const &df
= i
->first
;
965 // For all bits, store dentry
966 for (const auto& iter
: dl
.get_dfull()) {
967 dentries
[df
].insert(iter
.dn
);
969 for (const auto& iter
: dl
.get_dremote()) {
970 dentries
[df
].insert(iter
.dn
);
972 for (const auto& iter
: dl
.get_dnull()) {
973 dentries
[df
].insert(iter
.dn
);
981 * Calculate all paths that we can infer are touched by this metablob. Only uses
982 * information local to this metablob so it may only be the path within the
// Append to `paths` the paths that can be inferred from this metablob
// alone. Three passes over lump_map:
//   1. record each dentry as a child of its dirlump's inode, and remember
//      the (parent inode, name) location of every inode seen in a fullbit;
//   2. collect "leaf" locations: remote and null dentries, plus full
//      dentries whose inode has no recorded children;
//   3. for each leaf, prepend ancestor names found through ino_locations
//      until no further ancestor is known.
// NOTE(review): the statements that follow the root special case and the
// bit-decoding calls inside the loops are not visible in this listing.
985 void EMetaBlob::get_paths(
986 std::vector
<std::string
> &paths
) const
988 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
989 typedef std::pair
<inodeno_t
, std::string
> Location
;
991 // Whenever we see a dentry within a dirlump, we remember it as a child of
992 // the dirlump's inode
993 std::map
<inodeno_t
, std::vector
<std::string
> > children
;
995 // Whenever we see a location for an inode, remember it: this allows us to
996 // build a path given an inode
997 std::map
<inodeno_t
, Location
> ino_locations
;
999 // Special case: operations on root inode populate roots but not dirlumps
1000 if (lump_map
.empty() && !roots
.empty()) {
1001 paths
.push_back("/");
1007 // Build a tiny local metadata cache for the path structure in this metablob
1008 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
1009 inodeno_t
const dir_ino
= i
->first
.ino
;
1010 dirlump
const &dl
= i
->second
;
// Only fullbits carry an inode, so only they contribute to ino_locations.
1013 for (const auto& iter
: dl
.get_dfull()) {
1014 std::string_view dentry
= iter
.dn
;
1015 children
[dir_ino
].emplace_back(dentry
);
1016 ino_locations
[iter
.inode
->ino
] = Location(dir_ino
, dentry
);
1019 for (const auto& iter
: dl
.get_dremote()) {
1020 std::string_view dentry
= iter
.dn
;
1021 children
[dir_ino
].emplace_back(dentry
);
1024 for (const auto& iter
: dl
.get_dnull()) {
1025 std::string_view dentry
= iter
.dn
;
1026 children
[dir_ino
].emplace_back(dentry
);
1030 std::vector
<Location
> leaf_locations
;
1034 // Output paths for all childless nodes in the metablob
1035 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
1036 inodeno_t
const dir_ino
= i
->first
.ino
;
1037 dirlump
const &dl
= i
->second
;
// A full dentry is a leaf only when its inode has no recorded children.
1040 for (const auto& iter
: dl
.get_dfull()) {
1041 std::string_view dentry
= iter
.dn
;
1042 if (children
.find(iter
.inode
->ino
) == children
.end()) {
1043 leaf_locations
.push_back(Location(dir_ino
, dentry
));
// Remote and null dentries are always treated as leaves.
1047 for (const auto& iter
: dl
.get_dremote()) {
1048 std::string_view dentry
= iter
.dn
;
1049 leaf_locations
.push_back(Location(dir_ino
, dentry
));
1052 for (const auto& iter
: dl
.get_dnull()) {
1053 std::string_view dentry
= iter
.dn
;
1054 leaf_locations
.push_back(Location(dir_ino
, dentry
));
1058 // For all the leaf locations identified, generate paths
1059 for (std::vector
<Location
>::iterator i
= leaf_locations
.begin(); i
!= leaf_locations
.end(); ++i
) {
1060 Location
const &loc
= *i
;
1061 std::string path
= loc
.second
;
1062 inodeno_t ino
= loc
.first
;
1063 std::map
<inodeno_t
, Location
>::iterator iter
= ino_locations
.find(ino
);
1064 while(iter
!= ino_locations
.end()) {
// Note: this `loc` shadows the leaf `loc` declared above; inside the loop
// it refers to the location of the current ancestor.
1065 Location
const &loc
= iter
->second
;
1066 if (!path
.empty()) {
1067 path
= loc
.second
+ "/" + path
;
1069 path
= loc
.second
+ path
;
// Continue with the ancestor's own parent inode.
1071 iter
= ino_locations
.find(loc
.first
);
1074 paths
.push_back(path
);
// Dump this metablob to the Formatter `f`.  In order, the visible output is:
// the dirlumps (in lump_order), the roots, the table client transactions,
// the renamed directory inode and its fragments, the inotable / SessionMap
// versions and inode allocation fields, the client name, the truncate
// start/finish lists, the destroyed inodes and the client requests.
1079 void EMetaBlob::dump(Formatter
*f
) const
1081 f
->open_array_section("lumps");
// lumps are emitted in lump_order; lump_map.at() is used to fetch each one
1082 for (const auto& d
: lump_order
) {
1083 f
->open_object_section("lump");
1084 f
->open_object_section("dirfrag");
1085 f
->dump_stream("dirfrag") << d
;
1086 f
->close_section(); // dirfrag
1087 f
->open_object_section("dirlump");
1088 lump_map
.at(d
).dump(f
);
1089 f
->close_section(); // dirlump
1090 f
->close_section(); // lump
1092 f
->close_section(); // lumps
// one "root" object section per entry of roots
1094 f
->open_array_section("roots");
1095 for (const auto& iter
: roots
) {
1096 f
->open_object_section("root");
1098 f
->close_section(); // root
1100 f
->close_section(); // roots
// NOTE(review): the section name below is spelled "tranactions"; it is part
// of the emitted output, so confirm with consumers before correcting it.
1102 f
->open_array_section("tableclient tranactions");
// each entry is dumped as a (tid = first, version = second) pair
1103 for (const auto& p
: table_tids
) {
1104 f
->open_object_section("transaction");
1105 f
->dump_int("tid", p
.first
);
1106 f
->dump_int("version", p
.second
);
1107 f
->close_section(); // transaction
1109 f
->close_section(); // tableclient transactions
1111 f
->dump_int("renamed directory inodeno", renamed_dirino
);
1113 f
->open_array_section("renamed directory fragments");
1114 for (const auto& p
: renamed_dir_frags
) {
1115 f
->dump_int("frag", p
);
1117 f
->close_section(); // renamed directory fragments
1119 f
->dump_int("inotable version", inotablev
);
1120 f
->dump_int("SessionMap version", sessionmapv
);
1121 f
->dump_int("allocated ino", allocated_ino
);
1123 f
->dump_stream("preallocated inos") << preallocated_inos
;
1124 f
->dump_int("used preallocated ino", used_preallocated_ino
);
1126 f
->open_object_section("client name");
1127 client_name
.dump(f
);
1128 f
->close_section(); // client name
1130 f
->open_array_section("inodes starting a truncate");
1131 for(const auto& ino
: truncate_start
) {
1132 f
->dump_int("inodeno", ino
);
1134 f
->close_section(); // truncate inodes
// NOTE(review): "inodes finishing a truncated" reads like a typo for
// "truncate"; it is emitted output, so left unchanged here.
1135 f
->open_array_section("inodes finishing a truncated");
// each entry is dumped as (inode number = first, starting segment = second)
1136 for(const auto& p
: truncate_finish
) {
1137 f
->open_object_section("inode+segment");
1138 f
->dump_int("inodeno", p
.first
);
1139 f
->dump_int("truncate starting segment", p
.second
);
1140 f
->close_section(); // truncated inode
1142 f
->close_section(); // truncate finish inodes
1144 f
->open_array_section("destroyed inodes");
1145 for(vector
<inodeno_t
>::const_iterator i
= destroyed_inodes
.begin();
1146 i
!= destroyed_inodes
.end(); ++i
) {
1147 f
->dump_int("inodeno", *i
);
1149 f
->close_section(); // destroyed inodes
1151 f
->open_array_section("client requests");
// each entry is dumped as (request id = first, oldest request = second)
1152 for(const auto& p
: client_reqs
) {
1153 f
->open_object_section("Client request");
1154 f
->dump_stream("request ID") << p
.first
;
1155 f
->dump_int("oldest request on client", p
.second
);
1156 f
->close_section(); // request
1158 f
->close_section(); // client requests
1161 void EMetaBlob::generate_test_instances(std::list
<EMetaBlob
*>& ls
)
1163 ls
.push_back(new EMetaBlob());
// Replay this metablob against the MDS cache on behalf of log segment
// `logseg`.  The visible steps, in order, are: roots; lookup of the renamed
// directory inode; per-dirlump replay of full, remote and null dentries;
// subtree adjustment after a directory rename; handling of inodes left in
// `unlinked`; table client transactions; the opened inode; inotable and
// sessionmap version handling; truncate start/finish; destroyed inodes;
// completed client requests and flushes; and finally update_segment().
// `peerup`, where it is used below, receives the old dirs and unlinked inodes
// that are to be preserved until peer commit.
1166 void EMetaBlob::replay(MDSRank
*mds
, LogSegment
*logseg
, MDPeerUpdate
*peerup
)
1168 dout(10) << "EMetaBlob.replay " << lump_map
.size() << " dirlumps by " << client_name
<< dendl
;
// dirty state below is recorded against logseg, so it must be provided
1170 ceph_assert(logseg
);
// NOTE(review): the mds_kill_journal_replay_at asserts (values 1..4) look like
// fault-injection points for testing — confirm against the option's docs.
1172 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 1);
// roots: look each root inode up in the cache; `isnew` records whether it was
// absent, and the log line below reports "added" vs "updated" accordingly
1174 for (auto& p
: roots
) {
1175 CInode
*in
= mds
->mdcache
->get_inode(p
.inode
->ino
);
1176 bool isnew
= in
? false:true;
1178 in
= new CInode(mds
->mdcache
, false, 2, CEPH_NOSNAP
);
1179 p
.update_inode(mds
, in
);
1182 mds
->mdcache
->add_inode(in
);
1183 if (p
.is_dirty()) in
->_mark_dirty(logseg
);
1184 dout(10) << "EMetaBlob.replay " << (isnew
? " added root ":" updated root ") << *in
<< dendl
;
// the renamed directory inode, looked up when renamed_dirino is non-zero
1187 CInode
*renamed_diri
= 0;
1189 if (renamed_dirino
) {
1190 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1192 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri
<< dendl
;
1194 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino
<< dendl
;
// accumulate the null-dentry count over the dirlumps; at most one is allowed
// by the assertion that follows the loop
1197 for (const auto& lp
: lump_order
) {
1198 dirlump
&lump
= lump_map
[lp
];
1200 dout(10) << "EMetaBlob.replay found null dentry in dir " << lp
<< dendl
;
1201 nnull
+= lump
.nnull
;
1204 ceph_assert(nnull
<= 1);
1207 // keep track of any inodes we unlink and don't relink elsewhere
1208 map
<CInode
*, CDir
*> unlinked
;
1209 set
<CInode
*> linked
;
1211 // walk through my dirs (in order!)
1213 for (const auto& lp
: lump_order
) {
1214 dout(10) << "EMetaBlob.replay dir " << lp
<< dendl
;
1215 dirlump
&lump
= lump_map
[lp
];
// fetch the dirfrag for this lump through get_force_dirfrag()
1218 CDir
*dir
= mds
->mdcache
->get_force_dirfrag(lp
, true);
1220 // hmm. do i have the inode?
1221 CInode
*diri
= mds
->mdcache
->get_inode((lp
).ino
);
// an mdsdir inode is created as a non-auth system inode; it must not be
// this rank's own mdsdir (asserted)
1223 if (MDS_INO_IS_MDSDIR(lp
.ino
)) {
1224 ceph_assert(MDS_INO_MDSDIR(mds
->get_nodeid()) != lp
.ino
);
1225 diri
= mds
->mdcache
->create_system_inode(lp
.ino
, S_IFDIR
|0755);
1226 diri
->state_clear(CInode::STATE_AUTH
);
1227 dout(10) << "EMetaBlob.replay created base " << *diri
<< dendl
;
1229 dout(0) << "EMetaBlob.replay missing dir ino " << lp
.ino
<< dendl
;
1230 mds
->clog
->error() << "failure replaying journal (EMetaBlob)";
1232 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1236 // create the dirfrag
1237 dir
= diri
->get_or_open_dirfrag(mds
->mdcache
, lp
.frag
);
1239 if (MDS_INO_IS_BASE(lp
.ino
))
1240 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1242 dout(10) << "EMetaBlob.replay added dir " << *dir
<< dendl
;
// install the journaled fnode on the dirfrag (the lump's fnode is moved from)
1244 dir
->reset_fnode(std::move(lump
.fnode
));
1245 dir
->update_projected_version();
1247 if (lump
.is_importing()) {
1248 dir
->state_set(CDir::STATE_AUTH
);
1249 dir
->state_clear(CDir::STATE_COMPLETE
);
1251 if (lump
.is_dirty()) {
1252 dir
->_mark_dirty(logseg
);
// rstat differing from accounted_rstat marks the nestlock scatterlock updated
// and queues the inode on the segment's dirty_dirfrag_nest list
1254 if (!(dir
->get_fnode()->rstat
== dir
->get_fnode()->accounted_rstat
)) {
1255 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir
<< dendl
;
1256 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->nestlock
);
1257 logseg
->dirty_dirfrag_nest
.push_back(&dir
->inode
->item_dirty_dirfrag_nest
);
1259 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir
<< dendl
;
// same treatment for fragstat vs accounted_fragstat, using the filelock
1261 if (!(dir
->get_fnode()->fragstat
== dir
->get_fnode()->accounted_fragstat
)) {
1262 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir
<< dendl
;
1263 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->filelock
);
1264 logseg
->dirty_dirfrag_dir
.push_back(&dir
->inode
->item_dirty_dirfrag_dir
);
1266 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir
<< dendl
;
1269 if (lump
.is_dirty_dft()) {
1270 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir
<< dendl
;
1271 dir
->state_set(CDir::STATE_DIRTYDFT
);
1272 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->dirfragtreelock
);
1273 logseg
->dirty_dirfrag_dirfragtree
.push_back(&dir
->inode
->item_dirty_dirfrag_dirfragtree
);
1276 dir
->mark_new(logseg
);
1277 if (lump
.is_complete())
1278 dir
->mark_complete();
1280 dout(10) << "EMetaBlob.replay updated dir " << *dir
<< dendl
;
// decode the lump's dentry lists before iterating them
1283 lump
._decode_bits();
1285 // full dentry+inode pairs
1286 for (auto& fb
: lump
._get_dfull()) {
1287 CDentry
*dn
= dir
->lookup_exact_snap(fb
.dn
, fb
.dnlast
);
1289 dn
= dir
->add_null_dentry(fb
.dn
, fb
.dnfirst
, fb
.dnlast
);
1290 dn
->set_version(fb
.dnv
);
1291 if (fb
.is_dirty()) dn
->_mark_dirty(logseg
);
1292 dout(10) << "EMetaBlob.replay added (full) " << *dn
<< dendl
;
1294 dn
->set_version(fb
.dnv
);
1295 if (fb
.is_dirty()) dn
->_mark_dirty(logseg
);
1296 dout(10) << "EMetaBlob.replay for [" << fb
.dnfirst
<< "," << fb
.dnlast
<< "] had " << *dn
<< dendl
;
1297 dn
->first
= fb
.dnfirst
;
1298 ceph_assert(dn
->last
== fb
.dnlast
);
1300 if (lump
.is_importing())
// look the inode up by (ino, dnlast); a fresh CInode is built from the
// journaled data and added to the cache
1303 CInode
*in
= mds
->mdcache
->get_inode(fb
.inode
->ino
, fb
.dnlast
);
1305 in
= new CInode(mds
->mdcache
, dn
->is_auth(), fb
.dnfirst
, fb
.dnlast
);
1306 fb
.update_inode(mds
, in
);
1307 mds
->mdcache
->add_inode(in
);
// a dentry that is already linked is unlinked first; a primary link to a
// different inode is remembered in `unlinked` and reported as a warning
1308 if (!dn
->get_linkage()->is_null()) {
1309 if (dn
->get_linkage()->is_primary()) {
1310 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1311 CachedStackStringStream css
;
1312 *css
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1313 << " " << *dn
->get_linkage()->get_inode() << " should be " << in
->ino();
1314 dout(0) << css
->strv() << dendl
;
1315 mds
->clog
->warn() << css
->strv();
1317 dir
->unlink_inode(dn
, false);
1319 if (unlinked
.count(in
))
1321 dir
->link_primary_inode(dn
, in
);
1322 dout(10) << "EMetaBlob.replay added " << *in
<< dendl
;
// inode already known: refresh it from the journal, and detach it from its
// current parent dentry when that is not `dn`
1324 in
->first
= fb
.dnfirst
;
1325 fb
.update_inode(mds
, in
);
1326 if (dn
->get_linkage()->get_inode() != in
&& in
->get_parent_dn()) {
1327 dout(10) << "EMetaBlob.replay unlinking " << *in
<< dendl
;
1328 unlinked
[in
] = in
->get_parent_dir();
1329 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
1331 if (dn
->get_linkage()->get_inode() != in
) {
1332 if (!dn
->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1333 if (dn
->get_linkage()->is_primary()) {
1334 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1335 CachedStackStringStream css
;
1336 *css
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1337 << " " << *dn
->get_linkage()->get_inode() << " should be " << in
->ino();
1338 dout(0) << css
->strv() << dendl
;
1339 mds
->clog
->warn() << css
->strv();
1341 dir
->unlink_inode(dn
, false);
1343 if (unlinked
.count(in
))
1345 dir
->link_primary_inode(dn
, in
);
1346 dout(10) << "EMetaBlob.replay linked " << *in
<< dendl
;
1348 dout(10) << "EMetaBlob.replay for [" << fb
.dnfirst
<< "," << fb
.dnlast
<< "] had " << *in
<< dendl
;
// the inode's `first` must match the journaled dnfirst, or be newer when the
// inode is multiversion
1350 ceph_assert(in
->first
== fb
.dnfirst
||
1351 (in
->is_multiversion() && in
->first
> fb
.dnfirst
));
1354 in
->_mark_dirty(logseg
);
1355 if (fb
.is_dirty_parent())
1356 in
->mark_dirty_parent(logseg
, fb
.is_dirty_pool());
1357 if (fb
.need_snapflush())
1358 logseg
->open_files
.push_back(&in
->item_open_file
);
1360 in
->state_set(CInode::STATE_AUTH
);
1362 in
->state_clear(CInode::STATE_AUTH
);
1363 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 2);
// optional fault injection: with probability do_corruption (compared against
// a random number in [0.0, 1.0]) the dentry is reported as being corrupted
1366 auto do_corruption
= mds
->get_inject_journal_corrupt_dentry_first();
1367 if (unlikely(do_corruption
> 0.0)) {
1368 auto r
= ceph::util::generate_random_number(0.0, 1.0);
1369 if (r
< do_corruption
) {
1370 dout(0) << "corrupting dn: " << *dn
<< dendl
;
// reset the heartbeat once every heartbeat_reset_grace() processed items;
// the same pattern recurs in the loops below
1376 if (!(++count
% mds
->heartbeat_reset_grace()))
1377 mds
->heartbeat_reset();
// remote dentries
1381 for (const auto& rb
: lump
.get_dremote()) {
1382 CDentry
*dn
= dir
->lookup_exact_snap(rb
.dn
, rb
.dnlast
);
1384 dn
= dir
->add_remote_dentry(rb
.dn
, rb
.ino
, rb
.d_type
, mempool::mds_co::string(rb
.alternate_name
), rb
.dnfirst
, rb
.dnlast
);
1385 dn
->set_version(rb
.dnv
);
1386 if (rb
.dirty
) dn
->_mark_dirty(logseg
);
1387 dout(10) << "EMetaBlob.replay added " << *dn
<< dendl
;
1389 if (!dn
->get_linkage()->is_null()) {
1390 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1391 if (dn
->get_linkage()->is_primary()) {
1392 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1393 CachedStackStringStream css
;
1394 *css
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1395 << " " << *dn
->get_linkage()->get_inode() << " should be remote " << rb
.ino
;
1396 dout(0) << css
->strv() << dendl
;
1398 dir
->unlink_inode(dn
, false);
1400 dn
->set_alternate_name(mempool::mds_co::string(rb
.alternate_name
));
1401 dir
->link_remote_inode(dn
, rb
.ino
, rb
.d_type
);
1402 dn
->set_version(rb
.dnv
);
1403 if (rb
.dirty
) dn
->_mark_dirty(logseg
);
1404 dout(10) << "EMetaBlob.replay for [" << rb
.dnfirst
<< "," << rb
.dnlast
<< "] had " << *dn
<< dendl
;
1405 dn
->first
= rb
.dnfirst
;
1406 ceph_assert(dn
->last
== rb
.dnlast
);
1408 if (lump
.is_importing())
1411 if (!(++count
% mds
->heartbeat_reset_grace()))
1412 mds
->heartbeat_reset();
// null dentries
1416 for (const auto& nb
: lump
.get_dnull()) {
1417 CDentry
*dn
= dir
->lookup_exact_snap(nb
.dn
, nb
.dnlast
);
1419 dn
= dir
->add_null_dentry(nb
.dn
, nb
.dnfirst
, nb
.dnlast
);
1420 dn
->set_version(nb
.dnv
);
1421 if (nb
.dirty
) dn
->_mark_dirty(logseg
);
1422 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn
<< dendl
;
1424 dn
->first
= nb
.dnfirst
;
1425 if (!dn
->get_linkage()->is_null()) {
1426 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1427 CInode
*in
= dn
->get_linkage()->get_inode();
1428 // For renamed inode, We may call CInode::force_dirfrag() later.
1429 // CInode::force_dirfrag() doesn't work well when inode is detached
1430 // from the hierarchy.
1431 if (!renamed_diri
|| renamed_diri
!= in
) {
1432 if (dn
->get_linkage()->is_primary())
1434 dir
->unlink_inode(dn
);
1437 dn
->set_version(nb
.dnv
);
1438 if (nb
.dirty
) dn
->_mark_dirty(logseg
);
1439 dout(10) << "EMetaBlob.replay had " << *dn
<< dendl
;
1440 ceph_assert(dn
->last
== nb
.dnlast
);
1443 if (lump
.is_importing())
1446 // Make null dentries the first things we trim
1447 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn
<< dendl
;
1449 if (!(++count
% mds
->heartbeat_reset_grace()))
1450 mds
->heartbeat_reset();
1454 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 3);
// directory rename: adjust subtrees for the renamed directory inode
1456 if (renamed_dirino
) {
// a renamed dir we already had must have been both unlinked and relinked
// above; its old parent dirfrag is taken from `unlinked`
1458 ceph_assert(unlinked
.count(renamed_diri
));
1459 ceph_assert(linked
.count(renamed_diri
));
1460 olddir
= unlinked
[renamed_diri
];
1462 // we imported a diri we haven't seen before
1463 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1464 ceph_assert(renamed_diri
); // it was in the metablob
1468 if (olddir
->authority() != CDIR_AUTH_UNDEF
&&
1469 renamed_diri
->authority() == CDIR_AUTH_UNDEF
) {
1470 ceph_assert(peerup
); // auth to non-auth, must be peer prepare
1472 renamed_diri
->dirfragtree
.get_leaves(leaves
);
1473 for (const auto& leaf
: leaves
) {
1474 CDir
*dir
= renamed_diri
->get_dirfrag(leaf
);
1476 if (dir
->get_dir_auth() == CDIR_AUTH_UNDEF
)
1477 // preserve subtree bound until peer commit
1478 peerup
->olddirs
.insert(dir
->inode
);
1480 dir
->state_set(CDir::STATE_AUTH
);
1482 if (!(++count
% mds
->heartbeat_reset_grace()))
1483 mds
->heartbeat_reset();
1487 mds
->mdcache
->adjust_subtree_after_rename(renamed_diri
, olddir
, false);
1489 // see if we can discard the subtree we renamed out of
1490 CDir
*root
= mds
->mdcache
->get_subtree_root(olddir
);
1491 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
1492 if (peerup
) // preserve the old dir until peer commit
1493 peerup
->olddirs
.insert(olddir
->inode
);
1495 mds
->mdcache
->try_trim_non_auth_subtree(root
);
1499 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1500 if (renamed_diri
->authority() != CDIR_AUTH_UNDEF
) {
1501 for (const auto& p
: renamed_dir_frags
) {
1502 CDir
*dir
= renamed_diri
->get_dirfrag(p
);
1504 // we already had the inode before, and we already adjusted this subtree accordingly.
1505 dout(10) << " already had+adjusted rename import bound " << *dir
<< dendl
;
1506 ceph_assert(olddir
);
1509 dir
= renamed_diri
->get_or_open_dirfrag(mds
->mdcache
, p
);
1510 dout(10) << " creating new rename import bound " << *dir
<< dendl
;
1511 dir
->state_clear(CDir::STATE_AUTH
);
1512 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1514 if (!(++count
% mds
->heartbeat_reset_grace()))
1515 mds
->heartbeat_reset();
1519 // rename may overwrite an empty directory and move it into stray dir.
1520 unlinked
.erase(renamed_diri
);
1521 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1522 if (!linked
.count(p
->first
))
1524 ceph_assert(p
->first
->is_dir());
1525 mds
->mdcache
->adjust_subtree_after_rename(p
->first
, p
->second
, false);
1527 if (!(++count
% mds
->heartbeat_reset_grace()))
1528 mds
->heartbeat_reset();
// inodes still recorded in `unlinked`: handed to peerup when it is set,
// with remove_inode_recursive() visible as the other path
1532 if (!unlinked
.empty()) {
1533 for (set
<CInode
*>::iterator p
= linked
.begin(); p
!= linked
.end(); ++p
)
1535 dout(10) << " unlinked set contains " << unlinked
<< dendl
;
1536 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1537 CInode
*in
= p
->first
;
1538 if (peerup
) { // preserve unlinked inodes until peer commit
1539 peerup
->unlinked
.insert(in
);
1541 in
->snaprealm
->adjust_parent();
1543 mds
->mdcache
->remove_inode_recursive(in
);
1545 if (!(++count
% mds
->heartbeat_reset_grace()))
1546 mds
->heartbeat_reset();
1550 // table client transactions
1551 for (const auto& p
: table_tids
) {
1552 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p
.first
)
1553 << " transaction " << p
.second
<< dendl
;
1554 MDSTableClient
*client
= mds
->get_table_client(p
.first
);
1556 client
->got_journaled_agree(p
.second
, logseg
);
1558 if (!(++count
% mds
->heartbeat_reset_grace()))
1559 mds
->heartbeat_reset();
// opened inode: queue it on the segment's open_files list
1564 CInode
*in
= mds
->mdcache
->get_inode(opened_ino
);
1566 dout(10) << "EMetaBlob.replay noting opened inode " << *in
<< dendl
;
1567 logseg
->open_files
.push_back(&in
->item_open_file
);
// inotable: nothing to replay when the table is already at or past inotablev
1572 if (mds
->inotable
->get_version() >= inotablev
) {
1573 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1574 << " <= table " << mds
->inotable
->get_version() << dendl
;
1576 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1577 << " - 1 == table " << mds
->inotable
->get_version()
1578 << " allocated+used " << allocated_ino
1579 << " prealloc " << preallocated_inos
1582 mds
->inotable
->replay_alloc_id(allocated_ino
);
1583 if (preallocated_inos
.size())
1584 mds
->inotable
->replay_alloc_ids(preallocated_inos
);
1586 // repair inotable updates in case inotable wasn't persisted in time
1587 if (inotablev
> mds
->inotable
->get_version()) {
1588 mds
->clog
->error() << "journal replay inotablev mismatch "
1589 << mds
->inotable
->get_version() << " -> " << inotablev
1590 << ", will force replay it.";
1591 mds
->inotable
->force_replay_version(inotablev
);
1594 ceph_assert(inotablev
== mds
->inotable
->get_version());
// sessionmap: nothing to replay when the map is already at or past sessionmapv
1598 if (mds
->sessionmap
.get_version() >= sessionmapv
) {
1599 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1600 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
1602 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1603 << ", table " << mds
->sessionmap
.get_version()
1604 << " prealloc " << preallocated_inos
1605 << " used " << used_preallocated_ino
1607 Session
*session
= mds
->sessionmap
.get_session(client_name
);
1609 dout(20) << " (session prealloc " << session
->info
.prealloc_inos
<< ")" << dendl
;
// consume the used preallocated ino from the session; take_ino() must hand
// back exactly that ino (asserted)
1610 if (used_preallocated_ino
) {
1611 if (!session
->info
.prealloc_inos
.empty()) {
1612 inodeno_t ino
= session
->take_ino(used_preallocated_ino
);
1613 session
->info
.prealloc_inos
.erase(ino
);
1614 ceph_assert(ino
== used_preallocated_ino
);
1616 mds
->sessionmap
.replay_dirty_session(session
);
1618 if (!preallocated_inos
.empty()) {
1619 session
->free_prealloc_inos
.insert(preallocated_inos
);
1620 session
->info
.prealloc_inos
.insert(preallocated_inos
);
1621 mds
->sessionmap
.replay_dirty_session(session
);
// without a session only the sessionmap version is advanced
1624 dout(10) << "EMetaBlob.replay no session for " << client_name
<< dendl
;
1625 if (used_preallocated_ino
)
1626 mds
->sessionmap
.replay_advance_version();
1628 if (!preallocated_inos
.empty())
1629 mds
->sessionmap
.replay_advance_version();
1632 // repair sessionmap updates in case sessionmap wasn't persisted in time
// NOTE(review): this message prints "journal -> table", the reverse of the
// inotable mismatch message above ("table -> journal") — confirm intent.
1633 if (sessionmapv
> mds
->sessionmap
.get_version()) {
1634 mds
->clog
->error() << "EMetaBlob.replay sessionmapv mismatch "
1635 << sessionmapv
<< " -> " << mds
->sessionmap
.get_version()
1636 << ", will force replay it.";
1637 if (g_conf()->mds_wipe_sessions
) {
1638 mds
->sessionmap
.wipe();
1640 // force replay sessionmap version
1641 mds
->sessionmap
.set_version(sessionmapv
);
1643 ceph_assert(sessionmapv
== mds
->sessionmap
.get_version());
1647 // truncating inodes
1648 for (const auto& ino
: truncate_start
) {
1649 CInode
*in
= mds
->mdcache
->get_inode(ino
);
1651 mds
->mdcache
->add_recovered_truncate(in
, logseg
);
1653 if (!(++count
% mds
->heartbeat_reset_grace()))
1654 mds
->heartbeat_reset();
// finished truncates: p.first is the inode number, p.second identifies the
// log segment passed to get_segment()
1656 for (const auto& p
: truncate_finish
) {
1657 LogSegment
*ls
= mds
->mdlog
->get_segment(p
.second
);
1659 CInode
*in
= mds
->mdcache
->get_inode(p
.first
);
1661 mds
->mdcache
->remove_recovered_truncate(in
, ls
);
1664 if (!(++count
% mds
->heartbeat_reset_grace()))
1665 mds
->heartbeat_reset();
// destroyed inodes: drop each one found in the cache, then note the whole
// list in the open file table
1669 if (!destroyed_inodes
.empty()) {
1670 for (vector
<inodeno_t
>::iterator p
= destroyed_inodes
.begin();
1671 p
!= destroyed_inodes
.end();
1673 CInode
*in
= mds
->mdcache
->get_inode(*p
);
1675 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", dropping " << *in
<< dendl
;
1676 CDentry
*parent
= in
->get_parent_dn();
1677 mds
->mdcache
->remove_inode(in
);
1679 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent
<< dendl
;
1680 ceph_assert(parent
->get_linkage()->is_null());
1683 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", not in cache" << dendl
;
1686 if (!(++count
% mds
->heartbeat_reset_grace()))
1687 mds
->heartbeat_reset();
1689 mds
->mdcache
->open_file_table
.note_destroyed_inos(logseg
->seq
, destroyed_inodes
);
// completed client requests (only entries whose name is a client)
1693 for (const auto& p
: client_reqs
) {
1694 if (p
.first
.name
.is_client()) {
1695 dout(10) << "EMetaBlob.replay request " << p
.first
<< " trim_to " << p
.second
<< dendl
;
1696 inodeno_t created
= allocated_ino
? allocated_ino
: used_preallocated_ino
;
1697 // if we allocated an inode, there should be exactly one client request id.
1698 ceph_assert(created
== inodeno_t() || client_reqs
.size() == 1);
1700 Session
*session
= mds
->sessionmap
.get_session(p
.first
.name
);
1702 session
->add_completed_request(p
.first
.tid
, created
);
1704 session
->trim_completed_requests(p
.second
);
1708 if (!(++count
% mds
->heartbeat_reset_grace()))
1709 mds
->heartbeat_reset();
// completed client flushes (only entries whose name is a client)
1713 for (const auto& p
: client_flushes
) {
1714 if (p
.first
.name
.is_client()) {
1715 dout(10) << "EMetaBlob.replay flush " << p
.first
<< " trim_to " << p
.second
<< dendl
;
1716 Session
*session
= mds
->sessionmap
.get_session(p
.first
.name
);
1718 session
->add_completed_flush(p
.first
.tid
);
1720 session
->trim_completed_flushes(p
.second
);
1724 if (!(++count
% mds
->heartbeat_reset_grace()))
1725 mds
->heartbeat_reset();
// finally update the log segment from this metablob
1729 update_segment(logseg
);
1731 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 4);
1734 // -----------------------
1736 void EPurged::update_segment()
1738 if (inos
.size() && inotablev
)
1739 get_segment()->inotablev
= inotablev
;
// Replay a purge event: subtract `inos` from the purging_inodes of the log
// segment identified by `seq`, and release them in the inotable when the
// table version is still behind `inotablev`.
1743 void EPurged::replay(MDSRank
*mds
)
1746 LogSegment
*ls
= mds
->mdlog
->get_segment(seq
);
1748 ls
->purging_inodes
.subtract(inos
);
// table already at or past this event's version: nothing to release
1750 if (mds
->inotable
->get_version() >= inotablev
) {
1751 dout(10) << "EPurged.replay inotable " << mds
->inotable
->get_version()
1752 << " >= " << inotablev
<< ", noop" << dendl
;
// otherwise release the ids; the table must then be exactly at inotablev
1754 dout(10) << "EPurged.replay inotable " << mds
->inotable
->get_version()
1755 << " < " << inotablev
<< " " << dendl
;
1756 mds
->inotable
->replay_release_ids(inos
);
1757 ceph_assert(mds
->inotable
->get_version() == inotablev
);
// Serialize this event into `bl` inside an ENCODE_START(1, 1, ...) envelope.
// The order of the encode calls defines the on-disk layout and must match
// EPurged::decode.
1763 void EPurged::encode(bufferlist
& bl
, uint64_t features
) const
1765 ENCODE_START(1, 1, bl
);
1767 encode(inotablev
, bl
);
// Deserialize this event from `bl`; the counterpart of EPurged::encode, so
// the decode calls must stay in the same order as the encode calls.
1772 void EPurged::decode(bufferlist::const_iterator
& bl
)
1774 DECODE_START(1, bl
);
1776 decode(inotablev
, bl
);
1781 void EPurged::dump(Formatter
*f
) const
1783 f
->dump_stream("inos") << inos
;
1784 f
->dump_int("inotable version", inotablev
);
1785 f
->dump_int("segment seq", seq
);
1788 // -----------------------
1791 void ESession::update_segment()
1793 get_segment()->sessionmapv
= cmapv
;
1794 if (inos_to_free
.size() && inotablev
)
1795 get_segment()->inotablev
= inotablev
;
// Replay a session open/close event.  Inodes to purge are added to the
// segment's purging_inodes.  The sessionmap is then handled in three ways
// depending on its version relative to cmapv: already at/past cmapv (noop),
// exactly one behind (apply the open or close), or further behind (error,
// which requires mds_wipe_sessions).  Finally inos_to_free are released in
// the inotable when that table is behind inotablev.
1798 void ESession::replay(MDSRank
*mds
)
1800 if (inos_to_purge
.size())
1801 get_segment()->purging_inodes
.insert(inos_to_purge
);
1803 if (mds
->sessionmap
.get_version() >= cmapv
) {
1804 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1805 << " >= " << cmapv
<< ", noop" << dendl
;
1806 } else if (mds
->sessionmap
.get_version() + 1 == cmapv
) {
1807 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1808 << " < " << cmapv
<< " " << (open
? "open":"close") << " " << client_inst
<< dendl
;
// open: get or create the session, mark it open and attach client metadata
1811 session
= mds
->sessionmap
.get_or_add_session(client_inst
);
1812 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1813 session
->set_client_metadata(client_metadata
);
1814 dout(10) << " opened session " << session
->info
.inst
<< dendl
;
// close: look the session up by client name
1816 session
= mds
->sessionmap
.get_session(client_inst
.name
);
1817 if (session
) { // there always should be a session, but there's a bug
// a session without a connection is removed; the other visible path clears it
1818 if (session
->get_connection() == NULL
) {
1819 dout(10) << " removed session " << session
->info
.inst
<< dendl
;
1820 mds
->sessionmap
.remove_session(session
);
1823 session
->clear(); // the client has reconnected; keep the Session, but reset
1824 dout(10) << " reset session " << session
->info
.inst
<< " (they reconnected)" << dendl
;
1827 mds
->clog
->error() << "replayed stray Session close event for " << client_inst
1828 << " from time " << stamp
<< ", ignoring";
1832 mds
->sessionmap
.replay_dirty_session(session
);
1834 mds
->sessionmap
.replay_advance_version();
1836 ceph_assert(mds
->sessionmap
.get_version() == cmapv
);
// sessionmap more than one version behind: log an error; only tolerated
// when mds_wipe_sessions is set, in which case the map is wiped and forced
// to cmapv
1838 mds
->clog
->error() << "ESession.replay sessionmap v " << cmapv
1839 << " - 1 > table " << mds
->sessionmap
.get_version();
1840 ceph_assert(g_conf()->mds_wipe_sessions
);
1841 mds
->sessionmap
.wipe();
1842 mds
->sessionmap
.set_version(cmapv
);
// release freed inodes in the inotable when it is behind inotablev
1845 if (inos_to_free
.size() && inotablev
) {
1846 if (mds
->inotable
->get_version() >= inotablev
) {
1847 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1848 << " >= " << inotablev
<< ", noop" << dendl
;
1850 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1851 << " < " << inotablev
<< " " << (open
? "add":"remove") << dendl
;
1852 ceph_assert(!open
); // for now
1853 mds
->inotable
->replay_release_ids(inos_to_free
);
1854 ceph_assert(mds
->inotable
->get_version() == inotablev
);
// Serialize this event into `bl` inside an ENCODE_START(6, 5, ...) envelope.
// client_inst is encoded with `features`.  The order of the encode calls
// defines the on-disk layout and must match ESession::decode.
1861 void ESession::encode(bufferlist
&bl
, uint64_t features
) const
1863 ENCODE_START(6, 5, bl
);
1865 encode(client_inst
, bl
, features
);
1868 encode(inos_to_free
, bl
);
1869 encode(inotablev
, bl
);
1870 encode(client_metadata
, bl
);
1871 encode(inos_to_purge
, bl
);
// Deserialize this event from `bl`; the counterpart of ESession::encode.
// client_metadata is read as a bare kv_map when struct_v == 4 and as a full
// object when struct_v >= 5.
1875 void ESession::decode(bufferlist::const_iterator
&bl
)
1877 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl
);
1880 decode(client_inst
, bl
);
1883 decode(inos_to_free
, bl
);
1884 decode(inotablev
, bl
);
1885 if (struct_v
== 4) {
1886 decode(client_metadata
.kv_map
, bl
);
1887 } else if (struct_v
>= 5) {
1888 decode(client_metadata
, bl
);
1891 decode(inos_to_purge
, bl
);
1897 void ESession::dump(Formatter
*f
) const
1899 f
->dump_stream("client instance") << client_inst
;
1900 f
->dump_string("open", open
? "true" : "false");
1901 f
->dump_int("client map version", cmapv
);
1902 f
->dump_stream("inos_to_free") << inos_to_free
;
1903 f
->dump_int("inotable version", inotablev
);
1904 f
->open_object_section("client_metadata");
1905 f
->dump_stream("inos_to_purge") << inos_to_purge
;
1906 client_metadata
.dump(f
);
1907 f
->close_section(); // client_metadata
1910 void ESession::generate_test_instances(std::list
<ESession
*>& ls
)
1912 ls
.push_back(new ESession
);
1915 // -----------------------
// Serialize this event into `bl` inside an ENCODE_START(2, 1, ...) envelope.
// client_map is encoded with `features`.  The order of the encode calls
// defines the on-disk layout and must match ESessions::decode_new.
1918 void ESessions::encode(bufferlist
&bl
, uint64_t features
) const
1920 ENCODE_START(2, 1, bl
);
1921 encode(client_map
, bl
, features
);
1924 encode(client_metadata_map
, bl
);
// Decode variant that reads client_map directly, with no DECODE_START
// envelope visible here.
// NOTE(review): presumably the pre-versioning on-disk format — confirm.
1928 void ESessions::decode_old(bufferlist::const_iterator
&bl
)
1931 decode(client_map
, bl
);
// Decode the versioned format (DECODE_START(2, ...)); the counterpart of
// ESessions::encode, so the decode order must match the encode order.
1937 void ESessions::decode_new(bufferlist::const_iterator
&bl
)
1939 DECODE_START(2, bl
);
1940 decode(client_map
, bl
);
1944 decode(client_metadata_map
, bl
);
1948 void ESessions::dump(Formatter
*f
) const
1950 f
->dump_int("client map version", cmapv
);
1952 f
->open_array_section("client map");
1953 for (map
<client_t
,entity_inst_t
>::const_iterator i
= client_map
.begin();
1954 i
!= client_map
.end(); ++i
) {
1955 f
->open_object_section("client");
1956 f
->dump_int("client id", i
->first
.v
);
1957 f
->dump_stream("client entity") << i
->second
;
1958 f
->close_section(); // client
1960 f
->close_section(); // client map
1963 void ESessions::generate_test_instances(std::list
<ESessions
*>& ls
)
1965 ls
.push_back(new ESessions());
1968 void ESessions::update_segment()
1970 get_segment()->sessionmapv
= cmapv
;
// Replay a bulk session-open event: a noop when the sessionmap is already at
// or past cmapv, otherwise the sessions in client_map are opened through
// replay_open_sessions().
1973 void ESessions::replay(MDSRank
*mds
)
1975 if (mds
->sessionmap
.get_version() >= cmapv
) {
1976 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1977 << " >= " << cmapv
<< ", noop" << dendl
;
1979 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1980 << " < " << cmapv
<< dendl
;
1981 mds
->sessionmap
.replay_open_sessions(cmapv
, client_map
, client_metadata_map
);
1987 // -----------------------
// Serialize this event into `bl` inside an ENCODE_START(3, 3, ...) envelope.
// The order of the encode calls defines the on-disk layout and must match
// ETableServer::decode.
1990 void ETableServer::encode(bufferlist
& bl
, uint64_t features
) const
1992 ENCODE_START(3, 3, bl
);
1998 encode(mutation
, bl
);
2000 encode(version
, bl
);
// Deserialize this event from `bl`; the counterpart of ETableServer::encode,
// so the decode order must match the encode order.
2004 void ETableServer::decode(bufferlist::const_iterator
&bl
)
2006 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2013 decode(mutation
, bl
);
2015 decode(version
, bl
);
2019 void ETableServer::dump(Formatter
*f
) const
2021 f
->dump_int("table id", table
);
2022 f
->dump_int("op", op
);
2023 f
->dump_int("request id", reqid
);
2024 f
->dump_int("by mds", bymds
);
2025 f
->dump_int("tid", tid
);
2026 f
->dump_int("version", version
);
2029 void ETableServer::generate_test_instances(std::list
<ETableServer
*>& ls
)
2031 ls
.push_back(new ETableServer());
2035 void ETableServer::update_segment()
2037 get_segment()->tablev
[table
] = version
;
// Replay a table server event against the server for `table`.  When the
// server is already at or past `version` only a log line is visible;
// otherwise the server must be exactly one version behind (asserted), the
// operation selected by `op` is applied, and the server must end up at
// `version` (asserted).
2040 void ETableServer::replay(MDSRank
*mds
)
2042 MDSTableServer
*server
= mds
->get_table_server(table
);
2046 if (server
->get_version() >= version
) {
2047 dout(10) << "ETableServer.replay " << get_mdstable_name(table
)
2048 << " " << get_mdstableserver_opname(op
)
2049 << " event " << version
2050 << " <= table " << server
->get_version() << dendl
;
2054 dout(10) << " ETableServer.replay " << get_mdstable_name(table
)
2055 << " " << get_mdstableserver_opname(op
)
2056 << " event " << version
<< " - 1 == table " << server
->get_version() << dendl
;
2057 ceph_assert(version
-1 == server
->get_version());
// prepare: note the prepare, run it, and keep its output as the mutation
2060 case TABLESERVER_OP_PREPARE
: {
2061 server
->_note_prepare(bymds
, reqid
, true);
2063 server
->_prepare(mutation
, reqid
, bymds
, out
);
2064 mutation
= std::move(out
);
2067 case TABLESERVER_OP_COMMIT
:
2068 server
->_commit(tid
, ref_t
<MMDSTableRequest
>());
2069 server
->_note_commit(tid
, true);
2071 case TABLESERVER_OP_ROLLBACK
:
2072 server
->_rollback(tid
);
2073 server
->_note_rollback(tid
, true);
2075 case TABLESERVER_OP_SERVER_UPDATE
:
2076 server
->_server_update(mutation
);
2077 server
->_note_server_update(mutation
, true);
// any other op value is reported as an error
2080 mds
->clog
->error() << "invalid tableserver op in ETableServer";
2082 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2085 ceph_assert(version
== server
->get_version());
2090 // ---------------------
// Serialize this event into `bl` inside an ENCODE_START(3, 3, ...) envelope;
// must stay in step with ETableClient::decode.
2093 void ETableClient::encode(bufferlist
& bl
, uint64_t features
) const
2095 ENCODE_START(3, 3, bl
);
// Deserialize this event from `bl`; the counterpart of ETableClient::encode.
2103 void ETableClient::decode(bufferlist::const_iterator
&bl
)
2105 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2114 void ETableClient::dump(Formatter
*f
) const
2116 f
->dump_int("table", table
);
2117 f
->dump_int("op", op
);
2118 f
->dump_int("tid", tid
);
2121 void ETableClient::generate_test_instances(std::list
<ETableClient
*>& ls
)
2123 ls
.push_back(new ETableClient());
// Replay a table client event: the op must be TABLESERVER_OP_ACK (asserted),
// and the journaled ack for `tid` is passed to the table client.
2126 void ETableClient::replay(MDSRank
*mds
)
2128 dout(10) << " ETableClient.replay " << get_mdstable_name(table
)
2129 << " op " << get_mdstableserver_opname(op
)
2130 << " tid " << tid
<< dendl
;
2132 MDSTableClient
*client
= mds
->get_table_client(table
);
2136 ceph_assert(op
== TABLESERVER_OP_ACK
);
2137 client
->got_journaled_ack(tid
);
2141 // -----------------------
// Record this event's version for TABLE_SNAP in the segment's tablev map.
2144 void ESnap::update_segment()
2146 get_segment()->tablev[TABLE_SNAP] = version;
// Replay a snap-table event against mds->snaptable.  The "<=" log line
// covers the case where the table version is already >= the event version;
// otherwise the table must be exactly one version behind, a snap is created
// (and checked against snap.snapid) or removed, and the table version must
// then equal the event version.
2149 void ESnap::replay(MDSRank *mds)
2151 if (mds->snaptable->get_version() >= version) {
2152 dout(10) << "ESnap.replay event " << version
2153 << " <= table " << mds->snaptable->get_version() << dendl;
2157 dout(10) << " ESnap.replay event " << version
2158 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2159 ceph_assert(version-1 == mds->snaptable->get_version());
2163 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2164 ceph_assert(s == snap.snapid);
2166 mds->snaptable->remove(snap.snapid);
2169 ceph_assert(version == mds->snaptable->get_version());
2175 // -----------------------
// Serialize this update event into bl (ENCODE_START(4, 4, bl)).  The
// metablob is encoded with the caller's feature mask; client_map and
// had_peers are among the fields written.
2178 void EUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2180 ENCODE_START(4, 4, bl
);
2183 encode(metablob
, bl
, features
);
2184 encode(client_map
, bl
);
2187 encode(had_peers
, bl
);
// Deserialize an update event from bl
// (DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl)); metablob, client_map and
// had_peers are among the fields read.
2191 void EUpdate::decode(bufferlist::const_iterator
&bl
)
2193 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl
);
2197 decode(metablob
, bl
);
2198 decode(client_map
, bl
);
2202 decode(had_peers
, bl
);
// Dump this update event to the formatter: a "metablob" object section,
// then the type, the client map's byte length and version, the request id,
// and whether the update had peers (as the string "true"/"false").
2206 void EUpdate::dump(Formatter
*f
) const
2208 f
->open_object_section("metablob");
2210 f
->close_section(); // metablob
2212 f
->dump_string("type", type
);
2213 f
->dump_int("client map length", client_map
.length());
2214 f
->dump_int("client map version", cmapv
);
2215 f
->dump_stream("reqid") << reqid
;
2216 f
->dump_string("had peers", had_peers
? "true" : "false");
2219 void EUpdate::generate_test_instances(std::list
<EUpdate
*>& ls
)
2221 ls
.push_back(new EUpdate());
// Propagate this event's state to its log segment: the metablob updates the
// segment, a non-empty client_map sets the segment's sessionmapv to cmapv,
// and reqid is added to the segment's uncommitted_leaders set.
2225 void EUpdate::update_segment()
2227 auto&& segment
= get_segment();
2228 metablob
.update_segment(segment
);
2230 if (client_map
.length())
2231 segment
->sessionmapv
= cmapv
;
2234 segment
->uncommitted_leaders
.insert(reqid
);
// Replay an update event.  The metablob is replayed into the event's
// segment.  The "had peers" path records reqid as an uncommitted leader on
// both the segment and the MDCache (with an empty peer set) so that a later
// ECommitted can resolve it.  If the event carries a client map, it is only
// applied when the session map version is still behind cmapv, in which case
// sessions are reopened via replay_open_sessions().
2237 void EUpdate::replay(MDSRank
*mds
)
2239 auto&& segment
= get_segment();
2240 metablob
.replay(mds
, segment
);
2243 dout(10) << "EUpdate.replay " << reqid
<< " had peers, expecting a matching ECommitted" << dendl
;
2244 segment
->uncommitted_leaders
.insert(reqid
);
2245 set
<mds_rank_t
> peers
;
2246 mds
->mdcache
->add_uncommitted_leader(reqid
, segment
, peers
, true);
2249 if (client_map
.length()) {
2250 if (mds
->sessionmap
.get_version() >= cmapv
) {
2251 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2252 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
2254 dout(10) << "EUpdate.replay sessionmap " << mds
->sessionmap
.get_version()
2255 << " < " << cmapv
<< dendl
;
2256 // open client sessions?
2257 map
<client_t
,entity_inst_t
> cm
;
2258 map
<client_t
,client_metadata_t
> cmm
;
2259 auto blp
= client_map
.cbegin();
2264 mds
->sessionmap
.replay_open_sessions(cmapv
, cm
, cmm
);
2271 // ------------------------
// Serialize this open event into bl (ENCODE_START(4, 3, bl)); the metablob
// is encoded with the caller's feature mask and snap_inos is among the
// fields written.
2274 void EOpen::encode(bufferlist
&bl
, uint64_t features
) const {
2275 ENCODE_START(4, 3, bl
);
2277 encode(metablob
, bl
, features
);
2279 encode(snap_inos
, bl
);
// Deserialize an open event from bl; metablob and snap_inos are among the
// fields read.
// NOTE(review): the decoder is opened with version 3 while encode() opens
// with ENCODE_START(4, 3, bl) -- confirm whether the first argument here
// should track the encoder's version.
2283 void EOpen::decode(bufferlist::const_iterator
&bl
) {
2284 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2287 decode(metablob
, bl
);
2290 decode(snap_inos
, bl
);
// Dump this open event: a "metablob" object section followed by an
// "inos involved" array with one integer "ino" entry per element of inos.
2294 void EOpen::dump(Formatter
*f
) const
2296 f
->open_object_section("metablob");
2298 f
->close_section(); // metablob
2299 f
->open_array_section("inos involved");
2300 for (vector
<inodeno_t
>::const_iterator i
= inos
.begin();
2301 i
!= inos
.end(); ++i
) {
2302 f
->dump_int("ino", *i
);
2304 f
->close_section(); // inos
2307 void EOpen::generate_test_instances(std::list
<EOpen
*>& ls
)
2309 ls
.push_back(new EOpen());
2310 ls
.push_back(new EOpen());
2311 ls
.back()->add_ino(0);
// Segment-update hook for EOpen events.
2314 void EOpen::update_segment()
// Replay an open event: replay the metablob into the segment, then for each
// inode in inos and in snap_inos look it up in the cache and link its
// item_open_file into the segment's open_files list.  An inode that cannot
// be found is reported at log level 0 as "not in metablob".
2319 void EOpen::replay(MDSRank
*mds
)
2321 dout(10) << "EOpen.replay " << dendl
;
2322 auto&& segment
= get_segment();
2323 metablob
.replay(mds
, segment
);
2325 // note which segments inodes belong to, so we don't have to start rejournaling them
2326 for (const auto &ino
: inos
) {
2327 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2329 dout(0) << "EOpen.replay ino " << ino
<< " not in metablob" << dendl
;
2332 segment
->open_files
.push_back(&in
->item_open_file
);
2334 for (const auto &vino
: snap_inos
) {
2335 CInode
*in
= mds
->mdcache
->get_inode(vino
);
2337 dout(0) << "EOpen.replay ino " << vino
<< " not in metablob" << dendl
;
2340 segment
->open_files
.push_back(&in
->item_open_file
);
2345 // -----------------------
2348 void ECommitted::replay(MDSRank
*mds
)
2350 if (mds
->mdcache
->uncommitted_leaders
.count(reqid
)) {
2351 dout(10) << "ECommitted.replay " << reqid
<< dendl
;
2352 mds
->mdcache
->uncommitted_leaders
[reqid
].ls
->uncommitted_leaders
.erase(reqid
);
2353 mds
->mdcache
->uncommitted_leaders
.erase(reqid
);
2355 dout(10) << "ECommitted.replay " << reqid
<< " -- didn't see original op" << dendl
;
// Serialize this commit marker into bl (ENCODE_START(3, 3, bl)).
2359 void ECommitted::encode(bufferlist
& bl
, uint64_t features
) const
2361 ENCODE_START(3, 3, bl
);
// Deserialize a commit marker from bl
// (DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl)).
2367 void ECommitted::decode(bufferlist::const_iterator
& bl
)
2369 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2376 void ECommitted::dump(Formatter
*f
) const {
2377 f
->dump_stream("stamp") << stamp
;
2378 f
->dump_stream("reqid") << reqid
;
2381 void ECommitted::generate_test_instances(std::list
<ECommitted
*>& ls
)
2383 ls
.push_back(new ECommitted
);
2384 ls
.push_back(new ECommitted
);
2385 ls
.back()->stamp
= utime_t(1, 2);
2386 ls
.back()->reqid
= metareqid_t(entity_name_t::CLIENT(123), 456);
2389 // -----------------------
// Serialize link rollback state into bl (ENCODE_START(3, 2, bl)); was_inc,
// old_ctime, old_dir_mtime and old_dir_rctime are among the fields written.
2392 void link_rollback::encode(bufferlist
&bl
) const
2394 ENCODE_START(3, 2, bl
);
2397 encode(was_inc
, bl
);
2398 encode(old_ctime
, bl
);
2399 encode(old_dir_mtime
, bl
);
2400 encode(old_dir_rctime
, bl
);
// Deserialize link rollback state from bl
// (DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl)); was_inc, old_ctime,
// old_dir_mtime and old_dir_rctime are among the fields read.
2405 void link_rollback::decode(bufferlist::const_iterator
&bl
)
2407 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
2410 decode(was_inc
, bl
);
2411 decode(old_ctime
, bl
);
2412 decode(old_dir_mtime
, bl
);
2413 decode(old_dir_rctime
, bl
);
2419 void link_rollback::dump(Formatter
*f
) const
2421 f
->dump_stream("metareqid") << reqid
;
2422 f
->dump_int("ino", ino
);
2423 f
->dump_string("was incremented", was_inc
? "true" : "false");
2424 f
->dump_stream("old_ctime") << old_ctime
;
2425 f
->dump_stream("old_dir_mtime") << old_dir_mtime
;
2426 f
->dump_stream("old_dir_rctime") << old_dir_rctime
;
2429 void link_rollback::generate_test_instances(std::list
<link_rollback
*>& ls
)
2431 ls
.push_back(new link_rollback());
// Serialize rmdir rollback state into bl (ENCODE_START(3, 2, bl)); the
// source and destination directory/dentry-name pairs are among the fields
// written.
2434 void rmdir_rollback::encode(bufferlist
& bl
) const
2436 ENCODE_START(3, 2, bl
);
2438 encode(src_dir
, bl
);
2439 encode(src_dname
, bl
);
2440 encode(dest_dir
, bl
);
2441 encode(dest_dname
, bl
);
// Deserialize rmdir rollback state from bl
// (DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl)); the source and destination
// directory/dentry-name pairs are among the fields read.
2446 void rmdir_rollback::decode(bufferlist::const_iterator
& bl
)
2448 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
2450 decode(src_dir
, bl
);
2451 decode(src_dname
, bl
);
2452 decode(dest_dir
, bl
);
2453 decode(dest_dname
, bl
);
2459 void rmdir_rollback::dump(Formatter
*f
) const
2461 f
->dump_stream("metareqid") << reqid
;
2462 f
->dump_stream("source directory") << src_dir
;
2463 f
->dump_string("source dname", src_dname
);
2464 f
->dump_stream("destination directory") << dest_dir
;
2465 f
->dump_string("destination dname", dest_dname
);
2468 void rmdir_rollback::generate_test_instances(std::list
<rmdir_rollback
*>& ls
)
2470 ls
.push_back(new rmdir_rollback());
// Serialize one rename dentry record into bl (ENCODE_START(2, 2, bl)):
// the dirfrag with its old mtime/rctime, the remote ino and d_type, and the
// old ctime are among the fields written.
2473 void rename_rollback::drec::encode(bufferlist
&bl
) const
2475 ENCODE_START(2, 2, bl
);
2476 encode(dirfrag
, bl
);
2477 encode(dirfrag_old_mtime
, bl
);
2478 encode(dirfrag_old_rctime
, bl
);
2480 encode(remote_ino
, bl
);
2482 encode(remote_d_type
, bl
);
2483 encode(old_ctime
, bl
);
// Deserialize one rename dentry record from bl
// (DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)), reading fields in the same
// order encode() writes them.
2487 void rename_rollback::drec::decode(bufferlist::const_iterator
&bl
)
2489 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2490 decode(dirfrag
, bl
);
2491 decode(dirfrag_old_mtime
, bl
);
2492 decode(dirfrag_old_rctime
, bl
);
2494 decode(remote_ino
, bl
);
2496 decode(remote_d_type
, bl
);
2497 decode(old_ctime
, bl
);
// Dump one rename dentry record.  remote_d_type is converted with DTTOIF()
// and masked with S_IFMT, then rendered as "file", "symlink", "directory"
// or "UNKNOWN-<n>" under the key "remote dtype".
2501 void rename_rollback::drec::dump(Formatter
*f
) const
2503 f
->dump_stream("directory fragment") << dirfrag
;
2504 f
->dump_stream("directory old mtime") << dirfrag_old_mtime
;
2505 f
->dump_stream("directory old rctime") << dirfrag_old_rctime
;
2506 f
->dump_int("ino", ino
);
2507 f
->dump_int("remote ino", remote_ino
);
2508 f
->dump_string("dname", dname
);
2509 uint32_t type
= DTTOIF(remote_d_type
) & S_IFMT
; // convert to type entries
2513 type_string
= "file"; break;
2515 type_string
= "symlink"; break;
2517 type_string
= "directory"; break;
2519 type_string
= "UNKNOWN-" + stringify((int)type
); break;
2521 f
->dump_string("remote dtype", type_string
);
2522 f
->dump_stream("old ctime") << old_ctime
;
2525 void rename_rollback::drec::generate_test_instances(std::list
<drec
*>& ls
)
2527 ls
.push_back(new drec());
2528 ls
.back()->remote_d_type
= IFTODT(S_IFREG
);
// Serialize rename rollback state into bl (ENCODE_START(3, 2, bl)); the
// original source/destination records and the two snap blobs are among the
// fields written.
2531 void rename_rollback::encode(bufferlist
&bl
) const
2533 ENCODE_START(3, 2, bl
);
2535 encode(orig_src
, bl
);
2536 encode(orig_dest
, bl
);
2539 encode(srci_snapbl
, bl
);
2540 encode(desti_snapbl
, bl
);
// Deserialize rename rollback state from bl
// (DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl)).  The snap blobs are only
// read when the encoded struct version is at least 3.
2544 void rename_rollback::decode(bufferlist::const_iterator
&bl
)
2546 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
2548 decode(orig_src
, bl
);
2549 decode(orig_dest
, bl
);
2552 if (struct_v
>= 3) {
2553 decode(srci_snapbl
, bl
);
2554 decode(desti_snapbl
, bl
);
// Dump rename rollback state: the request id, three object sections
// ("original src drec", "original dest drec", "stray drec") and the ctime.
2559 void rename_rollback::dump(Formatter
*f
) const
2561 f
->dump_stream("request id") << reqid
;
2562 f
->open_object_section("original src drec");
2564 f
->close_section(); // original src drec
2565 f
->open_object_section("original dest drec");
2567 f
->close_section(); // original dest drec
2568 f
->open_object_section("stray drec");
2570 f
->close_section(); // stray drec
2571 f
->dump_stream("ctime") << ctime
;
2574 void rename_rollback::generate_test_instances(std::list
<rename_rollback
*>& ls
)
2576 ls
.push_back(new rename_rollback());
2577 ls
.back()->orig_src
.remote_d_type
= IFTODT(S_IFREG
);
2578 ls
.back()->orig_dest
.remote_d_type
= IFTODT(S_IFREG
);
2579 ls
.back()->stray
.remote_d_type
= IFTODT(S_IFREG
);
// Serialize this peer-update event into bl (ENCODE_START(3, 3, bl)); the
// commit blob is encoded with the caller's feature mask and the rollback
// buffer is among the fields written.
2582 void EPeerUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2584 ENCODE_START(3, 3, bl
);
2591 encode(commit
, bl
, features
);
2592 encode(rollback
, bl
);
// Deserialize a peer-update event from bl
// (DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl)); the rollback buffer is
// among the fields read.
2596 void EPeerUpdate::decode(bufferlist::const_iterator
&bl
)
2598 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2607 decode(rollback
, bl
);
// Dump this peer-update event: a "metablob" object section, the rollback
// buffer's byte length, and the type, request id, leader, op and original
// op fields.
2611 void EPeerUpdate::dump(Formatter
*f
) const
2613 f
->open_object_section("metablob");
2615 f
->close_section(); // metablob
2617 f
->dump_int("rollback length", rollback
.length());
2618 f
->dump_string("type", type
);
2619 f
->dump_stream("metareqid") << reqid
;
2620 f
->dump_int("leader", leader
);
2621 f
->dump_int("op", op
);
2622 f
->dump_int("original op", origop
);
2625 void EPeerUpdate::generate_test_instances(std::list
<EPeerUpdate
*>& ls
)
2627 ls
.push_back(new EPeerUpdate());
// Replay a peer-update event according to its op:
//  - OP_PREPARE: build an MDPeerUpdate from origop and the rollback buffer,
//    replay the commit blob with it, and register the request as an
//    uncommitted peer update for `leader`.
//  - OP_COMMIT: finish the uncommitted peer update.
//  - OP_ROLLBACK: replay the commit blob (which here holds the rollback) and
//    finish the uncommitted peer update.
// Any other op is reported to the cluster log as invalid; the trailing
// ceph_abort() is expected to be unreachable per its own comment.
2630 void EPeerUpdate::replay(MDSRank
*mds
)
2633 auto&& segment
= get_segment();
2635 case EPeerUpdate::OP_PREPARE
:
2636 dout(10) << "EPeerUpdate.replay prepare " << reqid
<< " for mds." << leader
2637 << ": applying commit, saving rollback info" << dendl
;
2638 su
= new MDPeerUpdate(origop
, rollback
);
2639 commit
.replay(mds
, segment
, su
);
2640 mds
->mdcache
->add_uncommitted_peer(reqid
, segment
, leader
, su
);
2643 case EPeerUpdate::OP_COMMIT
:
2644 dout(10) << "EPeerUpdate.replay commit " << reqid
<< " for mds." << leader
<< dendl
;
2645 mds
->mdcache
->finish_uncommitted_peer(reqid
, false);
2648 case EPeerUpdate::OP_ROLLBACK
:
2649 dout(10) << "EPeerUpdate.replay abort " << reqid
<< " for mds." << leader
2650 << ": applying rollback commit blob" << dendl
;
2651 commit
.replay(mds
, segment
);
2652 mds
->mdcache
->finish_uncommitted_peer(reqid
, false);
2656 mds
->clog
->error() << "invalid op in EPeerUpdate";
2658 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2663 // -----------------------
// Serialize this subtree map into bl (ENCODE_START(6, 5, bl)): the metablob
// (with the caller's feature mask), the subtree and ambiguous-subtree
// collections, the expire position and the event sequence number.
2666 void ESubtreeMap::encode(bufferlist
& bl
, uint64_t features
) const
2668 ENCODE_START(6, 5, bl
);
2670 encode(metablob
, bl
, features
);
2671 encode(subtrees
, bl
);
2672 encode(ambiguous_subtrees
, bl
);
2673 encode(expire_pos
, bl
);
2674 encode(event_seq
, bl
);
// Deserialize a subtree map from bl
// (DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl)), reading the metablob,
// subtrees, ambiguous subtrees, expire position and event sequence number.
2678 void ESubtreeMap::decode(bufferlist::const_iterator
&bl
)
2680 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl
);
2683 decode(metablob
, bl
);
2684 decode(subtrees
, bl
);
2686 decode(ambiguous_subtrees
, bl
);
2688 decode(expire_pos
, bl
);
2690 decode(event_seq
, bl
);
// Dump this subtree map: a "metablob" object section; a "subtrees" array
// with one "tree" object per subtree (its root dirfrag followed by each
// bound dirfrag); an "ambiguous subtrees" array of dirfrags; and the expire
// position.
2694 void ESubtreeMap::dump(Formatter
*f
) const
2696 f
->open_object_section("metablob");
2698 f
->close_section(); // metablob
2700 f
->open_array_section("subtrees");
2701 for(map
<dirfrag_t
,vector
<dirfrag_t
> >::const_iterator i
= subtrees
.begin();
2702 i
!= subtrees
.end(); ++i
) {
2703 f
->open_object_section("tree");
2704 f
->dump_stream("root dirfrag") << i
->first
;
2705 for (vector
<dirfrag_t
>::const_iterator j
= i
->second
.begin();
2706 j
!= i
->second
.end(); ++j
) {
2707 f
->dump_stream("bound dirfrag") << *j
;
2709 f
->close_section(); // tree
2711 f
->close_section(); // subtrees
2713 f
->open_array_section("ambiguous subtrees");
2714 for(set
<dirfrag_t
>::const_iterator i
= ambiguous_subtrees
.begin();
2715 i
!= ambiguous_subtrees
.end(); ++i
) {
2716 f
->dump_stream("dirfrag") << *i
;
2718 f
->close_section(); // ambiguous subtrees
2720 f
->dump_int("expire position", expire_pos
);
2723 void ESubtreeMap::generate_test_instances(std::list
<ESubtreeMap
*>& ls
)
2725 ls
.push_back(new ESubtreeMap());
// Replay a subtree-map event.
//
// A non-zero expire_pos that is ahead of the journaler's current expire
// position is pushed to the journaler first.
//
// If the cache already has subtrees, the journaled map is only verified
// against it.  Each journaled root must be cached, be a subtree root and be
// authoritative on this rank; its journaled bounds must match the cached
// bounds exactly; its ambiguous-import status must agree; and every cached
// subtree owned by this rank must appear in the journal.  Each mismatch is
// written to the cluster log, and with mds_debug_subtrees set any mismatch
// trips the assert after the journal and cache views have been logged.
//
// The reconstruction path replays the metablob, re-registers ambiguous
// imports and bounded subtree authority from the journaled map, then
// recalculates auth bits and shows the resulting subtrees.
2728 void ESubtreeMap::replay(MDSRank
*mds
)
2730 if (expire_pos
&& expire_pos
> mds
->mdlog
->journaler
->get_expire_pos())
2731 mds
->mdlog
->journaler
->set_expire_pos(expire_pos
);
2733 // suck up the subtree map?
2734 if (mds
->mdcache
->is_subtrees()) {
2735 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl
;
2738 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2739 p
!= subtrees
.end();
2741 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2743 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2744 << " subtree root " << p
->first
<< " not in cache";
2749 if (!mds
->mdcache
->is_subtree(dir
)) {
2750 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2751 << " subtree root " << p
->first
<< " not a subtree in cache";
2755 if (dir
->get_dir_auth().first
!= mds
->get_nodeid()) {
2756 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2757 << " subtree root " << p
->first
2758 << " is not mine in cache (it's " << dir
->get_dir_auth() << ")";
2763 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
2764 mds
->mdcache
->get_force_dirfrag(*q
, true);
2767 mds
->mdcache
->get_subtree_bounds(dir
, bounds
);
2768 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
2769 CDir
*b
= mds
->mdcache
->get_dirfrag(*q
);
2771 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2772 << " subtree " << p
->first
<< " bound " << *q
<< " not in cache";
2776 if (bounds
.count(b
) == 0) {
2777 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2778 << " subtree " << p
->first
<< " bound " << *q
<< " not a bound in cache";
2784 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
) {
2785 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2786 << " subtree " << p
->first
<< " has extra bound in cache " << (*q
)->dirfrag();
2790 if (ambiguous_subtrees
.count(p
->first
)) {
2791 if (!mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2792 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2793 << " subtree " << p
->first
<< " is ambiguous but is not in our cache";
2797 if (mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2798 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2799 << " subtree " << p
->first
<< " is not ambiguous but is in our cache";
2805 std::vector
<CDir
*> dirs
;
2806 mds
->mdcache
->get_subtrees(dirs
);
2807 for (const auto& dir
: dirs
) {
2808 if (dir
->get_dir_auth().first
!= mds
->get_nodeid())
2810 if (subtrees
.count(dir
->dirfrag()) == 0) {
2811 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2812 << " does not include cache subtree " << dir
->dirfrag();
2818 dout(0) << "journal subtrees: " << subtrees
<< dendl
;
2819 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees
<< dendl
;
2820 mds
->mdcache
->show_subtrees();
2821 ceph_assert(!g_conf()->mds_debug_subtrees
|| errors
== 0);
2826 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl
;
2828 // first, stick the spanning tree in my cache
2829 //metablob.print(*_dout);
2830 metablob
.replay(mds
, get_segment());
2832 // restore import/export maps
2833 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2834 p
!= subtrees
.end();
2836 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2838 if (ambiguous_subtrees
.count(p
->first
)) {
2840 mds
->mdcache
->add_ambiguous_import(p
->first
, p
->second
);
2841 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
,
2842 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2845 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
, mds
->get_nodeid());
2849 mds
->mdcache
->recalc_auth_bits(true);
2851 mds
->mdcache
->show_subtrees();
2856 // -----------------------
// Replay a directory fragmentation event for inode `ino` at `basefrag`.
//
// The visible paths are:
//  - recording an uncommitted fragment (with orig_frags and the rollback
//    data) and refragmenting the cached inode by `bits`;
//  - undoing a fragmentation: either by applying -bits (old-format events
//    with no orig_frags) or by forcing each original fragment back, then
//    rolling back the uncommitted fragment with the leaves that were under
//    basefrag beforehand;
//  - finishing the uncommitted fragment.
// Afterwards the metablob is replayed and, when the inode is cached and
// mds_debug_frag is set, its dirfrags are verified.
2859 void EFragment::replay(MDSRank
*mds
)
2861 dout(10) << "EFragment.replay " << op_name(op
) << " " << ino
<< " " << basefrag
<< " by " << bits
<< dendl
;
2863 std::vector
<CDir
*> resultfrags
;
2864 MDSContext::vec waiters
;
2866 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2867 // it will be once we replay the metablob , but first we need to
2868 // refragment anything we already have in the cache.
2869 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2871 auto&& segment
= get_segment();
2874 mds
->mdcache
->add_uncommitted_fragment(dirfrag_t(ino
, basefrag
), bits
, orig_frags
, segment
, &rollback
);
2877 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, bits
, &resultfrags
, waiters
, true);
2881 frag_vec_t old_frags
;
2883 in
->dirfragtree
.get_leaves_under(basefrag
, old_frags
);
2884 if (orig_frags
.empty()) {
2885 // old format EFragment
2886 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, -bits
, &resultfrags
, waiters
, true);
2888 for (const auto& fg
: orig_frags
)
2889 mds
->mdcache
->force_dir_fragment(in
, fg
);
2892 mds
->mdcache
->rollback_uncommitted_fragment(dirfrag_t(ino
, basefrag
), std::move(old_frags
));
2898 mds
->mdcache
->finish_uncommitted_fragment(dirfrag_t(ino
, basefrag
), op
);
2905 metablob
.replay(mds
, segment
);
2906 if (in
&& g_conf()->mds_debug_frag
)
2907 in
->verify_dirfrags();
// Serialize this fragment event into bl (ENCODE_START(5, 4, bl)); basefrag,
// the metablob (with the caller's feature mask), orig_frags and the
// rollback buffer are among the fields written.
2910 void EFragment::encode(bufferlist
&bl
, uint64_t features
) const {
2911 ENCODE_START(5, 4, bl
);
2915 encode(basefrag
, bl
);
2917 encode(metablob
, bl
, features
);
2918 encode(orig_frags
, bl
);
2919 encode(rollback
, bl
);
// Deserialize a fragment event from bl
// (DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl)).  orig_frags and rollback
// are only read when the encoded struct version is at least 5.
2923 void EFragment::decode(bufferlist::const_iterator
&bl
) {
2924 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl
);
2930 decode(basefrag
, bl
);
2932 decode(metablob
, bl
);
2933 if (struct_v
>= 5) {
2934 decode(orig_frags
, bl
);
2935 decode(rollback
, bl
);
2940 void EFragment::dump(Formatter
*f
) const
2942 /*f->open_object_section("Metablob");
2943 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2944 f->close_section();*/
2945 f
->dump_string("op", op_name(op
));
2946 f
->dump_stream("ino") << ino
;
2947 f
->dump_stream("base frag") << basefrag
;
2948 f
->dump_int("bits", bits
);
// Append two EFragment samples to ls: an untouched one, then one whose op
// is OP_PREPARE and whose bits is 5.
2951 void EFragment::generate_test_instances(std::list
<EFragment
*>& ls
)
2953 ls
.push_back(new EFragment
);
2954 ls
.push_back(new EFragment
);
2955 ls
.back()->op
= OP_PREPARE
;
2957 ls
.back()->bits
= 5;
// Serialize dirfrag rollback state into bl (ENCODE_START(1, 1, bl)).
2960 void dirfrag_rollback::encode(bufferlist
&bl
) const
2962 ENCODE_START(1, 1, bl
);
// Deserialize dirfrag rollback state from bl (DECODE_START(1, bl)).  The
// fnode is decoded into a freshly allocated object, which is then moved
// into the `fnode` member.
2967 void dirfrag_rollback::decode(bufferlist::const_iterator
&bl
)
2969 DECODE_START(1, bl
);
2971 auto _fnode
= CDir::allocate_fnode();
2972 decode(*_fnode
, bl
);
2973 fnode
= std::move(_fnode
);
2980 // =========================================================================
2982 // -----------------------
// Replay an export event: replay the metablob, look up the exported base
// dirfrag and each bound dirfrag in the cache, mark the bounded subtree's
// authority as CDIR_AUTH_UNDEF, and try to trim the now non-auth subtree.
2985 void EExport::replay(MDSRank
*mds
)
2987 dout(10) << "EExport.replay " << base
<< dendl
;
2988 auto&& segment
= get_segment();
2989 metablob
.replay(mds
, segment
);
2991 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2994 set
<CDir
*> realbounds
;
2995 for (set
<dirfrag_t
>::iterator p
= bounds
.begin();
2998 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
3000 realbounds
.insert(bd
);
3004 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
, CDIR_AUTH_UNDEF
);
3006 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
// Serialize this export event into bl (ENCODE_START(4, 3, bl)); the
// metablob is encoded with the caller's feature mask.
3009 void EExport::encode(bufferlist
& bl
, uint64_t features
) const
3011 ENCODE_START(4, 3, bl
);
3013 encode(metablob
, bl
, features
);
// Deserialize an export event from bl; the metablob is among the fields
// read.
// NOTE(review): the decoder is opened with version 3 while encode() opens
// with ENCODE_START(4, 3, bl) -- confirm whether the first argument here
// should track the encoder's version.
3020 void EExport::decode(bufferlist::const_iterator
&bl
)
3022 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
3025 decode(metablob
, bl
);
3033 void EExport::dump(Formatter
*f
) const
3035 f
->dump_float("stamp", (double)stamp
);
3036 /*f->open_object_section("Metablob");
3037 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
3038 f->close_section();*/
3039 f
->dump_stream("base dirfrag") << base
;
3040 f
->open_array_section("bounds dirfrags");
3041 for (set
<dirfrag_t
>::const_iterator i
= bounds
.begin();
3042 i
!= bounds
.end(); ++i
) {
3043 f
->dump_stream("dirfrag") << *i
;
3045 f
->close_section(); // bounds dirfrags
3048 void EExport::generate_test_instances(std::list
<EExport
*>& ls
)
3050 EExport
*sample
= new EExport();
3051 ls
.push_back(sample
);
3055 // -----------------------
3058 void EImportStart::update_segment()
3060 get_segment()->sessionmapv
= cmapv
;
// Replay the start of an import: replay the metablob, register base/bounds
// as an ambiguous import, and give the bounded subtree an authority pair of
// (this rank, this rank) so it is not trimmed.  Bounds that are not subtree
// roots have their STATE_AUTH flag cleared.  Client sessions from the
// event's client map are reopened only when the session map version is
// still behind cmapv.
3063 void EImportStart::replay(MDSRank
*mds
)
3065 dout(10) << "EImportStart.replay " << base
<< " bounds " << bounds
<< dendl
;
3066 //metablob.print(*_dout);
3067 auto&& segment
= get_segment();
3068 metablob
.replay(mds
, segment
);
3070 // put in ambiguous import list
3071 mds
->mdcache
->add_ambiguous_import(base
, bounds
);
3073 // set auth partially to us so we don't trim it
3074 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
3077 set
<CDir
*> realbounds
;
3078 for (vector
<dirfrag_t
>::iterator p
= bounds
.begin();
3081 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
3083 if (!bd
->is_subtree_root())
3084 bd
->state_clear(CDir::STATE_AUTH
);
3085 realbounds
.insert(bd
);
3088 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
,
3089 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
3091 // open client sessions?
3092 if (mds
->sessionmap
.get_version() >= cmapv
) {
3093 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
3094 << " >= " << cmapv
<< ", noop" << dendl
;
3096 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
3097 << " < " << cmapv
<< dendl
;
3098 map
<client_t
,entity_inst_t
> cm
;
3099 map
<client_t
,client_metadata_t
> cmm
;
3100 auto blp
= client_map
.cbegin();
3105 mds
->sessionmap
.replay_open_sessions(cmapv
, cm
, cmm
);
// Serialize this import-start event into bl (ENCODE_START(4, 3, bl)); the
// metablob (with the caller's feature mask) and the client map are among
// the fields written.
3110 void EImportStart::encode(bufferlist
&bl
, uint64_t features
) const {
3111 ENCODE_START(4, 3, bl
);
3114 encode(metablob
, bl
, features
);
3117 encode(client_map
, bl
);
// Deserialize an import-start event from bl; the metablob and client map
// are among the fields read.
// NOTE(review): the decoder is opened with version 3 while encode() opens
// with ENCODE_START(4, 3, bl) -- confirm whether the first argument here
// should track the encoder's version.
3122 void EImportStart::decode(bufferlist::const_iterator
&bl
) {
3123 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
3127 decode(metablob
, bl
);
3130 decode(client_map
, bl
);
// Dump the import-start event: the base dirfrag followed by a
// "boundary dirfrags" array with one "frag" entry per bound.
3136 void EImportStart::dump(Formatter
*f
) const
3138 f
->dump_stream("base dirfrag") << base
;
3139 f
->open_array_section("boundary dirfrags");
3140 for (vector
<dirfrag_t
>::const_iterator iter
= bounds
.begin();
3141 iter
!= bounds
.end(); ++iter
) {
3142 f
->dump_stream("frag") << *iter
;
3147 void EImportStart::generate_test_instances(std::list
<EImportStart
*>& ls
)
3149 ls
.push_back(new EImportStart
);
3152 // -----------------------
// Replay the end of an import.  When `base` is a known ambiguous import the
// outcome is applied: either the import is finished, or the subtree's
// authority is reset to CDIR_AUTH_UNDEF using the recorded bounds, the
// ambiguous import is cancelled and the non-auth subtree is trimmed.
// When `base` is not marked ambiguous (expected only for old journals per
// the inline comment) the failure is reported to the cluster log; the
// trailing ceph_abort() is expected to be unreachable per its own comment.
3155 void EImportFinish::replay(MDSRank
*mds
)
3157 if (mds
->mdcache
->have_ambiguous_import(base
)) {
3158 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
<< dendl
;
3160 mds
->mdcache
->finish_ambiguous_import(base
);
3162 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
3164 vector
<dirfrag_t
> bounds
;
3165 mds
->mdcache
->get_ambiguous_import_bounds(base
, bounds
);
3166 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, bounds
, CDIR_AUTH_UNDEF
);
3167 mds
->mdcache
->cancel_ambiguous_import(dir
);
3168 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
3171 // this shouldn't happen unless this is an old journal
3172 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
3173 << " on subtree not marked as ambiguous"
3175 mds
->clog
->error() << "failure replaying journal (EImportFinish)";
3177 ceph_abort(); // Should be unreachable because damaged() calls respawn()
// Serialize this import-finish event into bl (ENCODE_START(3, 3, bl)); the
// success flag is among the fields written.
3181 void EImportFinish::encode(bufferlist
& bl
, uint64_t features
) const
3183 ENCODE_START(3, 3, bl
);
3186 encode(success
, bl
);
// Deserialize an import-finish event from bl
// (DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl)); the success flag is among
// the fields read.
3190 void EImportFinish::decode(bufferlist::const_iterator
&bl
)
3192 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
3196 decode(success
, bl
);
3200 void EImportFinish::dump(Formatter
*f
) const
3202 f
->dump_stream("base dirfrag") << base
;
3203 f
->dump_string("success", success
? "true" : "false");
3205 void EImportFinish::generate_test_instances(std::list
<EImportFinish
*>& ls
)
3207 ls
.push_back(new EImportFinish
);
3208 ls
.push_back(new EImportFinish
);
3209 ls
.back()->success
= true;
3213 // ------------------------
// Serialize this journal-reset event into bl (ENCODE_START(2, 2, bl)).
3216 void EResetJournal::encode(bufferlist
& bl
, uint64_t features
) const
3218 ENCODE_START(2, 2, bl
);
// Deserialize a journal-reset event from bl
// (DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)).
3223 void EResetJournal::decode(bufferlist::const_iterator
&bl
)
3225 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
3230 void EResetJournal::dump(Formatter
*f
) const
3232 f
->dump_stream("timestamp") << stamp
;
3235 void EResetJournal::generate_test_instances(std::list
<EResetJournal
*>& ls
)
3237 ls
.push_back(new EResetJournal());
3240 void EResetJournal::replay(MDSRank
*mds
)
3242 dout(1) << "EResetJournal" << dendl
;
3244 mds
->sessionmap
.wipe();
3245 mds
->inotable
->replay_reset();
3247 if (mds
->mdsmap
->get_root() == mds
->get_nodeid()) {
3248 CDir
*rootdir
= mds
->mdcache
->get_root()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3249 mds
->mdcache
->adjust_subtree_auth(rootdir
, mds
->get_nodeid());
3252 CDir
*mydir
= mds
->mdcache
->get_myin()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3253 mds
->mdcache
->adjust_subtree_auth(mydir
, mds
->get_nodeid());
3255 mds
->mdcache
->recalc_auth_bits(true);
3257 mds
->mdcache
->show_subtrees();
// Serialize this no-op event into bl (ENCODE_START(2, 2, bl)): pad_size is
// written first, followed by a loop that runs pad_size times using the
// filler byte 0xff.
3261 void ENoOp::encode(bufferlist
&bl
, uint64_t features
) const
3263 ENCODE_START(2, 2, bl
);
3264 encode(pad_size
, bl
);
3265 uint8_t const pad
= 0xff;
3266 for (unsigned int i
= 0; i
< pad_size
; ++i
) {
// Deserialize a no-op event from bl (DECODE_START(2, bl)).  After reading
// pad_size, the bytes remaining in the iterator must equal it exactly;
// otherwise buffer::end_of_buffer is thrown so that journal tools can
// recognise the entry as malformed.
3273 void ENoOp::decode(bufferlist::const_iterator
&bl
)
3275 DECODE_START(2, bl
);
3276 decode(pad_size
, bl
);
3277 if (bl
.get_remaining() != pad_size
) {
3278 // This is spiritually an assertion, but expressing in a way that will let
3279 // journal debug tools catch it and recognise a malformed entry.
3280 throw buffer::end_of_buffer();
3288 void ENoOp::replay(MDSRank
*mds
)
3290 dout(4) << "ENoOp::replay, " << pad_size
<< " bytes skipped in journal" << dendl
;
3294 * If re-formatting an old journal that used absolute log position
3295 * references as segment sequence numbers, use this function to update
3299 * MDSRank instance, just used for logging
3301 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3304 * True if the event was modified.
3306 bool EMetaBlob::rewrite_truncate_finish(MDSRank
const *mds
,
3307 std::map
<LogSegment::seq_t
, LogSegment::seq_t
> const &old_to_new
)
3309 bool modified
= false;
3310 map
<inodeno_t
, LogSegment::seq_t
> new_trunc_finish
;
3311 for (const auto& p
: truncate_finish
) {
3312 auto q
= old_to_new
.find(p
.second
);
3313 if (q
!= old_to_new
.end()) {
3314 dout(20) << __func__
<< " applying segment seq mapping "
3315 << p
.second
<< " -> " << q
->second
<< dendl
;
3316 new_trunc_finish
.emplace(p
.first
, q
->second
);
3319 dout(20) << __func__
<< " no segment seq mapping found for "
3320 << p
.second
<< dendl
;
3321 new_trunc_finish
.insert(p
);
3324 truncate_finish
.swap(new_trunc_finish
);