1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
25 #include "events/EUpdate.h"
26 #include "events/ESlaveUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
29 #include "events/EPurged.h"
31 #include "events/EExport.h"
32 #include "events/EImportStart.h"
33 #include "events/EImportFinish.h"
34 #include "events/EFragment.h"
36 #include "events/ETableClient.h"
37 #include "events/ETableServer.h"
39 #include "include/stringify.h"
41 #include "LogSegment.h"
51 #include "MDSTableClient.h"
52 #include "MDSTableServer.h"
56 #define dout_context g_ceph_context
57 #define dout_subsys ceph_subsys_mds
59 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
62 // -----------------------
// Try to expire this log segment. For each piece of state the segment still
// tracks (dirty dirfrags/dentries/inodes, uncommitted masters/slaves/fragments,
// scatterlocks, open files, backtraces, tables, sessionmap, truncates, purges)
// a sub-context is taken from gather_bld via new_sub() and handed to the
// component that will complete it.
//   mds        - rank whose mdcache, locker, mdlog, inotable, sessionmap and
//                table clients/servers are consulted.
//   gather_bld - gather builder collecting the outstanding sub-contexts; it is
//                queried with has_subs() near the end of this function.
//   op_prio    - priority passed through to CDir::commit() and
//                CInode::store_backtrace().
65 void LogSegment::try_to_expire(MDSRank
*mds
, MDSGatherBuilder
&gather_bld
, int op_prio
)
69 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire" << dendl
;
// Asserts that the mds_kill_journal_expire_at config value is not 1
// (matching checks against 2..5 appear further down).
71 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 1);
// Every new dirfrag on this segment must be auth.
74 for (elist
<CDir
*>::iterator p
= new_dirfrags
.begin(); !p
.end(); ++p
) {
75 dout(20) << " new_dirfrag " << **p
<< dendl
;
76 ceph_assert((*p
)->is_auth());
// Every dirty dirfrag on this segment must be auth.
79 for (elist
<CDir
*>::iterator p
= dirty_dirfrags
.begin(); !p
.end(); ++p
) {
80 dout(20) << " dirty_dirfrag " << **p
<< dendl
;
81 ceph_assert((*p
)->is_auth());
// Dirty dentries must be auth; the dir containing each one is added to commit.
84 for (elist
<CDentry
*>::iterator p
= dirty_dentries
.begin(); !p
.end(); ++p
) {
85 dout(20) << " dirty_dentry " << **p
<< dendl
;
86 ceph_assert((*p
)->is_auth());
87 commit
.insert((*p
)->get_dir());
// Dirty inodes must be auth. Base inodes are written with store(); the dir of
// the parent dentry is added to commit.
// NOTE(review): presumably the commit.insert() is the non-base branch; the
// branch keyword separating the two statements is not visible here - confirm.
89 for (elist
<CInode
*>::iterator p
= dirty_inodes
.begin(); !p
.end(); ++p
) {
90 dout(20) << " dirty_inode " << **p
<< dendl
;
91 ceph_assert((*p
)->is_auth());
92 if ((*p
)->is_base()) {
93 (*p
)->store(gather_bld
.new_sub());
95 commit
.insert((*p
)->get_parent_dn()->get_dir());
// Walk the collected dirs: one that can be auth-pinned is committed with
// op_prio; the "waiting for unfreeze" path registers a WAIT_UNFREEZE waiter.
98 if (!commit
.empty()) {
99 for (set
<CDir
*>::iterator p
= commit
.begin();
103 ceph_assert(dir
->is_auth());
104 if (dir
->can_auth_pin()) {
105 dout(15) << "try_to_expire committing " << *dir
<< dendl
;
106 dir
->commit(0, gather_bld
.new_sub(), false, op_prio
);
108 dout(15) << "try_to_expire waiting for unfreeze on " << *dir
<< dendl
;
109 dir
->add_waiter(CDir::WAIT_UNFREEZE
, gather_bld
.new_sub());
114 // master ops with possibly uncommitted slaves
115 for (set
<metareqid_t
>::iterator p
= uncommitted_masters
.begin();
116 p
!= uncommitted_masters
.end();
118 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p
<< dendl
;
119 mds
->mdcache
->wait_for_uncommitted_master(*p
, gather_bld
.new_sub());
122 // slave ops that haven't been committed
123 for (set
<metareqid_t
>::iterator p
= uncommitted_slaves
.begin();
124 p
!= uncommitted_slaves
.end();
126 dout(10) << "try_to_expire waiting for master to ack OP_FINISH on " << *p
<< dendl
;
127 mds
->mdcache
->wait_for_uncommitted_slave(*p
, gather_bld
.new_sub());
130 // uncommitted fragments
131 for (set
<dirfrag_t
>::iterator p
= uncommitted_fragments
.begin();
132 p
!= uncommitted_fragments
.end();
134 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p
<< dendl
;
135 mds
->mdcache
->wait_for_uncommitted_fragment(*p
, gather_bld
.new_sub());
138 // nudge scatterlocks
// Three lists are walked in turn, nudging filelock, dirfragtreelock and
// nestlock respectively through mds->locker->scatter_nudge().
139 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dir
.begin(); !p
.end(); ++p
) {
141 dout(10) << "try_to_expire waiting for dirlock flush on " << *in
<< dendl
;
142 mds
->locker
->scatter_nudge(&in
->filelock
, gather_bld
.new_sub());
144 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dirfragtree
.begin(); !p
.end(); ++p
) {
146 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in
<< dendl
;
147 mds
->locker
->scatter_nudge(&in
->dirfragtreelock
, gather_bld
.new_sub());
149 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_nest
.begin(); !p
.end(); ++p
) {
151 dout(10) << "try_to_expire waiting for nest flush on " << *in
<< dendl
;
152 mds
->locker
->scatter_nudge(&in
->nestlock
, gather_bld
.new_sub());
155 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 2);
157 // open files and snap inodes
158 if (!open_files
.empty()) {
159 ceph_assert(!mds
->mdlog
->is_capped()); // hmm FIXME
// Requeued inodes are pushed onto the mdlog's current segment, which is
// asserted to be a different segment from this one.
161 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
162 ceph_assert(ls
!= this);
163 elist
<CInode
*>::iterator p
= open_files
.begin(member_offset(CInode
, item_open_file
));
167 if (in
->last
!= CEPH_NOSNAP
&& in
->is_auth() && !in
->client_snap_caps
.empty()) {
168 // journal snap inodes that need flush. This simplifies the mds failover handling
169 dout(20) << "try_to_expire requeueing snap needflush inode " << *in
<< dendl
;
171 le
= new EOpen(mds
->mdlog
);
172 mds
->mdlog
->start_entry(le
);
174 le
->add_clean_inode(in
);
175 ls
->open_files
.push_back(&in
->item_open_file
);
177 // open files are tracked by open file table, no need to journal them again
178 in
->item_open_file
.remove_myself();
// Submit the EOpen entry and wait for the log to become safe.
182 mds
->mdlog
->submit_entry(le
);
183 mds
->mdlog
->wait_for_safe(gather_bld
.new_sub());
184 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl
;
188 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 3);
190 // backtraces to be stored/updated
191 for (elist
<CInode
*>::iterator p
= dirty_parent_inodes
.begin(); !p
.end(); ++p
) {
193 ceph_assert(in
->is_auth());
194 if (in
->can_auth_pin()) {
195 dout(15) << "try_to_expire waiting for storing backtrace on " << *in
<< dendl
;
196 in
->store_backtrace(gather_bld
.new_sub(), op_prio
);
198 dout(15) << "try_to_expire waiting for unfreeze on " << *in
<< dendl
;
199 in
->add_waiter(CInode::WAIT_UNFREEZE
, gather_bld
.new_sub());
203 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 4);
// Save the inotable when this segment needs a newer version than the
// committed one.
206 if (inotablev
> mds
->inotable
->get_committed_version()) {
207 dout(10) << "try_to_expire saving inotable table, need " << inotablev
208 << ", committed is " << mds
->inotable
->get_committed_version()
209 << " (" << mds
->inotable
->get_committing_version() << ")"
211 mds
->inotable
->save(gather_bld
.new_sub(), inotablev
);
// Same check for the sessionmap.
215 if (sessionmapv
> mds
->sessionmap
.get_committed()) {
216 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
217 << ", committed is " << mds
->sessionmap
.get_committed()
218 << " (" << mds
->sessionmap
.get_committing() << ")"
220 mds
->sessionmap
.save(gather_bld
.new_sub(), sessionmapv
);
223 // updates to sessions for completed_requests
224 mds
->sessionmap
.save_if_dirty(touched_sessions
, &gather_bld
);
225 touched_sessions
.clear();
227 // pending commit atids
// For each table (p->first) wait for the ack of every pending tid; none of
// them may already be committed.
228 for (map
<int, ceph::unordered_set
<version_t
> >::iterator p
= pending_commit_tids
.begin();
229 p
!= pending_commit_tids
.end();
231 MDSTableClient
*client
= mds
->get_table_client(p
->first
);
233 for (ceph::unordered_set
<version_t
>::iterator q
= p
->second
.begin();
234 q
!= p
->second
.end();
236 dout(10) << "try_to_expire " << get_mdstable_name(p
->first
) << " transaction " << *q
237 << " pending commit (not yet acked), waiting" << dendl
;
238 ceph_assert(!client
->has_committed(*q
));
239 client
->wait_for_ack(*q
, gather_bld
.new_sub());
// Table servers: save any server whose committed version is older than the
// version recorded for it in tablev.
244 for (map
<int, version_t
>::iterator p
= tablev
.begin();
247 MDSTableServer
*server
= mds
->get_table_server(p
->first
);
249 if (p
->second
> server
->get_committed_version()) {
250 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p
->first
)
251 << " to save, need " << p
->second
<< dendl
;
252 server
->save(gather_bld
.new_sub());
// Wait on WAIT_TRUNC for every inode that is still truncating.
257 for (set
<CInode
*>::iterator p
= truncating_inodes
.begin();
258 p
!= truncating_inodes
.end();
260 dout(10) << "try_to_expire waiting for truncate of " << **p
<< dendl
;
261 (*p
)->add_waiter(CInode::WAIT_TRUNC
, gather_bld
.new_sub());
// The purged callback is only registered when purge_inodes is non-empty.
264 dout(10) << "try_to_expire waiting for purge of " << purge_inodes
<< dendl
;
265 if (purge_inodes
.size())
266 set_purged_cb(gather_bld
.new_sub());
// Report the outcome: "waiting" is logged when any sub-context was registered.
268 if (gather_bld
.has_subs()) {
269 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire waiting" << dendl
;
272 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 5);
273 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire success" << dendl
;
277 // -----------------------
// Add the ancestor dentries that give `dir` its context to this blob.
// Starting at `dir`, the code steps upward (dir = parent->get_dir()) and
// collects the projected parent dentries into two lists: `parents` for the
// ones that are definitely needed and `maybe` for tentative ones. At the end
// whatever is left in `maybe` is spliced in front of `parents`, and every
// collected dentry is added with add_dentry(dentry, false).
//   dir  - dirfrag to start from.
//   mode - compared against TO_AUTH_SUBTREE_ROOT to enable the subtree-root
//          checks below.
280 void EMetaBlob::add_dir_context(CDir
*dir
, int mode
)
282 MDSRank
*mds
= dir
->cache
->mds
;
284 list
<CDentry
*> parents
;
286 // it may be okay not to include the maybe items, if
287 // - we journaled the maybe child inode in this segment
288 // - that subtree turns out to be unambiguously auth
289 list
<CDentry
*> maybe
;
290 bool maybenot
= false;
293 // already have this dir? (we must always add in order)
294 if (lump_map
.count(dir
->dirfrag())) {
295 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") have lump " << dir
->dirfrag() << dendl
;
299 // stop at root/stray
300 CInode
*diri
= dir
->get_inode();
301 CDentry
*parent
= diri
->get_projected_parent_dn();
303 if (mode
== TO_AUTH_SUBTREE_ROOT
) {
305 if (dir
->is_subtree_root()) {
306 // match logic in MDCache::create_subtree_map()
307 if (dir
->get_dir_auth().first
== mds
->get_nodeid()) {
// parent_auth is CDIR_AUTH_UNDEF when there is no parent dentry.
308 mds_authority_t parent_auth
= parent
? parent
->authority() : CDIR_AUTH_UNDEF
;
309 if (parent_auth
.first
== dir
->get_dir_auth().first
) {
// Logged at level 0 as "unexpected subtree" when none of the ambiguous/
// export-bound/aux-subtree/ambiguous-auth states apply.
310 if (parent_auth
.second
== CDIR_AUTH_UNKNOWN
&&
311 !dir
->is_ambiguous_dir_auth() &&
312 !dir
->state_test(CDir::STATE_EXPORTBOUND
) &&
313 !dir
->state_test(CDir::STATE_AUXSUBTREE
) &&
314 !diri
->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
315 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir
<< dendl
;
318 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") ambiguous or transient subtree " << dendl
;
320 // it's an auth subtree, we don't need maybe (if any), and we're done.
321 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached unambig auth subtree, don't need " << maybe
322 << " at " << *dir
<< dendl
;
327 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached ambig or !auth subtree, need " << maybe
328 << " at " << *dir
<< dendl
;
329 // we need the maybe list after all!
330 parents
.splice(parents
.begin(), maybe
);
335 // was the inode journaled in this blob?
336 if (event_seq
&& diri
->last_journaled
== event_seq
) {
337 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri this blob " << *diri
<< dendl
;
341 // have we journaled this inode since the last subtree map?
342 if (!maybenot
&& last_subtree_map
&& diri
->last_journaled
>= last_subtree_map
) {
343 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri in this segment ("
344 << diri
->last_journaled
<< " >= " << last_subtree_map
<< "), setting maybenot flag "
// The parent dentry goes to the front of either `maybe` or `parents`.
354 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") maybe " << *parent
<< dendl
;
355 maybe
.push_front(parent
);
357 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") definitely " << *parent
<< dendl
;
358 parents
.push_front(parent
);
// Step up one level: continue with the dir that holds the parent dentry.
361 dir
= parent
->get_dir();
// Anything still in `maybe` is prepended to `parents`.
364 parents
.splice(parents
.begin(), maybe
);
366 dout(20) << "EMetaBlob::add_dir_context final: " << parents
<< dendl
;
// Each collected dentry must have a primary projected linkage.
367 for (const auto& dentry
: parents
) {
368 ceph_assert(dentry
->get_projected_linkage()->is_primary());
369 add_dentry(dentry
, false);
// Record this blob's table versions on log segment `ls`: ls->inotablev and
// ls->sessionmapv are assigned from the blob's inotablev and sessionmapv.
// NOTE(review): any conditions guarding the two assignments are not visible
// in this listing - confirm against the full source.
373 void EMetaBlob::update_segment(LogSegment
*ls
)
375 // dirty inode mtimes
376 // -> handled directly by Server.cc, replay()
378 // alloc table update?
380 ls
->inotablev
= inotablev
;
382 ls
->sessionmapv
= sessionmapv
;
385 // -> handled directly by Server.cc
388 // note the newest request per client
389 //if (!client_reqs.empty())
390 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
393 // EMetaBlob::fullbit
// Encode this fullbit into `bl` (ENCODE_START with version 8, compat 5).
// `features` is passed to the encoders of inode and old_inodes. The
// dirfragtree is encoded only for directory inodes, and oldest_snap is among
// the encoded fields.
395 void EMetaBlob::fullbit::encode(bufferlist
& bl
, uint64_t features
) const {
396 ENCODE_START(8, 5, bl
);
401 encode(inode
, bl
, features
);
// Type-specific payload for symlinks and directories.
403 if (inode
.is_symlink())
405 if (inode
.is_dir()) {
406 encode(dirfragtree
, bl
);
// old_inodes: the empty and non-empty cases are handled separately.
410 if (old_inodes
.empty()) {
414 encode(old_inodes
, bl
, features
);
418 encode(oldest_snap
, bl
);
// Decode a fullbit from `bl` (DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl)).
// xattrs are read with decode_noshare(); the dirfragtree is read only for
// directory inodes; struct_v 2 and 3 carry an optional directory layout;
// `state` is derived from the `dirty` flag; old_inodes is read only when the
// old_inodes_present flag is set; oldest_snap is either decoded or set to
// CEPH_NOSNAP.
422 void EMetaBlob::fullbit::decode(bufferlist::const_iterator
&bl
) {
423 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
429 decode_noshare(xattrs
, bl
);
430 if (inode
.is_symlink())
432 if (inode
.is_dir()) {
433 decode(dirfragtree
, bl
);
// Versions 2 and 3 stored a flag followed by an optional directory layout.
435 if ((struct_v
== 2) || (struct_v
== 3)) {
436 bool dir_layout_exists
;
437 decode(dir_layout_exists
, bl
);
438 if (dir_layout_exists
) {
440 decode(dir_struct_v
, bl
); // default_file_layout version
441 decode(inode
.layout
, bl
); // and actual layout, that we care about
// A set dirty flag becomes STATE_DIRTY; otherwise state is 0.
450 state
= dirty
? EMetaBlob::fullbit::STATE_DIRTY
: 0;
454 bool old_inodes_present
;
455 decode(old_inodes_present
, bl
);
456 if (old_inodes_present
) {
457 decode(old_inodes
, bl
);
460 if (!inode
.is_dir()) {
// oldest_snap comes from the buffer on one path and defaults to CEPH_NOSNAP
// on the other.
465 decode(oldest_snap
, bl
);
467 oldest_snap
= CEPH_NOSNAP
;
// Dump this fullbit to Formatter `f`: dentry name, snapid range, dentry
// version, an "inode" section, the xattrs, the symlink target (symlinks only),
// frag tree / snap blob presence / layout marker (directories only), the state
// string and, when present, the old inodes.
472 void EMetaBlob::fullbit::dump(Formatter
*f
) const
474 f
->dump_string("dentry", dn
);
475 f
->dump_stream("snapid.first") << dnfirst
;
476 f
->dump_stream("snapid.last") << dnlast
;
477 f
->dump_int("dentry version", dnv
);
478 f
->open_object_section("inode");
480 f
->close_section(); // inode
481 f
->open_object_section("xattrs");
// Each xattr value is copied into a std::string of exactly length() bytes
// before being dumped under the xattr's name.
482 for (const auto &p
: xattrs
) {
483 std::string
s(p
.second
.c_str(), p
.second
.length());
484 f
->dump_string(p
.first
.c_str(), s
);
486 f
->close_section(); // xattrs
487 if (inode
.is_symlink()) {
488 f
->dump_string("symlink", symlink
);
490 if (inode
.is_dir()) {
491 f
->dump_stream("frag tree") << dirfragtree
;
// Only whether a snap blob exists is reported, not its contents.
492 f
->dump_string("has_snapbl", snapbl
.length() ? "true" : "false");
493 if (inode
.has_layout()) {
494 f
->open_object_section("file layout policy");
496 f
->dump_string("layout", "the layout exists");
497 f
->close_section(); // file layout policy
500 f
->dump_string("state", state_string());
501 if (!old_inodes
.empty()) {
502 f
->open_array_section("old inodes");
503 for (const auto &p
: old_inodes
) {
504 f
->open_object_section("inode");
505 f
->dump_int("snapid", p
.first
);
507 f
->close_section(); // inode
509 f
->close_section(); // old inodes
// Append one heap-allocated sample fullbit (dentry "/testdn", built from a
// default inode, empty xattrs and an empty snap blob) to `ls`.
513 void EMetaBlob::fullbit::generate_test_instances(std::list
<EMetaBlob::fullbit
*>& ls
)
515 CInode::mempool_inode inode
;
517 CInode::mempool_xattr_map empty_xattrs
;
518 bufferlist empty_snapbl
;
519 fullbit
*sample
= new fullbit("/testdn", 0, 0, 0,
520 inode
, fragtree
, empty_xattrs
, "", 0, empty_snapbl
,
522 ls
.push_back(sample
);
// Apply the state carried by this fullbit to the in-memory inode `in`.
// Visible effects: calls in->maybe_export_pin(); for directories replaces a
// differing dirfragtree (and may close empty non-auth dirfrags); for symlinks
// copies the target; copies old_inodes and raises in->first past the newest
// old inode; copies oldest_snap and decodes the snap blob; finally checks the
// layout and reports an invalid one to the cluster log.
//   mds - rank used for mdcache->can_trim_non_auth_dirfrag() and clog.
//   in  - inode to update.
525 void EMetaBlob::fullbit::update_inode(MDSRank
*mds
, CInode
*in
)
529 in
->maybe_export_pin();
530 if (in
->inode
.is_dir()) {
// Only touch the fragtree when the journaled one differs from the cached one.
531 if (!(in
->dirfragtree
== dirfragtree
)) {
532 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in
->dirfragtree
<< " -> "
533 << dirfragtree
<< " on " << *in
<< dendl
;
534 in
->dirfragtree
= dirfragtree
;
535 in
->force_dirfrags();
// When the inode has dirfrags and its authority is CDIR_AUTH_UNDEF, close
// every nested dirfrag that holds nothing and may be trimmed.
536 if (in
->get_num_dirfrags() && in
->authority() == CDIR_AUTH_UNDEF
) {
537 auto&& ls
= in
->get_nested_dirfrags();
538 for (const auto& dir
: ls
) {
539 if (dir
->get_num_any() == 0 &&
540 mds
->mdcache
->can_trim_non_auth_dirfrag(dir
)) {
541 dout(10) << " closing empty non-auth dirfrag " << *dir
<< dendl
;
542 in
->close_dirfrag(dir
->get_frag());
547 } else if (in
->inode
.is_symlink()) {
548 in
->symlink
= symlink
;
550 in
->old_inodes
= old_inodes
;
// in->first is raised to one past the key of the last old_inodes entry when
// it is currently lower.
551 if (!in
->old_inodes
.empty()) {
552 snapid_t min_first
= in
->old_inodes
.rbegin()->first
+ 1;
553 if (min_first
> in
->first
)
554 in
->first
= min_first
;
558 * we can do this before linking hte inode bc the split_at would
559 * be a no-op.. we have no children (namely open snaprealms) to
562 in
->oldest_snap
= oldest_snap
;
563 in
->decode_snap_blob(snapbl
);
566 * In case there was anything malformed in the journal that we are
567 * replaying, do sanity checks on the inodes we're replaying and
568 * go damaged instead of letting any trash into a live cache
571 // Files must have valid layouts with a pool set
572 if (in
->inode
.layout
.pool_id
== -1 || !in
->inode
.layout
.is_valid()) {
573 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
574 << ": " << in
->inode
.layout
<< dendl
;
// The same problem is also reported through the cluster log.
575 std::ostringstream oss
;
576 oss
<< "Invalid layout for inode " << in
->ino() << " in journal";
577 mds
->clog
->error() << oss
.str();
579 ceph_abort(); // Should be unreachable because damaged() calls respawn()
584 // EMetaBlob::remotebit
// Encode this remotebit into `bl`; the encoding is opened with
// ENCODE_START(2, 2, bl) (version 2, compat 2).
586 void EMetaBlob::remotebit::encode(bufferlist
& bl
) const
588 ENCODE_START(2, 2, bl
);
// Decode a remotebit from `bl`; the decoding is opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl).
599 void EMetaBlob::remotebit::decode(bufferlist::const_iterator
&bl
)
601 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
// Dump this remotebit to Formatter `f`: dentry name, snapid range, dentry
// version, target inode number, a textual d_type and the dirty flag.
612 void EMetaBlob::remotebit::dump(Formatter
*f
) const
614 f
->dump_string("dentry", dn
);
615 f
->dump_int("snapid.first", dnfirst
);
616 f
->dump_int("snapid.last", dnlast
);
617 f
->dump_int("dentry version", dnv
);
618 f
->dump_int("inodeno", ino
);
// d_type is converted with DTTOIF() and masked with S_IFMT, then mapped to
// one of the names below; an unrecognised value trips the assert.
619 uint32_t type
= DTTOIF(d_type
) & S_IFMT
; // convert to type entries
623 type_string
= "file"; break;
625 type_string
= "symlink"; break;
627 type_string
= "directory"; break;
629 type_string
= "fifo"; break;
631 type_string
= "chr"; break;
633 type_string
= "blk"; break;
635 type_string
= "sock"; break;
637 assert (0 == "unknown d_type!");
639 f
->dump_string("d_type", type_string
);
640 f
->dump_string("dirty", dirty
? "true" : "false");
// Append one heap-allocated sample remotebit (dentry "/test/dn", d_type
// derived from S_IFREG, dirty flag false) to `ls`.
643 void EMetaBlob::remotebit::
644 generate_test_instances(std::list
<EMetaBlob::remotebit
*>& ls
)
646 remotebit
*remote
= new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG
), false);
647 ls
.push_back(remote
);
650 // EMetaBlob::nullbit
// Encode this nullbit into `bl`; the encoding is opened with
// ENCODE_START(2, 2, bl) (version 2, compat 2).
652 void EMetaBlob::nullbit::encode(bufferlist
& bl
) const
654 ENCODE_START(2, 2, bl
);
// Decode a nullbit from `bl`; the decoding is opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl).
663 void EMetaBlob::nullbit::decode(bufferlist::const_iterator
&bl
)
665 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
// Dump this nullbit to Formatter `f`: dentry name, snapid range, dentry
// version and the dirty flag (as the string "true" or "false").
674 void EMetaBlob::nullbit::dump(Formatter
*f
) const
676 f
->dump_string("dentry", dn
);
677 f
->dump_int("snapid.first", dnfirst
);
678 f
->dump_int("snapid.last", dnlast
);
679 f
->dump_int("dentry version", dnv
);
680 f
->dump_string("dirty", dirty
? "true" : "false");
// Append two heap-allocated sample nullbits to `ls`: "/test/dentry" with the
// last constructor argument false and "/test/dirty" with it true.
683 void EMetaBlob::nullbit::generate_test_instances(std::list
<nullbit
*>& ls
)
685 nullbit
*sample
= new nullbit("/test/dentry", 0, 10, 15, false);
686 nullbit
*sample2
= new nullbit("/test/dirty", 10, 20, 25, true);
687 ls
.push_back(sample
);
688 ls
.push_back(sample2
);
691 // EMetaBlob::dirlump
// Encode this dirlump into `bl` (ENCODE_START(2, 2, bl)). `features` is
// forwarded to _encode_bits().
693 void EMetaBlob::dirlump::encode(bufferlist
& bl
, uint64_t features
) const
695 ENCODE_START(2, 2, bl
);
701 _encode_bits(features
);
// Decode a dirlump from `bl` (DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)).
// dn_decoded is reset to false so the dentry bits are decoded on demand.
706 void EMetaBlob::dirlump::decode(bufferlist::const_iterator
&bl
)
708 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
)
715 dn_decoded
= false; // don't decode bits unless we need them.
// Dump this dirlump to Formatter `f`: an "fnode" section, the state string,
// the nfull/nremote/nnull counters, and one array section each for the full,
// remote and null bits.
719 void EMetaBlob::dirlump::dump(Formatter
*f
) const
// dump() is const, so a non-const pointer to this object is obtained here.
722 dirlump
*me
= const_cast<dirlump
*>(this);
725 f
->open_object_section("fnode");
727 f
->close_section(); // fnode
728 f
->dump_string("state", state_string());
729 f
->dump_int("nfull", nfull
);
730 f
->dump_int("nremote", nremote
);
731 f
->dump_int("nnull", nnull
);
733 f
->open_array_section("full bits");
734 for (const auto& iter
: dfull
) {
735 f
->open_object_section("fullbit");
737 f
->close_section(); // fullbit
739 f
->close_section(); // full bits
740 f
->open_array_section("remote bits");
741 for (const auto& iter
: dremote
) {
742 f
->open_object_section("remotebit");
744 f
->close_section(); // remotebit
746 f
->close_section(); // remote bits
747 f
->open_array_section("null bits");
748 for (const auto& iter
: dnull
) {
749 f
->open_object_section("null bit");
751 f
->close_section(); // null bit
753 f
->close_section(); // null bits
// Append one heap-allocated, default-constructed dirlump to `ls`.
756 void EMetaBlob::dirlump::generate_test_instances(std::list
<dirlump
*>& ls
)
758 ls
.push_back(new dirlump());
// Encode this metablob into `bl` (ENCODE_START with version 8, compat 5).
// The fields are written in the fixed order listed below; `features` is
// passed to the encoders of lump_map and roots.
764 void EMetaBlob::encode(bufferlist
& bl
, uint64_t features
) const
766 ENCODE_START(8, 5, bl
);
767 encode(lump_order
, bl
);
768 encode(lump_map
, bl
, features
);
769 encode(roots
, bl
, features
);
770 encode(table_tids
, bl
);
771 encode(opened_ino
, bl
);
772 encode(allocated_ino
, bl
);
773 encode(used_preallocated_ino
, bl
);
774 encode(preallocated_inos
, bl
);
775 encode(client_name
, bl
);
776 encode(inotablev
, bl
);
777 encode(sessionmapv
, bl
);
778 encode(truncate_start
, bl
);
779 encode(truncate_finish
, bl
);
780 encode(destroyed_inodes
, bl
);
781 encode(client_reqs
, bl
);
782 encode(renamed_dirino
, bl
);
783 encode(renamed_dir_frags
, bl
);
785 // make MDSRank use v6 format happy
791 encode(client_flushes
, bl
);
// Decode a metablob from `bl` (DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl)).
// Fields are read in the same order encode() writes them; two spots handle
// alternative layouts (a root carried in `rootbl`, and a client request list
// `r` whose front element is converted to a client_reqs entry).
794 void EMetaBlob::decode(bufferlist::const_iterator
&bl
)
796 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl
);
797 decode(lump_order
, bl
);
798 decode(lump_map
, bl
);
// A non-empty rootbl yields one root, constructed from an iterator over it.
804 if (rootbl
.length()) {
805 auto p
= rootbl
.cbegin();
806 roots
.emplace_back(p
);
809 decode(table_tids
, bl
);
810 decode(opened_ino
, bl
);
811 decode(allocated_ino
, bl
);
812 decode(used_preallocated_ino
, bl
);
813 decode(preallocated_inos
, bl
);
814 decode(client_name
, bl
);
815 decode(inotablev
, bl
);
816 decode(sessionmapv
, bl
);
817 decode(truncate_start
, bl
);
818 decode(truncate_finish
, bl
);
819 decode(destroyed_inodes
, bl
);
821 decode(client_reqs
, bl
);
// The front of `r` is stored as a (metareqid_t, 0) pair.
826 client_reqs
.push_back(pair
<metareqid_t
,uint64_t>(r
.front(), 0));
831 decode(renamed_dirino
, bl
);
832 decode(renamed_dir_frags
, bl
);
842 decode(client_flushes
, bl
);
849 * Get all inodes touched by this metablob. Includes the 'bits' within
850 * dirlumps, and the inodes of the dirs themselves.
// Collect inode numbers referenced by this metablob into `inodes`: the inode
// of every dirlump's dirfrag, the inode of every fullbit and the target inode
// of every remotebit. Existing contents of `inodes` are kept (insert only).
852 void EMetaBlob::get_inodes(
853 std::set
<inodeno_t
> &inodes
) const
855 // For all dirlumps in this metablob
856 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
857 // Record inode of dirlump
858 inodeno_t
const dir_ino
= i
->first
.ino
;
859 inodes
.insert(dir_ino
);
861 // Decode dirlump bits
862 dirlump
const &dl
= i
->second
;
865 // Record inodes of fullbits
866 for (const auto& iter
: dl
.get_dfull()) {
867 inodes
.insert(iter
.inode
.ino
);
870 // Record inodes of remotebits
871 for (const auto& iter
: dl
.get_dremote()) {
872 inodes
.insert(iter
.ino
);
879 * Get a map of dirfrag to set of dentries in that dirfrag which are
880 * touched in this operation.
// Fill `dentries` with, for every dirfrag in lump_map, the names of the
// dentries found in that dirlump's full, remote and null bits. Entries are
// inserted into dentries[df], so existing contents are kept.
882 void EMetaBlob::get_dentries(std::map
<dirfrag_t
, std::set
<std::string
> > &dentries
) const
884 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
885 dirlump
const &dl
= i
->second
;
886 dirfrag_t
const &df
= i
->first
;
891 // For all bits, store dentry
892 for (const auto& iter
: dl
.get_dfull()) {
893 dentries
[df
].insert(iter
.dn
);
895 for (const auto& iter
: dl
.get_dremote()) {
896 dentries
[df
].insert(iter
.dn
);
898 for (const auto& iter
: dl
.get_dnull()) {
899 dentries
[df
].insert(iter
.dn
);
907 * Calculate all paths that we can infer are touched by this metablob. Only uses
908 * information local to this metablob so it may only be the path within the
// Append to `paths` the paths that can be reconstructed from this metablob
// alone. A first pass over lump_map records, per directory inode, the dentry
// names seen in it (`children`) and, per fullbit inode, where it lives
// (`ino_locations`). A second pass gathers leaf locations, and each leaf is
// then turned into a path by prepending ancestor dentry names for as long as
// the parent inode is found in ino_locations.
911 void EMetaBlob::get_paths(
912 std::vector
<std::string
> &paths
) const
914 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
915 typedef std::pair
<inodeno_t
, std::string
> Location
;
917 // Whenever we see a dentry within a dirlump, we remember it as a child of
918 // the dirlump's inode
919 std::map
<inodeno_t
, std::vector
<std::string
> > children
;
921 // Whenever we see a location for an inode, remember it: this allows us to
922 // build a path given an inode
923 std::map
<inodeno_t
, Location
> ino_locations
;
925 // Special case: operations on root inode populate roots but not dirlumps
926 if (lump_map
.empty() && !roots
.empty()) {
927 paths
.push_back("/");
933 // Build a tiny local metadata cache for the path structure in this metablob
934 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
935 inodeno_t
const dir_ino
= i
->first
.ino
;
936 dirlump
const &dl
= i
->second
;
// Only fullbits carry an inode, so only they populate ino_locations.
939 for (const auto& iter
: dl
.get_dfull()) {
940 std::string_view dentry
= iter
.dn
;
941 children
[dir_ino
].emplace_back(dentry
);
942 ino_locations
[iter
.inode
.ino
] = Location(dir_ino
, dentry
);
945 for (const auto& iter
: dl
.get_dremote()) {
946 std::string_view dentry
= iter
.dn
;
947 children
[dir_ino
].emplace_back(dentry
);
950 for (const auto& iter
: dl
.get_dnull()) {
951 std::string_view dentry
= iter
.dn
;
952 children
[dir_ino
].emplace_back(dentry
);
956 std::vector
<Location
> leaf_locations
;
960 // Output paths for all childless nodes in the metablob
961 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
962 inodeno_t
const dir_ino
= i
->first
.ino
;
963 dirlump
const &dl
= i
->second
;
// A fullbit is a leaf only if its inode has no recorded children; remote and
// null bits are always treated as leaves.
966 for (const auto& iter
: dl
.get_dfull()) {
967 std::string_view dentry
= iter
.dn
;
968 if (children
.find(iter
.inode
.ino
) == children
.end()) {
969 leaf_locations
.push_back(Location(dir_ino
, dentry
));
973 for (const auto& iter
: dl
.get_dremote()) {
974 std::string_view dentry
= iter
.dn
;
975 leaf_locations
.push_back(Location(dir_ino
, dentry
));
978 for (const auto& iter
: dl
.get_dnull()) {
979 std::string_view dentry
= iter
.dn
;
980 leaf_locations
.push_back(Location(dir_ino
, dentry
));
984 // For all the leaf locations identified, generate paths
985 for (std::vector
<Location
>::iterator i
= leaf_locations
.begin(); i
!= leaf_locations
.end(); ++i
) {
986 Location
const &loc
= *i
;
987 std::string path
= loc
.second
;
988 inodeno_t ino
= loc
.first
;
// Climb towards the root for as long as the parent inode has a known location.
989 std::map
<inodeno_t
, Location
>::iterator iter
= ino_locations
.find(ino
);
990 while(iter
!= ino_locations
.end()) {
991 Location
const &loc
= iter
->second
;
// The ancestor's dentry name is prepended either with or without a "/"
// separator. NOTE(review): the condition choosing between the two forms is
// not visible in this listing - confirm against the full source.
993 path
= loc
.second
+ "/" + path
;
995 path
= loc
.second
+ path
;
997 iter
= ino_locations
.find(loc
.first
);
1000 paths
.push_back(path
);
// Write a structured description of this metablob to the Formatter f.
// Sections are emitted in this order: "lumps" (one entry per dirfrag in
// lump_order), "roots", the table-client transactions, the renamed directory
// ino and its fragments, the inotable / SessionMap versions, allocated and
// preallocated inos, the client name, the truncate start/finish records, the
// destroyed inodes and finally the client requests.
1005 void EMetaBlob::dump(Formatter
*f
) const
1007 f
->open_array_section("lumps");
1008 for (const auto& d
: lump_order
) {
1009 f
->open_object_section("lump");
1010 f
->open_object_section("dirfrag");
1011 f
->dump_stream("dirfrag") << d
;
1012 f
->close_section(); // dirfrag
1013 f
->open_object_section("dirlump");
// lookup goes through at() here; this method is const
1014 lump_map
.at(d
).dump(f
);
1015 f
->close_section(); // dirlump
1016 f
->close_section(); // lump
1018 f
->close_section(); // lumps
1020 f
->open_array_section("roots");
1021 for (const auto& iter
: roots
) {
1022 f
->open_object_section("root");
1024 f
->close_section(); // root
1026 f
->close_section(); // roots
// NOTE(review): the section name below is spelled "tranactions"; it is a
// runtime string, so it is left untouched in case consumers rely on it.
1028 f
->open_array_section("tableclient tranactions");
1029 for (const auto& p
: table_tids
) {
1030 f
->open_object_section("transaction");
1031 f
->dump_int("tid", p
.first
);
1032 f
->dump_int("version", p
.second
);
1033 f
->close_section(); // transaction
1035 f
->close_section(); // tableclient transactions
1037 f
->dump_int("renamed directory inodeno", renamed_dirino
);
1039 f
->open_array_section("renamed directory fragments");
1040 for (const auto& p
: renamed_dir_frags
) {
1041 f
->dump_int("frag", p
);
1043 f
->close_section(); // renamed directory fragments
1045 f
->dump_int("inotable version", inotablev
);
1046 f
->dump_int("SessionMap version", sessionmapv
);
1047 f
->dump_int("allocated ino", allocated_ino
);
1049 f
->dump_stream("preallocated inos") << preallocated_inos
;
1050 f
->dump_int("used preallocated ino", used_preallocated_ino
);
1052 f
->open_object_section("client name");
1053 client_name
.dump(f
);
1054 f
->close_section(); // client name
1056 f
->open_array_section("inodes starting a truncate");
1057 for(const auto& ino
: truncate_start
) {
1058 f
->dump_int("inodeno", ino
);
1060 f
->close_section(); // truncate inodes
1061 f
->open_array_section("inodes finishing a truncated");
1062 for(const auto& p
: truncate_finish
) {
1063 f
->open_object_section("inode+segment");
1064 f
->dump_int("inodeno", p
.first
);
1065 f
->dump_int("truncate starting segment", p
.second
);
1066 f
->close_section(); // truncated inode
1068 f
->close_section(); // truncate finish inodes
1070 f
->open_array_section("destroyed inodes");
1071 for(vector
<inodeno_t
>::const_iterator i
= destroyed_inodes
.begin();
1072 i
!= destroyed_inodes
.end(); ++i
) {
1073 f
->dump_int("inodeno", *i
);
1075 f
->close_section(); // destroyed inodes
1077 f
->open_array_section("client requests");
1078 for(const auto& p
: client_reqs
) {
1079 f
->open_object_section("Client request");
1080 f
->dump_stream("request ID") << p
.first
;
1081 f
->dump_int("oldest request on client", p
.second
);
1082 f
->close_section(); // request
1084 f
->close_section(); // client requests
1087 void EMetaBlob::generate_test_instances(std::list
<EMetaBlob
*>& ls
)
1089 ls
.push_back(new EMetaBlob());
// Re-apply this metablob to the MDS cache during journal replay.
//
//   mds     - rank whose mdcache, inotable, sessionmap, mdlog and table
//             clients are updated
//   logseg  - log segment the replayed state is attached to (asserted non-null)
//   slaveup - optional slave-update record; where it is used below, old
//             directories and unlinked inodes are stored in it rather than
//             being trimmed/removed
//
// Phases, in the order they appear: root inodes; the renamed directory inode
// lookup; one pass over lump_order to restore each dirfrag and its full,
// remote and null dentries; subtree adjustment after a directory rename and
// disposal of inodes that were unlinked but not relinked; table-client
// transactions; the opened inode; inotable and sessionmap version handling;
// truncate start/finish; destroyed inodes; completed client requests and
// flushes; and finally update_segment(logseg).
// The mds_kill_journal_replay_at asserts (values 1 to 4) sit between phases.
1092 void EMetaBlob::replay(MDSRank
*mds
, LogSegment
*logseg
, MDSlaveUpdate
*slaveup
)
1094 dout(10) << "EMetaBlob.replay " << lump_map
.size() << " dirlumps by " << client_name
<< dendl
;
1096 ceph_assert(logseg
);
1098 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 1);
// root inodes recorded in this blob
1100 for (auto& p
: roots
) {
1101 CInode
*in
= mds
->mdcache
->get_inode(p
.inode
.ino
);
1102 bool isnew
= in
? false:true;
1104 in
= new CInode(mds
->mdcache
, false, 2, CEPH_NOSNAP
);
1105 p
.update_inode(mds
, in
);
1108 mds
->mdcache
->add_inode(in
);
1109 if (p
.is_dirty()) in
->_mark_dirty(logseg
);
1110 dout(10) << "EMetaBlob.replay " << (isnew
? " added root ":" updated root ") << *in
<< dendl
;
1113 CInode
*renamed_diri
= 0;
1115 if (renamed_dirino
) {
1116 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1118 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri
<< dendl
;
1120 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino
<< dendl
;
1123 for (const auto& lp
: lump_order
) {
1124 dirlump
&lump
= lump_map
[lp
];
1126 dout(10) << "EMetaBlob.replay found null dentry in dir " << lp
<< dendl
;
1127 nnull
+= lump
.nnull
;
1130 ceph_assert(nnull
<= 1);
1133 // keep track of any inodes we unlink and don't relink elsewhere
1134 map
<CInode
*, CDir
*> unlinked
;
1135 set
<CInode
*> linked
;
1137 // walk through my dirs (in order!)
1138 for (const auto& lp
: lump_order
) {
1139 dout(10) << "EMetaBlob.replay dir " << lp
<< dendl
;
1140 dirlump
&lump
= lump_map
[lp
];
1143 CDir
*dir
= mds
->mdcache
->get_force_dirfrag(lp
, true);
1145 // hmm. do i have the inode?
1146 CInode
*diri
= mds
->mdcache
->get_inode((lp
).ino
);
1148 if (MDS_INO_IS_MDSDIR(lp
.ino
)) {
1149 ceph_assert(MDS_INO_MDSDIR(mds
->get_nodeid()) != lp
.ino
);
1150 diri
= mds
->mdcache
->create_system_inode(lp
.ino
, S_IFDIR
|0755);
1151 diri
->state_clear(CInode::STATE_AUTH
);
1152 dout(10) << "EMetaBlob.replay created base " << *diri
<< dendl
;
1154 dout(0) << "EMetaBlob.replay missing dir ino " << lp
.ino
<< dendl
;
1155 mds
->clog
->error() << "failure replaying journal (EMetaBlob)";
1157 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1161 // create the dirfrag
1162 dir
= diri
->get_or_open_dirfrag(mds
->mdcache
, lp
.frag
);
1164 if (MDS_INO_IS_BASE(lp
.ino
))
1165 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1167 dout(10) << "EMetaBlob.replay added dir " << *dir
<< dendl
;
1169 dir
->set_version( lump
.fnode
.version
);
1170 dir
->fnode
= lump
.fnode
;
1172 if (lump
.is_importing()) {
1173 dir
->state_set(CDir::STATE_AUTH
);
1174 dir
->state_clear(CDir::STATE_COMPLETE
);
1176 if (lump
.is_dirty()) {
1177 dir
->_mark_dirty(logseg
);
1179 if (!(dir
->fnode
.rstat
== dir
->fnode
.accounted_rstat
)) {
1180 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir
<< dendl
;
1181 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->nestlock
);
1182 logseg
->dirty_dirfrag_nest
.push_back(&dir
->inode
->item_dirty_dirfrag_nest
);
1184 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir
<< dendl
;
1186 if (!(dir
->fnode
.fragstat
== dir
->fnode
.accounted_fragstat
)) {
1187 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir
<< dendl
;
1188 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->filelock
);
1189 logseg
->dirty_dirfrag_dir
.push_back(&dir
->inode
->item_dirty_dirfrag_dir
);
1191 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir
<< dendl
;
1194 if (lump
.is_dirty_dft()) {
1195 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir
<< dendl
;
1196 dir
->state_set(CDir::STATE_DIRTYDFT
);
1197 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->dirfragtreelock
);
1198 logseg
->dirty_dirfrag_dirfragtree
.push_back(&dir
->inode
->item_dirty_dirfrag_dirfragtree
);
1201 dir
->mark_new(logseg
);
1202 if (lump
.is_complete())
1203 dir
->mark_complete();
1205 dout(10) << "EMetaBlob.replay updated dir " << *dir
<< dendl
;
1208 lump
._decode_bits();
1210 // full dentry+inode pairs
1211 for (auto& fb
: lump
._get_dfull()) {
1212 CDentry
*dn
= dir
->lookup_exact_snap(fb
.dn
, fb
.dnlast
);
1214 dn
= dir
->add_null_dentry(fb
.dn
, fb
.dnfirst
, fb
.dnlast
);
1215 dn
->set_version(fb
.dnv
);
1216 if (fb
.is_dirty()) dn
->_mark_dirty(logseg
);
1217 dout(10) << "EMetaBlob.replay added (full) " << *dn
<< dendl
;
1219 dn
->set_version(fb
.dnv
);
1220 if (fb
.is_dirty()) dn
->_mark_dirty(logseg
);
1221 dout(10) << "EMetaBlob.replay for [" << fb
.dnfirst
<< "," << fb
.dnlast
<< "] had " << *dn
<< dendl
;
1222 dn
->first
= fb
.dnfirst
;
1223 ceph_assert(dn
->last
== fb
.dnlast
);
1225 if (lump
.is_importing())
1226 dn
->state_set(CDentry::STATE_AUTH
);
1228 CInode
*in
= mds
->mdcache
->get_inode(fb
.inode
.ino
, fb
.dnlast
);
1230 in
= new CInode(mds
->mdcache
, dn
->is_auth(), fb
.dnfirst
, fb
.dnlast
);
1231 fb
.update_inode(mds
, in
);
1232 mds
->mdcache
->add_inode(in
);
1233 if (!dn
->get_linkage()->is_null()) {
1234 if (dn
->get_linkage()->is_primary()) {
1235 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1237 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1238 << " " << *dn
->get_linkage()->get_inode() << " should be " << fb
.inode
.ino
;
1239 dout(0) << ss
.str() << dendl
;
1240 mds
->clog
->warn(ss
);
1242 dir
->unlink_inode(dn
, false);
1244 if (unlinked
.count(in
))
1246 dir
->link_primary_inode(dn
, in
);
1247 dout(10) << "EMetaBlob.replay added " << *in
<< dendl
;
1249 in
->first
= fb
.dnfirst
;
1250 fb
.update_inode(mds
, in
);
1251 if (dn
->get_linkage()->get_inode() != in
&& in
->get_parent_dn()) {
1252 dout(10) << "EMetaBlob.replay unlinking " << *in
<< dendl
;
1253 unlinked
[in
] = in
->get_parent_dir();
1254 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
1256 if (dn
->get_linkage()->get_inode() != in
) {
1257 if (!dn
->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1258 if (dn
->get_linkage()->is_primary()) {
1259 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1261 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1262 << " " << *dn
->get_linkage()->get_inode() << " should be " << fb
.inode
.ino
;
1263 dout(0) << ss
.str() << dendl
;
1264 mds
->clog
->warn(ss
);
1266 dir
->unlink_inode(dn
, false);
1268 if (unlinked
.count(in
))
1270 dir
->link_primary_inode(dn
, in
);
1271 dout(10) << "EMetaBlob.replay linked " << *in
<< dendl
;
1273 dout(10) << "EMetaBlob.replay for [" << fb
.dnfirst
<< "," << fb
.dnlast
<< "] had " << *in
<< dendl
;
1275 ceph_assert(in
->first
== fb
.dnfirst
||
1276 (in
->is_multiversion() && in
->first
> fb
.dnfirst
));
1279 in
->_mark_dirty(logseg
);
1280 if (fb
.is_dirty_parent())
1281 in
->mark_dirty_parent(logseg
, fb
.is_dirty_pool());
1282 if (fb
.need_snapflush())
1283 logseg
->open_files
.push_back(&in
->item_open_file
);
1285 in
->state_set(CInode::STATE_AUTH
);
1287 in
->state_clear(CInode::STATE_AUTH
);
1288 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 2);
// remote dentries
1292 for (const auto& rb
: lump
.get_dremote()) {
1293 CDentry
*dn
= dir
->lookup_exact_snap(rb
.dn
, rb
.dnlast
);
1295 dn
= dir
->add_remote_dentry(rb
.dn
, rb
.ino
, rb
.d_type
, rb
.dnfirst
, rb
.dnlast
);
1296 dn
->set_version(rb
.dnv
);
1297 if (rb
.dirty
) dn
->_mark_dirty(logseg
);
1298 dout(10) << "EMetaBlob.replay added " << *dn
<< dendl
;
1300 if (!dn
->get_linkage()->is_null()) {
1301 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1302 if (dn
->get_linkage()->is_primary()) {
1303 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1305 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1306 << " " << *dn
->get_linkage()->get_inode() << " should be remote " << rb
.ino
;
1307 dout(0) << ss
.str() << dendl
;
1309 dir
->unlink_inode(dn
, false);
1311 dir
->link_remote_inode(dn
, rb
.ino
, rb
.d_type
);
1312 dn
->set_version(rb
.dnv
);
1313 if (rb
.dirty
) dn
->_mark_dirty(logseg
);
1314 dout(10) << "EMetaBlob.replay for [" << rb
.dnfirst
<< "," << rb
.dnlast
<< "] had " << *dn
<< dendl
;
1315 dn
->first
= rb
.dnfirst
;
1316 ceph_assert(dn
->last
== rb
.dnlast
);
1318 if (lump
.is_importing())
1319 dn
->state_set(CDentry::STATE_AUTH
);
// null dentries
1323 for (const auto& nb
: lump
.get_dnull()) {
1324 CDentry
*dn
= dir
->lookup_exact_snap(nb
.dn
, nb
.dnlast
);
1326 dn
= dir
->add_null_dentry(nb
.dn
, nb
.dnfirst
, nb
.dnlast
);
1327 dn
->set_version(nb
.dnv
);
1328 if (nb
.dirty
) dn
->_mark_dirty(logseg
);
1329 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn
<< dendl
;
1331 dn
->first
= nb
.dnfirst
;
1332 if (!dn
->get_linkage()->is_null()) {
1333 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1334 CInode
*in
= dn
->get_linkage()->get_inode();
1335 // For renamed inode, We may call CInode::force_dirfrag() later.
1336 // CInode::force_dirfrag() doesn't work well when inode is detached
1337 // from the hierarchy.
1338 if (!renamed_diri
|| renamed_diri
!= in
) {
1339 if (dn
->get_linkage()->is_primary())
1341 dir
->unlink_inode(dn
);
1344 dn
->set_version(nb
.dnv
);
1345 if (nb
.dirty
) dn
->_mark_dirty(logseg
);
1346 dout(10) << "EMetaBlob.replay had " << *dn
<< dendl
;
1347 ceph_assert(dn
->last
== nb
.dnlast
);
1350 if (lump
.is_importing())
1351 dn
->state_set(CDentry::STATE_AUTH
);
1353 // Make null dentries the first things we trim
1354 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn
<< dendl
;
1358 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 3);
// directory rename: fix up subtree bounds around the renamed inode
1360 if (renamed_dirino
) {
1362 ceph_assert(unlinked
.count(renamed_diri
));
1363 ceph_assert(linked
.count(renamed_diri
));
1364 olddir
= unlinked
[renamed_diri
];
1366 // we imported a diri we haven't seen before
1367 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1368 ceph_assert(renamed_diri
); // it was in the metablob
1372 if (olddir
->authority() != CDIR_AUTH_UNDEF
&&
1373 renamed_diri
->authority() == CDIR_AUTH_UNDEF
) {
1374 ceph_assert(slaveup
); // auth to non-auth, must be slave prepare
1376 renamed_diri
->dirfragtree
.get_leaves(leaves
);
1377 for (const auto& leaf
: leaves
) {
1378 CDir
*dir
= renamed_diri
->get_dirfrag(leaf
);
1380 if (dir
->get_dir_auth() == CDIR_AUTH_UNDEF
)
1381 // preserve subtree bound until slave commit
1382 slaveup
->olddirs
.insert(dir
->inode
);
1384 dir
->state_set(CDir::STATE_AUTH
);
1388 mds
->mdcache
->adjust_subtree_after_rename(renamed_diri
, olddir
, false);
1390 // see if we can discard the subtree we renamed out of
1391 CDir
*root
= mds
->mdcache
->get_subtree_root(olddir
);
1392 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
1393 if (slaveup
) // preserve the old dir until slave commit
1394 slaveup
->olddirs
.insert(olddir
->inode
);
1396 mds
->mdcache
->try_trim_non_auth_subtree(root
);
1400 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1401 if (renamed_diri
->authority() != CDIR_AUTH_UNDEF
) {
1402 for (const auto& p
: renamed_dir_frags
) {
1403 CDir
*dir
= renamed_diri
->get_dirfrag(p
);
1405 // we already had the inode before, and we already adjusted this subtree accordingly.
1406 dout(10) << " already had+adjusted rename import bound " << *dir
<< dendl
;
1407 ceph_assert(olddir
);
1410 dir
= renamed_diri
->get_or_open_dirfrag(mds
->mdcache
, p
);
1411 dout(10) << " creating new rename import bound " << *dir
<< dendl
;
1412 dir
->state_clear(CDir::STATE_AUTH
);
1413 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1417 // rename may overwrite an empty directory and move it into stray dir.
1418 unlinked
.erase(renamed_diri
);
1419 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1420 if (!linked
.count(p
->first
))
1422 ceph_assert(p
->first
->is_dir());
1423 mds
->mdcache
->adjust_subtree_after_rename(p
->first
, p
->second
, false);
1427 if (!unlinked
.empty()) {
1428 for (set
<CInode
*>::iterator p
= linked
.begin(); p
!= linked
.end(); ++p
)
1430 dout(10) << " unlinked set contains " << unlinked
<< dendl
;
1431 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1432 CInode
*in
= p
->first
;
1433 if (slaveup
) { // preserve unlinked inodes until slave commit
1434 slaveup
->unlinked
.insert(in
);
1436 in
->snaprealm
->adjust_parent();
1438 mds
->mdcache
->remove_inode_recursive(in
);
1442 // table client transactions
1443 for (const auto& p
: table_tids
) {
1444 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p
.first
)
1445 << " transaction " << p
.second
<< dendl
;
1446 MDSTableClient
*client
= mds
->get_table_client(p
.first
);
1448 client
->got_journaled_agree(p
.second
, logseg
);
// inode recorded as opened_ino: add it to the segment's open_files list
1453 CInode
*in
= mds
->mdcache
->get_inode(opened_ino
);
1455 dout(10) << "EMetaBlob.replay noting opened inode " << *in
<< dendl
;
1456 logseg
->open_files
.push_back(&in
->item_open_file
);
1461 if (mds
->inotable
->get_version() >= inotablev
) {
1462 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1463 << " <= table " << mds
->inotable
->get_version() << dendl
;
1465 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1466 << " - 1 == table " << mds
->inotable
->get_version()
1467 << " allocated+used " << allocated_ino
1468 << " prealloc " << preallocated_inos
1471 mds
->inotable
->replay_alloc_id(allocated_ino
);
1472 if (preallocated_inos
.size())
1473 mds
->inotable
->replay_alloc_ids(preallocated_inos
);
1475 // [repair bad inotable updates]
1476 if (inotablev
> mds
->inotable
->get_version()) {
1477 mds
->clog
->error() << "journal replay inotablev mismatch "
1478 << mds
->inotable
->get_version() << " -> " << inotablev
;
1479 mds
->inotable
->force_replay_version(inotablev
);
1482 ceph_assert(inotablev
== mds
->inotable
->get_version());
// expected sessionmap version step: 2 when this blob records both a used
// preallocated ino and newly preallocated inos, otherwise 1
1486 unsigned diff
= (used_preallocated_ino
&& !preallocated_inos
.empty()) ? 2 : 1;
1487 if (mds
->sessionmap
.get_version() >= sessionmapv
) {
1488 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1489 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
1490 } else if (mds
->sessionmap
.get_version() + diff
== sessionmapv
) {
1491 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1492 << " - " << diff
<< " == table " << mds
->sessionmap
.get_version()
1493 << " prealloc " << preallocated_inos
1494 << " used " << used_preallocated_ino
1496 Session
*session
= mds
->sessionmap
.get_session(client_name
);
1498 dout(20) << " (session prealloc " << session
->info
.prealloc_inos
<< ")" << dendl
;
1499 if (used_preallocated_ino
) {
1500 if (!session
->info
.prealloc_inos
.empty()) {
1501 inodeno_t i
= session
->take_ino(used_preallocated_ino
);
1502 ceph_assert(i
== used_preallocated_ino
);
1503 session
->info
.used_inos
.clear();
1505 mds
->sessionmap
.replay_dirty_session(session
);
1507 if (!preallocated_inos
.empty()) {
1508 session
->info
.prealloc_inos
.insert(preallocated_inos
);
1509 mds
->sessionmap
.replay_dirty_session(session
);
1513 dout(10) << "EMetaBlob.replay no session for " << client_name
<< dendl
;
1514 if (used_preallocated_ino
)
1515 mds
->sessionmap
.replay_advance_version();
1517 if (!preallocated_inos
.empty())
1518 mds
->sessionmap
.replay_advance_version();
1520 ceph_assert(sessionmapv
== mds
->sessionmap
.get_version());
1522 mds
->clog
->error() << "EMetaBlob.replay sessionmap v " << sessionmapv
1523 << " - " << diff
<< " > table " << mds
->sessionmap
.get_version();
1524 ceph_assert(g_conf()->mds_wipe_sessions
);
1525 mds
->sessionmap
.wipe();
1526 mds
->sessionmap
.set_version(sessionmapv
);
1530 // truncating inodes
1531 for (const auto& ino
: truncate_start
) {
1532 CInode
*in
= mds
->mdcache
->get_inode(ino
);
1534 mds
->mdcache
->add_recovered_truncate(in
, logseg
);
1536 for (const auto& p
: truncate_finish
) {
1537 LogSegment
*ls
= mds
->mdlog
->get_segment(p
.second
);
1539 CInode
*in
= mds
->mdcache
->get_inode(p
.first
);
1541 mds
->mdcache
->remove_recovered_truncate(in
, ls
);
1546 if (!destroyed_inodes
.empty()) {
1547 for (vector
<inodeno_t
>::iterator p
= destroyed_inodes
.begin();
1548 p
!= destroyed_inodes
.end();
1550 CInode
*in
= mds
->mdcache
->get_inode(*p
);
1552 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", dropping " << *in
<< dendl
;
1553 CDentry
*parent
= in
->get_parent_dn();
1554 mds
->mdcache
->remove_inode(in
);
1556 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent
<< dendl
;
1557 ceph_assert(parent
->get_linkage()->is_null());
1560 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", not in cache" << dendl
;
1563 mds
->mdcache
->open_file_table
.note_destroyed_inos(logseg
->seq
, destroyed_inodes
);
// completed client requests (entries whose name is not a client are skipped)
1567 for (const auto& p
: client_reqs
) {
1568 if (p
.first
.name
.is_client()) {
1569 dout(10) << "EMetaBlob.replay request " << p
.first
<< " trim_to " << p
.second
<< dendl
;
1570 inodeno_t created
= allocated_ino
? allocated_ino
: used_preallocated_ino
;
1571 // if we allocated an inode, there should be exactly one client request id.
1572 ceph_assert(created
== inodeno_t() || client_reqs
.size() == 1);
1574 Session
*session
= mds
->sessionmap
.get_session(p
.first
.name
);
1576 session
->add_completed_request(p
.first
.tid
, created
);
1578 session
->trim_completed_requests(p
.second
);
// completed client flushes, handled the same way as the requests above
1584 for (const auto& p
: client_flushes
) {
1585 if (p
.first
.name
.is_client()) {
1586 dout(10) << "EMetaBlob.replay flush " << p
.first
<< " trim_to " << p
.second
<< dendl
;
1587 Session
*session
= mds
->sessionmap
.get_session(p
.first
.name
);
1589 session
->add_completed_flush(p
.first
.tid
);
1591 session
->trim_completed_flushes(p
.second
);
1597 update_segment(logseg
);
1599 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 4);
1602 // -----------------------
1604 void EPurged::update_segment()
1606 if (inos
.size() && inotablev
)
1607 get_segment()->inotablev
= inotablev
;
// Replay an EPurged event: subtract the purged inos from the purge_inodes set
// of the log segment looked up by seq, then bring the inotable forward.
// If the inotable is already at or past inotablev this is logged as a noop;
// otherwise the inos are released with replay_release_ids() and the table
// version must then equal inotablev.
//
// That final check uses ceph_assert, matching the identical check in
// ESession::replay and the other replay paths in this file; a plain assert()
// can be compiled out under NDEBUG and would then hide an inotable mismatch.
1611 void EPurged::replay(MDSRank
*mds
)
1614 LogSegment
*ls
= mds
->mdlog
->get_segment(seq
);
1616 ls
->purge_inodes
.subtract(inos
);
1618 if (mds
->inotable
->get_version() >= inotablev
) {
1619 dout(10) << "EPurged.replay inotable " << mds
->inotable
->get_version()
1620 << " >= " << inotablev
<< ", noop" << dendl
;
1622 dout(10) << "EPurged.replay inotable " << mds
->inotable
->get_version()
1623 << " < " << inotablev
<< " " << dendl
;
1624 mds
->inotable
->replay_release_ids(inos
);
1625 ceph_assert(mds
->inotable
->get_version() == inotablev
);
// Serialize this event into bl.  The payload is framed by ENCODE_START with
// struct version 1 and compat version 1; inotablev is among the fields written.
1631 void EPurged::encode(bufferlist
& bl
, uint64_t features
) const
1633 ENCODE_START(1, 1, bl
);
1635 encode(inotablev
, bl
);
// Deserialize this event from the iterator bl.  The decoder declares struct
// version 1 via DECODE_START; inotablev is among the fields read back.
1640 void EPurged::decode(bufferlist::const_iterator
& bl
)
1642 DECODE_START(1, bl
);
1644 decode(inotablev
, bl
);
1649 void EPurged::dump(Formatter
*f
) const
1651 f
->dump_stream("inos") << inos
;
1652 f
->dump_int("inotable version", inotablev
);
1653 f
->dump_int("segment seq", seq
);
1656 // -----------------------
1659 void ESession::update_segment()
1661 get_segment()->sessionmapv
= cmapv
;
1662 if (inos
.size() && inotablev
)
1663 get_segment()->inotablev
= inotablev
;
// Replay a client session open/close event.
//
// Any purge_inos are first added to the segment's purge_inodes set.  The
// sessionmap is then compared against cmapv:
//   - already at or past cmapv: logged as a noop;
//   - exactly one version behind: the event is applied.  The open path gets or
//     adds the session, sets it to STATE_OPEN and stores the client metadata;
//     the close path removes the session when it has no connection, otherwise
//     resets it, and logs an error if no session exists.  The sessionmap
//     version must equal cmapv afterwards;
//   - further behind: an error is logged, mds_wipe_sessions must be set, and
//     the sessionmap is wiped and forced to cmapv.
// Finally, when the event carries inos and inotablev, the inos are released in
// the inotable if it is older than inotablev (only valid for close events).
1666 void ESession::replay(MDSRank
*mds
)
1668 if (purge_inos
.size())
1669 get_segment()->purge_inodes
.insert(purge_inos
);
1671 if (mds
->sessionmap
.get_version() >= cmapv
) {
1672 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1673 << " >= " << cmapv
<< ", noop" << dendl
;
1674 } else if (mds
->sessionmap
.get_version() + 1 == cmapv
) {
1675 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1676 << " < " << cmapv
<< " " << (open
? "open":"close") << " " << client_inst
<< dendl
;
1679 session
= mds
->sessionmap
.get_or_add_session(client_inst
);
1680 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1681 session
->set_client_metadata(client_metadata
);
1682 dout(10) << " opened session " << session
->info
.inst
<< dendl
;
1684 session
= mds
->sessionmap
.get_session(client_inst
.name
);
1685 if (session
) { // there always should be a session, but there's a bug
1686 if (session
->get_connection() == NULL
) {
1687 dout(10) << " removed session " << session
->info
.inst
<< dendl
;
1688 mds
->sessionmap
.remove_session(session
);
1691 session
->clear(); // the client has reconnected; keep the Session, but reset
1692 dout(10) << " reset session " << session
->info
.inst
<< " (they reconnected)" << dendl
;
1695 mds
->clog
->error() << "replayed stray Session close event for " << client_inst
1696 << " from time " << stamp
<< ", ignoring";
1700 mds
->sessionmap
.replay_dirty_session(session
);
1702 mds
->sessionmap
.replay_advance_version();
1704 ceph_assert(mds
->sessionmap
.get_version() == cmapv
);
1706 mds
->clog
->error() << "ESession.replay sessionmap v " << cmapv
1707 << " - 1 > table " << mds
->sessionmap
.get_version();
1708 ceph_assert(g_conf()->mds_wipe_sessions
);
1709 mds
->sessionmap
.wipe();
1710 mds
->sessionmap
.set_version(cmapv
);
1713 if (inos
.size() && inotablev
) {
1714 if (mds
->inotable
->get_version() >= inotablev
) {
1715 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1716 << " >= " << inotablev
<< ", noop" << dendl
;
1718 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1719 << " < " << inotablev
<< " " << (open
? "add":"remove") << dendl
;
1720 ceph_assert(!open
); // for now
1721 mds
->inotable
->replay_release_ids(inos
);
1722 ceph_assert(mds
->inotable
->get_version() == inotablev
);
// Serialize this session event into bl, framed by ENCODE_START with struct
// version 6 and compat version 5.  client_inst is encoded with the peer's
// feature bits; inotablev, client_metadata and purge_inos are among the other
// fields written.
1729 void ESession::encode(bufferlist
&bl
, uint64_t features
) const
1731 ENCODE_START(6, 5, bl
);
1733 encode(client_inst
, bl
, features
);
1737 encode(inotablev
, bl
);
1738 encode(client_metadata
, bl
);
1739 encode(purge_inos
, bl
);
// Deserialize a session event from bl.  The decoder understands up to struct
// version 6 and uses the legacy-compat framing macro for older encodings.
// Client metadata is version dependent: exactly v4 carries only the key/value
// map (client_metadata.kv_map), v5 and later carry the whole client_metadata
// structure.
1743 void ESession::decode(bufferlist::const_iterator
&bl
)
1745 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl
);
1748 decode(client_inst
, bl
);
1752 decode(inotablev
, bl
);
1753 if (struct_v
== 4) {
1754 decode(client_metadata
.kv_map
, bl
);
1755 } else if (struct_v
>= 5) {
1756 decode(client_metadata
, bl
);
1759 decode(purge_inos
, bl
);
1765 void ESession::dump(Formatter
*f
) const
1767 f
->dump_stream("client instance") << client_inst
;
1768 f
->dump_string("open", open
? "true" : "false");
1769 f
->dump_int("client map version", cmapv
);
1770 f
->dump_stream("inos") << inos
;
1771 f
->dump_int("inotable version", inotablev
);
1772 f
->open_object_section("client_metadata");
1773 client_metadata
.dump(f
);
1774 f
->close_section(); // client_metadata
1777 void ESession::generate_test_instances(std::list
<ESession
*>& ls
)
1779 ls
.push_back(new ESession
);
1782 // -----------------------
// Serialize this event into bl, framed by ENCODE_START with struct version 2
// and compat version 1.  client_map is encoded with the peer's feature bits;
// client_metadata_map is among the other fields written.
1785 void ESessions::encode(bufferlist
&bl
, uint64_t features
) const
1787 ENCODE_START(2, 1, bl
);
1788 encode(client_map
, bl
, features
);
1791 encode(client_metadata_map
, bl
);
// Decoder for the old on-disk form of ESessions (counterpart of decode_new):
// reads client_map from bl.
1795 void ESessions::decode_old(bufferlist::const_iterator
&bl
)
1798 decode(client_map
, bl
);
// Decoder for the versioned form of ESessions: declares struct version 2 via
// DECODE_START and reads client_map and client_metadata_map from bl.
1804 void ESessions::decode_new(bufferlist::const_iterator
&bl
)
1806 DECODE_START(2, bl
);
1807 decode(client_map
, bl
);
1811 decode(client_metadata_map
, bl
);
// Write this event to the Formatter: the client map version followed by a
// "client map" array holding one "client" object (numeric client id plus the
// entity instance) per client_map entry.
1815 void ESessions::dump(Formatter
*f
) const
1817 f
->dump_int("client map version", cmapv
);
1819 f
->open_array_section("client map");
1820 for (map
<client_t
,entity_inst_t
>::const_iterator i
= client_map
.begin();
1821 i
!= client_map
.end(); ++i
) {
1822 f
->open_object_section("client");
1823 f
->dump_int("client id", i
->first
.v
);
1824 f
->dump_stream("client entity") << i
->second
;
1825 f
->close_section(); // client
1827 f
->close_section(); // client map
1830 void ESessions::generate_test_instances(std::list
<ESessions
*>& ls
)
1832 ls
.push_back(new ESessions());
1835 void ESessions::update_segment()
1837 get_segment()->sessionmapv
= cmapv
;
// Replay a bulk session-open event.  If the sessionmap is already at or past
// cmapv this is logged as a noop; otherwise the journaled client_map and
// client_metadata_map are handed to sessionmap.replay_open_sessions() together
// with cmapv.
1840 void ESessions::replay(MDSRank
*mds
)
1842 if (mds
->sessionmap
.get_version() >= cmapv
) {
1843 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1844 << " >= " << cmapv
<< ", noop" << dendl
;
1846 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1847 << " < " << cmapv
<< dendl
;
1848 mds
->sessionmap
.replay_open_sessions(cmapv
, client_map
, client_metadata_map
);
1854 // -----------------------
// Serialize this table-server event into bl, framed by ENCODE_START with
// struct version 3 and compat version 3.  mutation and version are among the
// fields written.
1857 void ETableServer::encode(bufferlist
& bl
, uint64_t features
) const
1859 ENCODE_START(3, 3, bl
);
1865 encode(mutation
, bl
);
1867 encode(version
, bl
);
// Deserialize a table-server event from bl.  The decoder understands struct
// version 3 and uses the legacy-compat framing macro; mutation and version are
// among the fields read back.
1871 void ETableServer::decode(bufferlist::const_iterator
&bl
)
1873 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
1880 decode(mutation
, bl
);
1882 decode(version
, bl
);
1886 void ETableServer::dump(Formatter
*f
) const
1888 f
->dump_int("table id", table
);
1889 f
->dump_int("op", op
);
1890 f
->dump_int("request id", reqid
);
1891 f
->dump_int("by mds", bymds
);
1892 f
->dump_int("tid", tid
);
1893 f
->dump_int("version", version
);
1896 void ETableServer::generate_test_instances(std::list
<ETableServer
*>& ls
)
1898 ls
.push_back(new ETableServer());
1902 void ETableServer::update_segment()
1904 get_segment()->tablev
[table
] = version
;
// Replay a table-server event against the MDSTableServer for `table`.
//
// If the server is already at or past `version` the event is only logged.
// Otherwise the server must be exactly one version behind, and the journaled
// op is re-applied:
//   PREPARE       - note the prepare, re-run _prepare and keep its output as
//                   the new mutation
//   COMMIT        - _commit with an empty request ref, then note the commit
//   ROLLBACK      - _rollback, then note the rollback
//   SERVER_UPDATE - _server_update, then note the update
// An unknown op is reported to the cluster log and ends in ceph_abort().
// After a successful op the server version must equal `version`.
1907 void ETableServer::replay(MDSRank
*mds
)
1909 MDSTableServer
*server
= mds
->get_table_server(table
);
1913 if (server
->get_version() >= version
) {
1914 dout(10) << "ETableServer.replay " << get_mdstable_name(table
)
1915 << " " << get_mdstableserver_opname(op
)
1916 << " event " << version
1917 << " <= table " << server
->get_version() << dendl
;
1921 dout(10) << " ETableServer.replay " << get_mdstable_name(table
)
1922 << " " << get_mdstableserver_opname(op
)
1923 << " event " << version
<< " - 1 == table " << server
->get_version() << dendl
;
1924 ceph_assert(version
-1 == server
->get_version());
1927 case TABLESERVER_OP_PREPARE
: {
1928 server
->_note_prepare(bymds
, reqid
, true);
1930 server
->_prepare(mutation
, reqid
, bymds
, out
);
1931 mutation
= std::move(out
);
1934 case TABLESERVER_OP_COMMIT
:
1935 server
->_commit(tid
, ref_t
<MMDSTableRequest
>());
1936 server
->_note_commit(tid
, true);
1938 case TABLESERVER_OP_ROLLBACK
:
1939 server
->_rollback(tid
);
1940 server
->_note_rollback(tid
, true);
1942 case TABLESERVER_OP_SERVER_UPDATE
:
1943 server
->_server_update(mutation
);
1944 server
->_note_server_update(mutation
, true);
1947 mds
->clog
->error() << "invalid tableserver op in ETableServer";
1949 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1952 ceph_assert(version
== server
->get_version());
1957 // ---------------------
// Serialize this table-client event into bl, framed by ENCODE_START with
// struct version 3 and compat version 3.
1960 void ETableClient::encode(bufferlist
& bl
, uint64_t features
) const
1962 ENCODE_START(3, 3, bl
);
// Deserialize a table-client event from bl.  The decoder understands struct
// version 3 and uses the legacy-compat framing macro.
1970 void ETableClient::decode(bufferlist::const_iterator
&bl
)
1972 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
1981 void ETableClient::dump(Formatter
*f
) const
1983 f
->dump_int("table", table
);
1984 f
->dump_int("op", op
);
1985 f
->dump_int("tid", tid
);
1988 void ETableClient::generate_test_instances(std::list
<ETableClient
*>& ls
)
1990 ls
.push_back(new ETableClient());
// Replay a table-client event: look up the MDSTableClient for `table` and tell
// it that the ack for transaction `tid` was journaled.  The only op accepted
// here is TABLESERVER_OP_ACK (asserted).
1993 void ETableClient::replay(MDSRank
*mds
)
1995 dout(10) << " ETableClient.replay " << get_mdstable_name(table
)
1996 << " op " << get_mdstableserver_opname(op
)
1997 << " tid " << tid
<< dendl
;
1999 MDSTableClient
*client
= mds
->get_table_client(table
);
2003 ceph_assert(op
== TABLESERVER_OP_ACK
);
2004 client
->got_journaled_ack(tid
);
2008 // -----------------------
2011 void ESnap::update_segment()
2013 get_segment()->tablev[TABLE_SNAP] = version;
2016 void ESnap::replay(MDSRank *mds)
2018 if (mds->snaptable->get_version() >= version) {
2019 dout(10) << "ESnap.replay event " << version
2020 << " <= table " << mds->snaptable->get_version() << dendl;
2024 dout(10) << " ESnap.replay event " << version
2025 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2026 ceph_assert(version-1 == mds->snaptable->get_version());
2030 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2031 ceph_assert(s == snap.snapid);
2033 mds->snaptable->remove(snap.snapid);
2036 ceph_assert(version == mds->snaptable->get_version());
2042 // -----------------------
// Serialize this update event into bl, framed by ENCODE_START with struct
// version 4 and compat version 4.  The metablob is encoded with the peer's
// feature bits; client_map and had_slaves are among the other fields written.
2045 void EUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2047 ENCODE_START(4, 4, bl
);
2050 encode(metablob
, bl
, features
);
2051 encode(client_map
, bl
);
2054 encode(had_slaves
, bl
);
// Deserialize an update event from bl.  The decoder understands struct
// version 4 and uses the legacy-compat framing macro; metablob, client_map and
// had_slaves are among the fields read back.
2058 void EUpdate::decode(bufferlist::const_iterator
&bl
)
2060 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl
);
2064 decode(metablob
, bl
);
2065 decode(client_map
, bl
);
2069 decode(had_slaves
, bl
);
// Write this update event to the Formatter: a "metablob" object section, the
// event type string, the byte length and version of the embedded client map,
// the request id and whether the update had slaves ("true"/"false").
2073 void EUpdate::dump(Formatter
*f
) const
2075 f
->open_object_section("metablob");
2077 f
->close_section(); // metablob
2079 f
->dump_string("type", type
);
2080 f
->dump_int("client map length", client_map
.length());
2081 f
->dump_int("client map version", cmapv
);
2082 f
->dump_stream("reqid") << reqid
;
2083 f
->dump_string("had slaves", had_slaves
? "true" : "false");
2086 void EUpdate::generate_test_instances(std::list
<EUpdate
*>& ls
)
2088 ls
.push_back(new EUpdate());
// Attach this update's bookkeeping to its log segment: the metablob updates
// the segment first, the segment's sessionmap version is set to cmapv when a
// client map is embedded, and the segment's uncommitted_masters set is where
// reqid gets recorded.
2092 void EUpdate::update_segment()
2094 auto&& segment
= get_segment();
2095 metablob
.update_segment(segment
);
2097 if (client_map
.length())
2098 segment
->sessionmapv
= cmapv
;
2101 segment
->uncommitted_masters
.insert(reqid
);
// Replay an update event.  The metablob is replayed into this event's log
// segment first.  For an update that had slaves, reqid is recorded as an
// uncommitted master (on the segment and in the mdcache, with an empty slave
// set) so that a later ECommitted can match it.  If a client map is embedded
// and the sessionmap is older than cmapv, the sessions it describes are
// re-opened through sessionmap.replay_open_sessions().
2104 void EUpdate::replay(MDSRank
*mds
)
2106 auto&& segment
= get_segment();
2107 metablob
.replay(mds
, segment
);
2110 dout(10) << "EUpdate.replay " << reqid
<< " had slaves, expecting a matching ECommitted" << dendl
;
2111 segment
->uncommitted_masters
.insert(reqid
);
2112 set
<mds_rank_t
> slaves
;
2113 mds
->mdcache
->add_uncommitted_master(reqid
, segment
, slaves
, true);
2116 if (client_map
.length()) {
2117 if (mds
->sessionmap
.get_version() >= cmapv
) {
2118 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2119 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
2121 dout(10) << "EUpdate.replay sessionmap " << mds
->sessionmap
.get_version()
2122 << " < " << cmapv
<< dendl
;
2123 // open client sessions?
2124 map
<client_t
,entity_inst_t
> cm
;
2125 map
<client_t
,client_metadata_t
> cmm
;
2126 auto blp
= client_map
.cbegin();
2131 mds
->sessionmap
.replay_open_sessions(cmapv
, cm
, cmm
);
2138 // ------------------------
// Serializes this EOpen into `bl` under an ENCODE_START(4, 3, bl) header.
// Visible fields: metablob (encoded with `features`) and snap_inos.
// NOTE(review): other encoded fields may exist on lines not shown here.
2141 void EOpen::encode(bufferlist
&bl
, uint64_t features
) const {
2142 ENCODE_START(4, 3, bl
);
2144 encode(metablob
, bl
, features
);
2146 encode(snap_inos
, bl
);
// Deserializes an EOpen from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl). Visible fields: metablob,
// snap_inos.
// NOTE(review): encode() writes struct version 4 while this declares 3;
// whether the snap_inos read is version-gated is not visible here -- confirm.
2150 void EOpen::decode(bufferlist::const_iterator
&bl
) {
2151 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2154 decode(metablob
, bl
);
2157 decode(snap_inos
, bl
);
// Writes this event to Formatter `f`: a "metablob" object section followed
// by an "inos involved" array with one "ino" integer per entry of `inos`.
// snap_inos is not dumped by any line visible here.
2161 void EOpen::dump(Formatter
*f
) const
2163 f
->open_object_section("metablob");
2165 f
->close_section(); // metablob
2166 f
->open_array_section("inos involved");
2167 for (vector
<inodeno_t
>::const_iterator i
= inos
.begin();
2168 i
!= inos
.end(); ++i
) {
2169 f
->dump_int("ino", *i
);
2171 f
->close_section(); // inos
// Appends two heap-allocated EOpen samples to `ls`: a default-constructed
// one, and a second one that additionally has ino 0 added via add_ino(0).
2174 void EOpen::generate_test_instances(std::list
<EOpen
*>& ls
)
2176 ls
.push_back(new EOpen());
2177 ls
.push_back(new EOpen());
2178 ls
.back()->add_ino(0);
2181 void EOpen::update_segment()
// Replays an EOpen: replays the metablob, then for every inode number in
// `inos` and every vinodeno in `snap_inos` looks the inode up in the cache
// and links its item_open_file into the segment's open_files list.
// An inode that cannot be found is reported at debug level 0 as
// "not in metablob".
// NOTE(review): the null checks and what follows the log message (e.g.
// skipping the push_back) are on lines not visible here -- confirm.
2186 void EOpen::replay(MDSRank
*mds
)
2188 dout(10) << "EOpen.replay " << dendl
;
2189 auto&& segment
= get_segment();
2190 metablob
.replay(mds
, segment
);
2192 // note which segments inodes belong to, so we don't have to start rejournaling them
2193 for (const auto &ino
: inos
) {
2194 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2196 dout(0) << "EOpen.replay ino " << ino
<< " not in metablob" << dendl
;
2199 segment
->open_files
.push_back(&in
->item_open_file
);
2201 for (const auto &vino
: snap_inos
) {
2202 CInode
*in
= mds
->mdcache
->get_inode(vino
);
2204 dout(0) << "EOpen.replay ino " << vino
<< " not in metablob" << dendl
;
2207 segment
->open_files
.push_back(&in
->item_open_file
);
2212 // -----------------------
// Replays the commit record of a master request.
// If mdcache->uncommitted_masters still tracks reqid, the request is removed
// from the owning log segment's uncommitted_masters set (reached through the
// map entry's `ls` pointer) and then from the mdcache map itself.
// Otherwise it only logs that the original op was never seen.
2215 void ECommitted::replay(MDSRank
*mds
)
2217 if (mds
->mdcache
->uncommitted_masters
.count(reqid
)) {
2218 dout(10) << "ECommitted.replay " << reqid
<< dendl
;
2219 mds
->mdcache
->uncommitted_masters
[reqid
].ls
->uncommitted_masters
.erase(reqid
);
2220 mds
->mdcache
->uncommitted_masters
.erase(reqid
);
2222 dout(10) << "ECommitted.replay " << reqid
<< " -- didn't see original op" << dendl
;
// Serializes this ECommitted into `bl` under an ENCODE_START(3, 3, bl)
// header. `features` is not used by any line visible here.
// NOTE(review): the per-field encode calls are not visible in this excerpt.
2226 void ECommitted::encode(bufferlist
& bl
, uint64_t features
) const
2228 ENCODE_START(3, 3, bl
);
// Deserializes an ECommitted from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl).
// NOTE(review): the per-field decode calls are not visible in this excerpt.
2234 void ECommitted::decode(bufferlist::const_iterator
& bl
)
2236 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
// Writes `stamp` and `reqid` to Formatter `f` as stream-formatted values.
2243 void ECommitted::dump(Formatter
*f
) const {
2244 f
->dump_stream("stamp") << stamp
;
2245 f
->dump_stream("reqid") << reqid
;
// Appends two heap-allocated ECommitted samples to `ls`: a default one, and
// one with stamp = utime_t(1, 2) and reqid = (CLIENT(123), 456).
2248 void ECommitted::generate_test_instances(std::list
<ECommitted
*>& ls
)
2250 ls
.push_back(new ECommitted
);
2251 ls
.push_back(new ECommitted
);
2252 ls
.back()->stamp
= utime_t(1, 2);
2253 ls
.back()->reqid
= metareqid_t(entity_name_t::CLIENT(123), 456);
2256 // -----------------------
// Serializes a link_rollback into `bl` under an ENCODE_START(3, 2, bl)
// header. Visible fields, in order: was_inc, old_ctime, old_dir_mtime,
// old_dir_rctime.
// NOTE(review): other encoded fields may exist on lines not shown here.
2259 void link_rollback::encode(bufferlist
&bl
) const
2261 ENCODE_START(3, 2, bl
);
2264 encode(was_inc
, bl
);
2265 encode(old_ctime
, bl
);
2266 encode(old_dir_mtime
, bl
);
2267 encode(old_dir_rctime
, bl
);
// Deserializes a link_rollback from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl). Visible fields are read in the
// same order encode() writes them: was_inc, old_ctime, old_dir_mtime,
// old_dir_rctime.
2272 void link_rollback::decode(bufferlist::const_iterator
&bl
)
2274 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
2277 decode(was_inc
, bl
);
2278 decode(old_ctime
, bl
);
2279 decode(old_dir_mtime
, bl
);
2280 decode(old_dir_rctime
, bl
);
// Writes this rollback record to Formatter `f`: reqid, ino, was_inc rendered
// as "true"/"false", and the three saved timestamps.
2286 void link_rollback::dump(Formatter
*f
) const
2288 f
->dump_stream("metareqid") << reqid
;
2289 f
->dump_int("ino", ino
);
2290 f
->dump_string("was incremented", was_inc
? "true" : "false");
2291 f
->dump_stream("old_ctime") << old_ctime
;
2292 f
->dump_stream("old_dir_mtime") << old_dir_mtime
;
2293 f
->dump_stream("old_dir_rctime") << old_dir_rctime
;
// Appends one default-constructed, heap-allocated link_rollback to `ls`.
2296 void link_rollback::generate_test_instances(std::list
<link_rollback
*>& ls
)
2298 ls
.push_back(new link_rollback());
// Serializes an rmdir_rollback into `bl` under an ENCODE_START(3, 2, bl)
// header. Visible fields, in order: src_dir, src_dname, dest_dir, dest_dname.
// NOTE(review): other encoded fields may exist on lines not shown here.
2301 void rmdir_rollback::encode(bufferlist
& bl
) const
2303 ENCODE_START(3, 2, bl
);
2305 encode(src_dir
, bl
);
2306 encode(src_dname
, bl
);
2307 encode(dest_dir
, bl
);
2308 encode(dest_dname
, bl
);
// Deserializes an rmdir_rollback from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl). Visible fields are read in the
// same order encode() writes them: src_dir, src_dname, dest_dir, dest_dname.
2313 void rmdir_rollback::decode(bufferlist::const_iterator
& bl
)
2315 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
2317 decode(src_dir
, bl
);
2318 decode(src_dname
, bl
);
2319 decode(dest_dir
, bl
);
2320 decode(dest_dname
, bl
);
// Writes this rollback record to Formatter `f`: reqid plus the source and
// destination directory and dentry name.
2326 void rmdir_rollback::dump(Formatter
*f
) const
2328 f
->dump_stream("metareqid") << reqid
;
2329 f
->dump_stream("source directory") << src_dir
;
2330 f
->dump_string("source dname", src_dname
);
2331 f
->dump_stream("destination directory") << dest_dir
;
2332 f
->dump_string("destination dname", dest_dname
);
// Appends one default-constructed, heap-allocated rmdir_rollback to `ls`.
2335 void rmdir_rollback::generate_test_instances(std::list
<rmdir_rollback
*>& ls
)
2337 ls
.push_back(new rmdir_rollback());
// Serializes one dentry record (drec) of a rename rollback into `bl` under an
// ENCODE_START(2, 2, bl) header. Visible fields, in order: dirfrag,
// dirfrag_old_mtime, dirfrag_old_rctime, remote_ino, remote_d_type,
// old_ctime.
// NOTE(review): other encoded fields may exist on lines not shown here.
2340 void rename_rollback::drec::encode(bufferlist
&bl
) const
2342 ENCODE_START(2, 2, bl
);
2343 encode(dirfrag
, bl
);
2344 encode(dirfrag_old_mtime
, bl
);
2345 encode(dirfrag_old_rctime
, bl
);
2347 encode(remote_ino
, bl
);
2349 encode(remote_d_type
, bl
);
2350 encode(old_ctime
, bl
);
// Deserializes one dentry record (drec) from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl). Visible fields are read in the
// same order drec::encode() writes them.
2354 void rename_rollback::drec::decode(bufferlist::const_iterator
&bl
)
2356 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2357 decode(dirfrag
, bl
);
2358 decode(dirfrag_old_mtime
, bl
);
2359 decode(dirfrag_old_rctime
, bl
);
2361 decode(remote_ino
, bl
);
2363 decode(remote_d_type
, bl
);
2364 decode(old_ctime
, bl
);
// Writes one dentry record to Formatter `f`.
// remote_d_type is converted with DTTOIF() and masked with S_IFMT, then
// rendered as "file", "symlink", "directory", or "UNKNOWN-<n>" for any other
// value, and emitted as "remote dtype".
// NOTE(review): the switch/case labels mapping each type value to its string
// are on lines not visible here.
2368 void rename_rollback::drec::dump(Formatter
*f
) const
2370 f
->dump_stream("directory fragment") << dirfrag
;
2371 f
->dump_stream("directory old mtime") << dirfrag_old_mtime
;
2372 f
->dump_stream("directory old rctime") << dirfrag_old_rctime
;
2373 f
->dump_int("ino", ino
);
2374 f
->dump_int("remote ino", remote_ino
);
2375 f
->dump_string("dname", dname
);
2376 uint32_t type
= DTTOIF(remote_d_type
) & S_IFMT
; // convert to type entries
2380 type_string
= "file"; break;
2382 type_string
= "symlink"; break;
2384 type_string
= "directory"; break;
2386 type_string
= "UNKNOWN-" + stringify((int)type
); break;
2388 f
->dump_string("remote dtype", type_string
);
2389 f
->dump_stream("old ctime") << old_ctime
;
// Appends one heap-allocated drec to `ls` with remote_d_type set to the
// directory-entry type of a regular file (IFTODT(S_IFREG)), a value that
// drec::dump() renders as a known type.
2392 void rename_rollback::drec::generate_test_instances(std::list
<drec
*>& ls
)
2394 ls
.push_back(new drec());
2395 ls
.back()->remote_d_type
= IFTODT(S_IFREG
);
// Serializes a rename_rollback into `bl` under an ENCODE_START(3, 2, bl)
// header. Visible fields, in order: orig_src, orig_dest, srci_snapbl,
// desti_snapbl.
// NOTE(review): other encoded fields may exist on lines not shown here.
2398 void rename_rollback::encode(bufferlist
&bl
) const
2400 ENCODE_START(3, 2, bl
);
2402 encode(orig_src
, bl
);
2403 encode(orig_dest
, bl
);
2406 encode(srci_snapbl
, bl
);
2407 encode(desti_snapbl
, bl
);
// Deserializes a rename_rollback from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl).
// srci_snapbl and desti_snapbl are only read when struct_v >= 3, so older
// encodings that lack them still decode.
2411 void rename_rollback::decode(bufferlist::const_iterator
&bl
)
2413 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
2415 decode(orig_src
, bl
);
2416 decode(orig_dest
, bl
);
2419 if (struct_v
>= 3) {
2420 decode(srci_snapbl
, bl
);
2421 decode(desti_snapbl
, bl
);
// Writes this rollback record to Formatter `f`: reqid, three object sections
// ("original src drec", "original dest drec", "stray drec"), then ctime.
// NOTE(review): the calls that fill each section are not visible here.
2426 void rename_rollback::dump(Formatter
*f
) const
2428 f
->dump_stream("request id") << reqid
;
2429 f
->open_object_section("original src drec");
2431 f
->close_section(); // original src drec
2432 f
->open_object_section("original dest drec");
2434 f
->close_section(); // original dest drec
2435 f
->open_object_section("stray drec");
2437 f
->close_section(); // stray drec
2438 f
->dump_stream("ctime") << ctime
;
// Appends one heap-allocated rename_rollback to `ls`, with remote_d_type of
// its orig_src, orig_dest and stray records set to IFTODT(S_IFREG).
2441 void rename_rollback::generate_test_instances(std::list
<rename_rollback
*>& ls
)
2443 ls
.push_back(new rename_rollback());
2444 ls
.back()->orig_src
.remote_d_type
= IFTODT(S_IFREG
);
2445 ls
.back()->orig_dest
.remote_d_type
= IFTODT(S_IFREG
);
2446 ls
.back()->stray
.remote_d_type
= IFTODT(S_IFREG
);
// Serializes this ESlaveUpdate into `bl` under an ENCODE_START(3, 3, bl)
// header. Visible fields: commit (encoded with `features`) and rollback.
// NOTE(review): other encoded fields may exist on lines not shown here.
2449 void ESlaveUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2451 ENCODE_START(3, 3, bl
);
2458 encode(commit
, bl
, features
);
2459 encode(rollback
, bl
);
// Deserializes an ESlaveUpdate from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl). The only field read on a
// visible line is rollback.
// NOTE(review): the remaining decode calls are not visible in this excerpt.
2463 void ESlaveUpdate::decode(bufferlist::const_iterator
&bl
)
2465 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2474 decode(rollback
, bl
);
// Writes this event to Formatter `f`: a "metablob" object section, the
// rollback buffer length, type, reqid, master rank, op and original op.
// NOTE(review): the contents emitted inside the "metablob" section are not
// visible in this excerpt.
2478 void ESlaveUpdate::dump(Formatter
*f
) const
2480 f
->open_object_section("metablob");
2482 f
->close_section(); // metablob
2484 f
->dump_int("rollback length", rollback
.length());
2485 f
->dump_string("type", type
);
2486 f
->dump_stream("metareqid") << reqid
;
2487 f
->dump_int("master", master
);
2488 f
->dump_int("op", op
);
2489 f
->dump_int("original op", origop
);
// Appends one default-constructed, heap-allocated ESlaveUpdate to `ls`.
2492 void ESlaveUpdate::generate_test_instances(std::list
<ESlaveUpdate
*>& ls
)
2494 ls
.push_back(new ESlaveUpdate());
// Replays a slave-side update record; behaviour depends on the op:
//  - OP_PREPARE: builds an MDSlaveUpdate holding origop and the rollback
//    blob, replays the commit metablob with it, and registers the request
//    with mdcache->add_uncommitted_slave() so it can be resolved later.
//  - OP_COMMIT: marks the uncommitted slave request finished.
//  - OP_ROLLBACK: replays the commit metablob (which, per the log message,
//    holds the rollback) and marks the request finished.
//  - anything else: reports "invalid op in ESlaveUpdate" to the cluster log
//    and reaches ceph_abort().
// NOTE(review): `su` is allocated with raw `new`; ownership presumably passes
// to the mdcache via add_uncommitted_slave -- confirm. The switch header, the
// `break`s and the damaged() call referenced by the comment are not visible.
2497 void ESlaveUpdate::replay(MDSRank
*mds
)
2500 auto&& segment
= get_segment();
2502 case ESlaveUpdate::OP_PREPARE
:
2503 dout(10) << "ESlaveUpdate.replay prepare " << reqid
<< " for mds." << master
2504 << ": applying commit, saving rollback info" << dendl
;
2505 su
= new MDSlaveUpdate(origop
, rollback
);
2506 commit
.replay(mds
, segment
, su
);
2507 mds
->mdcache
->add_uncommitted_slave(reqid
, segment
, master
, su
);
2510 case ESlaveUpdate::OP_COMMIT
:
2511 dout(10) << "ESlaveUpdate.replay commit " << reqid
<< " for mds." << master
<< dendl
;
2512 mds
->mdcache
->finish_uncommitted_slave(reqid
, false);
2515 case ESlaveUpdate::OP_ROLLBACK
:
2516 dout(10) << "ESlaveUpdate.replay abort " << reqid
<< " for mds." << master
2517 << ": applying rollback commit blob" << dendl
;
2518 commit
.replay(mds
, segment
);
2519 mds
->mdcache
->finish_uncommitted_slave(reqid
, false);
2523 mds
->clog
->error() << "invalid op in ESlaveUpdate";
2525 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2530 // -----------------------
// Serializes this ESubtreeMap into `bl` under an ENCODE_START(6, 5, bl)
// header. Visible fields, in order: metablob (encoded with `features`),
// subtrees, ambiguous_subtrees, expire_pos, event_seq.
// NOTE(review): other encoded fields may exist on lines not shown here.
2533 void ESubtreeMap::encode(bufferlist
& bl
, uint64_t features
) const
2535 ENCODE_START(6, 5, bl
);
2537 encode(metablob
, bl
, features
);
2538 encode(subtrees
, bl
);
2539 encode(ambiguous_subtrees
, bl
);
2540 encode(expire_pos
, bl
);
2541 encode(event_seq
, bl
);
// Deserializes an ESubtreeMap from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl). Visible fields are read in the
// same order encode() writes them.
// NOTE(review): the gaps between the decode calls suggest some reads are
// version-gated on lines not visible here -- confirm.
2545 void ESubtreeMap::decode(bufferlist::const_iterator
&bl
)
2547 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl
);
2550 decode(metablob
, bl
);
2551 decode(subtrees
, bl
);
2553 decode(ambiguous_subtrees
, bl
);
2555 decode(expire_pos
, bl
);
2557 decode(event_seq
, bl
);
// Writes this event to Formatter `f`:
//  - a "metablob" object section,
//  - a "subtrees" array with one "tree" object per entry of `subtrees`,
//    holding its root dirfrag and each bound dirfrag,
//  - an "ambiguous subtrees" array of dirfrags,
//  - the expire position.
// event_seq is not dumped by any line visible here.
2561 void ESubtreeMap::dump(Formatter
*f
) const
2563 f
->open_object_section("metablob");
2565 f
->close_section(); // metablob
2567 f
->open_array_section("subtrees");
2568 for(map
<dirfrag_t
,vector
<dirfrag_t
> >::const_iterator i
= subtrees
.begin();
2569 i
!= subtrees
.end(); ++i
) {
2570 f
->open_object_section("tree");
2571 f
->dump_stream("root dirfrag") << i
->first
;
2572 for (vector
<dirfrag_t
>::const_iterator j
= i
->second
.begin();
2573 j
!= i
->second
.end(); ++j
) {
2574 f
->dump_stream("bound dirfrag") << *j
;
2576 f
->close_section(); // tree
2578 f
->close_section(); // subtrees
2580 f
->open_array_section("ambiguous subtrees");
2581 for(set
<dirfrag_t
>::const_iterator i
= ambiguous_subtrees
.begin();
2582 i
!= ambiguous_subtrees
.end(); ++i
) {
2583 f
->dump_stream("dirfrag") << *i
;
2585 f
->close_section(); // ambiguous subtrees
2587 f
->dump_int("expire position", expire_pos
);
// Appends one default-constructed, heap-allocated ESubtreeMap to `ls`.
2590 void ESubtreeMap::generate_test_instances(std::list
<ESubtreeMap
*>& ls
)
2592 ls
.push_back(new ESubtreeMap());
// Replays a subtree map event.
//
// First, if this event carries a non-zero expire_pos that is ahead of the
// journaler's, the journaler's expire position is advanced to it.
//
// Then one of two paths runs, depending on mdcache->is_subtrees():
//
//  * Cache already has subtrees -> verification only. Each journaled subtree
//    root is checked to be in cache, to be a subtree there, and to be owned
//    by this rank; journaled bounds are forced open and compared with the
//    cache's bounds (missing and extra bounds are both reported); the
//    ambiguous-import state is compared in both directions; finally every
//    cache subtree owned by this rank is checked to appear in the journaled
//    map. Each mismatch is written to the cluster log. The journaled maps and
//    the cache's subtrees are then printed, and the assert fires only when
//    mds_debug_subtrees is set and `errors` is non-zero.
//    NOTE(review): the declaration and increments of `errors`, the
//    declaration of `bounds`, and the continue/erase statements between the
//    checks are on lines not visible here.
//
//  * Otherwise -> reconstruction. The metablob is replayed to put the
//    spanning tree in cache, then each journaled subtree is re-registered:
//    ambiguous ones are added as ambiguous imports with authority
//    (self, self), the rest get authority set to this rank. Auth bits are
//    then recalculated and the subtrees printed.
2595 void ESubtreeMap::replay(MDSRank
*mds
)
2597 if (expire_pos
&& expire_pos
> mds
->mdlog
->journaler
->get_expire_pos())
2598 mds
->mdlog
->journaler
->set_expire_pos(expire_pos
);
2600 // suck up the subtree map?
2601 if (mds
->mdcache
->is_subtrees()) {
2602 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl
;
2605 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2606 p
!= subtrees
.end();
2608 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2610 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2611 << " subtree root " << p
->first
<< " not in cache";
2616 if (!mds
->mdcache
->is_subtree(dir
)) {
2617 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2618 << " subtree root " << p
->first
<< " not a subtree in cache";
2622 if (dir
->get_dir_auth().first
!= mds
->get_nodeid()) {
2623 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2624 << " subtree root " << p
->first
2625 << " is not mine in cache (it's " << dir
->get_dir_auth() << ")";
2630 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
2631 mds
->mdcache
->get_force_dirfrag(*q
, true);
2634 mds
->mdcache
->get_subtree_bounds(dir
, bounds
);
2635 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
2636 CDir
*b
= mds
->mdcache
->get_dirfrag(*q
);
2638 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2639 << " subtree " << p
->first
<< " bound " << *q
<< " not in cache";
2643 if (bounds
.count(b
) == 0) {
2644 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2645 << " subtree " << p
->first
<< " bound " << *q
<< " not a bound in cache";
2651 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
) {
2652 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2653 << " subtree " << p
->first
<< " has extra bound in cache " << (*q
)->dirfrag();
2657 if (ambiguous_subtrees
.count(p
->first
)) {
2658 if (!mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2659 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2660 << " subtree " << p
->first
<< " is ambiguous but is not in our cache";
2664 if (mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2665 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2666 << " subtree " << p
->first
<< " is not ambiguous but is in our cache";
2672 std::vector
<CDir
*> dirs
;
2673 mds
->mdcache
->get_subtrees(dirs
);
2674 for (const auto& dir
: dirs
) {
2675 if (dir
->get_dir_auth().first
!= mds
->get_nodeid())
2677 if (subtrees
.count(dir
->dirfrag()) == 0) {
2678 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2679 << " does not include cache subtree " << dir
->dirfrag();
2685 dout(0) << "journal subtrees: " << subtrees
<< dendl
;
2686 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees
<< dendl
;
2687 mds
->mdcache
->show_subtrees();
2688 ceph_assert(!g_conf()->mds_debug_subtrees
|| errors
== 0);
2693 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl
;
2695 // first, stick the spanning tree in my cache
2696 //metablob.print(*_dout);
2697 metablob
.replay(mds
, get_segment());
2699 // restore import/export maps
2700 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2701 p
!= subtrees
.end();
2703 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2705 if (ambiguous_subtrees
.count(p
->first
)) {
2707 mds
->mdcache
->add_ambiguous_import(p
->first
, p
->second
);
2708 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
,
2709 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2712 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
, mds
->get_nodeid());
2716 mds
->mdcache
->recalc_auth_bits(true);
2718 mds
->mdcache
->show_subtrees();
2723 // -----------------------
// Replays a directory fragmentation event for inode `ino`, base fragment
// `basefrag`, split/merge amount `bits`.
//
// The inode is looked up first and may legitimately be null (see the inline
// comment). The calls visible below are:
//  - add_uncommitted_fragment(...) followed by adjust_dir_fragments(in,
//    basefrag, bits, ...) to apply the refragmentation;
//  - a rollback path that collects the current leaves under basefrag into
//    old_frags, then either undoes the change with adjust_dir_fragments(...,
//    -bits, ...) when orig_frags is empty (old-format event) or forces each
//    original fragment back, and finally calls
//    rollback_uncommitted_fragment();
//  - finish_uncommitted_fragment(dirfrag, op).
// The metablob is replayed afterwards, and the inode's dirfrags are verified
// when mds_debug_frag is set and the inode is in cache.
// NOTE(review): the switch on `op` that selects between these paths, and the
// null checks on `in`, are on lines not visible here -- confirm.
2726 void EFragment::replay(MDSRank
*mds
)
2728 dout(10) << "EFragment.replay " << op_name(op
) << " " << ino
<< " " << basefrag
<< " by " << bits
<< dendl
;
2730 std::vector
<CDir
*> resultfrags
;
2731 MDSContext::vec waiters
;
2733 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2734 // it will be once we replay the metablob , but first we need to
2735 // refragment anything we already have in the cache.
2736 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2738 auto&& segment
= get_segment();
2741 mds
->mdcache
->add_uncommitted_fragment(dirfrag_t(ino
, basefrag
), bits
, orig_frags
, segment
, &rollback
);
2744 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, bits
, &resultfrags
, waiters
, true);
2748 frag_vec_t old_frags
;
2750 in
->dirfragtree
.get_leaves_under(basefrag
, old_frags
);
2751 if (orig_frags
.empty()) {
2752 // old format EFragment
2753 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, -bits
, &resultfrags
, waiters
, true);
2755 for (const auto& fg
: orig_frags
)
2756 mds
->mdcache
->force_dir_fragment(in
, fg
);
2759 mds
->mdcache
->rollback_uncommitted_fragment(dirfrag_t(ino
, basefrag
), std::move(old_frags
));
2765 mds
->mdcache
->finish_uncommitted_fragment(dirfrag_t(ino
, basefrag
), op
);
2772 metablob
.replay(mds
, segment
);
2773 if (in
&& g_conf()->mds_debug_frag
)
2774 in
->verify_dirfrags();
// Serializes this EFragment into `bl` under an ENCODE_START(5, 4, bl) header.
// Visible fields, in order: basefrag, metablob (encoded with `features`),
// orig_frags, rollback.
// NOTE(review): other encoded fields may exist on lines not shown here.
2777 void EFragment::encode(bufferlist
&bl
, uint64_t features
) const {
2778 ENCODE_START(5, 4, bl
);
2782 encode(basefrag
, bl
);
2784 encode(metablob
, bl
, features
);
2785 encode(orig_frags
, bl
);
2786 encode(rollback
, bl
);
// Deserializes an EFragment from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl).
// orig_frags and rollback are only read when struct_v >= 5; replay() treats
// an empty orig_frags as an old-format event.
2790 void EFragment::decode(bufferlist::const_iterator
&bl
) {
2791 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl
);
2797 decode(basefrag
, bl
);
2799 decode(metablob
, bl
);
2800 if (struct_v
>= 5) {
2801 decode(orig_frags
, bl
);
2802 decode(rollback
, bl
);
// Writes this event to Formatter `f`: the op name, ino, base fragment and
// bits. The metablob is deliberately not dumped (the code is commented out).
2807 void EFragment::dump(Formatter
*f
) const
2809 /*f->open_object_section("Metablob");
2810 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2811 f->close_section();*/
2812 f
->dump_string("op", op_name(op
));
2813 f
->dump_stream("ino") << ino
;
2814 f
->dump_stream("base frag") << basefrag
;
2815 f
->dump_int("bits", bits
);
// Appends two heap-allocated EFragment samples to `ls`: a default one, and
// one with op = OP_PREPARE and bits = 5.
// NOTE(review): another field assignment on the second sample may sit on a
// line not visible here.
2818 void EFragment::generate_test_instances(std::list
<EFragment
*>& ls
)
2820 ls
.push_back(new EFragment
);
2821 ls
.push_back(new EFragment
);
2822 ls
.back()->op
= OP_PREPARE
;
2824 ls
.back()->bits
= 5;
// Serializes a dirfrag_rollback into `bl` under an ENCODE_START(1, 1, bl)
// header.
// NOTE(review): the per-field encode calls are not visible in this excerpt.
2827 void dirfrag_rollback::encode(bufferlist
&bl
) const
2829 ENCODE_START(1, 1, bl
);
// Deserializes a dirfrag_rollback from `bl`, opened with DECODE_START(1, bl).
// NOTE(review): the per-field decode calls are not visible in this excerpt.
2834 void dirfrag_rollback::decode(bufferlist::const_iterator
&bl
)
2836 DECODE_START(1, bl
);
2843 // =========================================================================
2845 // -----------------------
// Replays a completed subtree export.
// After replaying the metablob, the exported base dirfrag and each bound in
// `bounds` are resolved to cached CDirs (the bounds collected in
// `realbounds`); authority over the bounded subtree is then set to
// CDIR_AUTH_UNDEF and the cache tries to trim the now non-auth subtree.
// NOTE(review): any null checks/asserts on `dir` and `bd` are on lines not
// visible here.
2848 void EExport::replay(MDSRank
*mds
)
2850 dout(10) << "EExport.replay " << base
<< dendl
;
2851 auto&& segment
= get_segment();
2852 metablob
.replay(mds
, segment
);
2854 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2857 set
<CDir
*> realbounds
;
2858 for (set
<dirfrag_t
>::iterator p
= bounds
.begin();
2861 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2863 realbounds
.insert(bd
);
2867 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
, CDIR_AUTH_UNDEF
);
2869 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
// Serializes this EExport into `bl` under an ENCODE_START(4, 3, bl) header.
// The only field written on a visible line is metablob (with `features`).
// NOTE(review): the remaining encode calls are not visible in this excerpt.
2872 void EExport::encode(bufferlist
& bl
, uint64_t features
) const
2874 ENCODE_START(4, 3, bl
);
2876 encode(metablob
, bl
, features
);
// Deserializes an EExport from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl). The only field read on a
// visible line is metablob.
// NOTE(review): encode() writes struct version 4 while this declares 3;
// confirm that any version-4 field is read behind a struct_v check.
2883 void EExport::decode(bufferlist::const_iterator
&bl
)
2885 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2888 decode(metablob
, bl
);
// Writes this event to Formatter `f`: the stamp as a floating-point value,
// the base dirfrag, and a "bounds dirfrags" array with one entry per bound.
// The metablob is deliberately not dumped (the code is commented out).
2896 void EExport::dump(Formatter
*f
) const
2898 f
->dump_float("stamp", (double)stamp
);
2899 /*f->open_object_section("Metablob");
2900 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2901 f->close_section();*/
2902 f
->dump_stream("base dirfrag") << base
;
2903 f
->open_array_section("bounds dirfrags");
2904 for (set
<dirfrag_t
>::const_iterator i
= bounds
.begin();
2905 i
!= bounds
.end(); ++i
) {
2906 f
->dump_stream("dirfrag") << *i
;
2908 f
->close_section(); // bounds dirfrags
// Appends one default-constructed, heap-allocated EExport to `ls`.
2911 void EExport::generate_test_instances(std::list
<EExport
*>& ls
)
2913 EExport
*sample
= new EExport();
2914 ls
.push_back(sample
);
2918 // -----------------------
// Records this event's client map version (cmapv) as the sessionmapv of its
// log segment.
2921 void EImportStart::update_segment()
2923 get_segment()->sessionmapv
= cmapv
;
// Replays the start of a subtree import.
// 1. Replays the metablob.
// 2. Registers (base, bounds) as an ambiguous import.
// 3. Resolves the base dirfrag and each bound to cached CDirs; a bound that
//    is not a subtree root has its STATE_AUTH flag cleared. The bounded
//    subtree's authority is then set to (self, self) so it is not trimmed
//    while the import is unresolved.
// 4. If the session map is already at or past cmapv nothing more is done;
//    otherwise the attached client map is read via `blp` into cm/cmm (decode
//    calls not visible here) and the sessions are re-opened with
//    sessionmap.replay_open_sessions(cmapv, cm, cmm).
// NOTE(review): any null checks/asserts on `dir` and `bd` are on lines not
// visible here.
2926 void EImportStart::replay(MDSRank
*mds
)
2928 dout(10) << "EImportStart.replay " << base
<< " bounds " << bounds
<< dendl
;
2929 //metablob.print(*_dout);
2930 auto&& segment
= get_segment();
2931 metablob
.replay(mds
, segment
);
2933 // put in ambiguous import list
2934 mds
->mdcache
->add_ambiguous_import(base
, bounds
);
2936 // set auth partially to us so we don't trim it
2937 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2940 set
<CDir
*> realbounds
;
2941 for (vector
<dirfrag_t
>::iterator p
= bounds
.begin();
2944 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2946 if (!bd
->is_subtree_root())
2947 bd
->state_clear(CDir::STATE_AUTH
);
2948 realbounds
.insert(bd
);
2951 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
,
2952 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2954 // open client sessions?
2955 if (mds
->sessionmap
.get_version() >= cmapv
) {
2956 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2957 << " >= " << cmapv
<< ", noop" << dendl
;
2959 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2960 << " < " << cmapv
<< dendl
;
2961 map
<client_t
,entity_inst_t
> cm
;
2962 map
<client_t
,client_metadata_t
> cmm
;
2963 auto blp
= client_map
.cbegin();
2968 mds
->sessionmap
.replay_open_sessions(cmapv
, cm
, cmm
);
// Serializes this EImportStart into `bl` under an ENCODE_START(4, 3, bl)
// header. Visible fields: metablob (encoded with `features`) and client_map.
// NOTE(review): other encoded fields may exist on lines not shown here.
2973 void EImportStart::encode(bufferlist
&bl
, uint64_t features
) const {
2974 ENCODE_START(4, 3, bl
);
2977 encode(metablob
, bl
, features
);
2980 encode(client_map
, bl
);
// Deserializes an EImportStart from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl). Visible fields: metablob,
// client_map.
// NOTE(review): encode() writes struct version 4 while this declares 3;
// confirm that any version-4 field is read behind a struct_v check.
2985 void EImportStart::decode(bufferlist::const_iterator
&bl
) {
2986 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2990 decode(metablob
, bl
);
2993 decode(client_map
, bl
);
// Writes this event to Formatter `f`: the base dirfrag and a
// "boundary dirfrags" array with one "frag" entry per bound.
// NOTE(review): the matching close_section() is not on a visible line.
2999 void EImportStart::dump(Formatter
*f
) const
3001 f
->dump_stream("base dirfrag") << base
;
3002 f
->open_array_section("boundary dirfrags");
3003 for (vector
<dirfrag_t
>::const_iterator iter
= bounds
.begin();
3004 iter
!= bounds
.end(); ++iter
) {
3005 f
->dump_stream("frag") << *iter
;
// Appends one default-constructed, heap-allocated EImportStart to `ls`.
3010 void EImportStart::generate_test_instances(std::list
<EImportStart
*>& ls
)
3012 ls
.push_back(new EImportStart
);
3015 // -----------------------
// Replays the outcome of a subtree import.
// If `base` is currently tracked as an ambiguous import, one of two things
// is done:
//  - finish_ambiguous_import(base), or
//  - the cancel path: fetch the import's bounds, set authority over the
//    bounded subtree to CDIR_AUTH_UNDEF, cancel the ambiguous import and try
//    to trim the now non-auth subtree.
//  NOTE(review): the branch choosing between them (presumably on `success`)
//  is on lines not visible here -- confirm.
// If `base` is not an ambiguous import, the event is logged, an error is sent
// to the cluster log, and execution reaches ceph_abort().
3018 void EImportFinish::replay(MDSRank
*mds
)
3020 if (mds
->mdcache
->have_ambiguous_import(base
)) {
3021 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
<< dendl
;
3023 mds
->mdcache
->finish_ambiguous_import(base
);
3025 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
3027 vector
<dirfrag_t
> bounds
;
3028 mds
->mdcache
->get_ambiguous_import_bounds(base
, bounds
);
3029 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, bounds
, CDIR_AUTH_UNDEF
);
3030 mds
->mdcache
->cancel_ambiguous_import(dir
);
3031 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
3034 // this shouldn't happen unless this is an old journal
3035 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
3036 << " on subtree not marked as ambiguous"
3038 mds
->clog
->error() << "failure replaying journal (EImportFinish)";
3040 ceph_abort(); // Should be unreachable because damaged() calls respawn()
// Serializes this EImportFinish into `bl` under an ENCODE_START(3, 3, bl)
// header. The only field written on a visible line is `success`; `features`
// is not used by any visible line.
// NOTE(review): other encoded fields may exist on lines not shown here.
3044 void EImportFinish::encode(bufferlist
& bl
, uint64_t features
) const
3046 ENCODE_START(3, 3, bl
);
3049 encode(success
, bl
);
// Deserializes an EImportFinish from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl). The only field read on a
// visible line is `success`.
3053 void EImportFinish::decode(bufferlist::const_iterator
&bl
)
3055 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
3059 decode(success
, bl
);
// Writes the base dirfrag and `success` (as "true"/"false") to Formatter `f`.
3063 void EImportFinish::dump(Formatter
*f
) const
3065 f
->dump_stream("base dirfrag") << base
;
3066 f
->dump_string("success", success
? "true" : "false");
// Appends two heap-allocated EImportFinish samples to `ls`: a default one,
// and one with success = true.
3068 void EImportFinish::generate_test_instances(std::list
<EImportFinish
*>& ls
)
3070 ls
.push_back(new EImportFinish
);
3071 ls
.push_back(new EImportFinish
);
3072 ls
.back()->success
= true;
3076 // ------------------------
// Serializes this EResetJournal into `bl` under an ENCODE_START(2, 2, bl)
// header. `features` is not used by any line visible here.
// NOTE(review): the per-field encode calls are not visible in this excerpt.
3079 void EResetJournal::encode(bufferlist
& bl
, uint64_t features
) const
3081 ENCODE_START(2, 2, bl
);
// Deserializes an EResetJournal from `bl`, opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl).
// NOTE(review): the per-field decode calls are not visible in this excerpt.
3086 void EResetJournal::decode(bufferlist::const_iterator
&bl
)
3088 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
// Writes this event's `stamp` to Formatter `f` under the key "timestamp".
3093 void EResetJournal::dump(Formatter
*f
) const
3095 f
->dump_stream("timestamp") << stamp
;
// Appends one default-constructed, heap-allocated EResetJournal to `ls`.
3098 void EResetJournal::generate_test_instances(std::list
<EResetJournal
*>& ls
)
3100 ls
.push_back(new EResetJournal());
// Replays a journal reset.
// The session map is wiped and the inode table is told to reset for replay.
// If this rank is the one the MDS map names as root, the root inode's
// top-level dirfrag (frag_t()) is opened and this rank is made its subtree
// authority; the same is done for this rank's own "my" directory.
// Auth bits are then recalculated and the resulting subtrees printed.
3103 void EResetJournal::replay(MDSRank
*mds
)
3105 dout(1) << "EResetJournal" << dendl
;
3107 mds
->sessionmap
.wipe();
3108 mds
->inotable
->replay_reset();
3110 if (mds
->mdsmap
->get_root() == mds
->get_nodeid()) {
3111 CDir
*rootdir
= mds
->mdcache
->get_root()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3112 mds
->mdcache
->adjust_subtree_auth(rootdir
, mds
->get_nodeid());
3115 CDir
*mydir
= mds
->mdcache
->get_myin()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3116 mds
->mdcache
->adjust_subtree_auth(mydir
, mds
->get_nodeid());
3118 mds
->mdcache
->recalc_auth_bits(true);
3120 mds
->mdcache
->show_subtrees();
// Serializes a padding event into `bl` under an ENCODE_START(2, 2, bl)
// header: pad_size is written first, followed by a loop that runs pad_size
// times with a 0xff pad byte declared for it.
// NOTE(review): the loop body is not visible here; presumably it encodes
// `pad` once per iteration -- confirm. `features` is not used on any visible
// line.
3124 void ENoOp::encode(bufferlist
&bl
, uint64_t features
) const
3126 ENCODE_START(2, 2, bl
);
3127 encode(pad_size
, bl
);
3128 uint8_t const pad
= 0xff;
3129 for (unsigned int i
= 0; i
< pad_size
; ++i
) {
// Deserializes a padding event from `bl`, opened with DECODE_START(2, bl).
// After reading pad_size, the number of bytes remaining in the iterator must
// equal pad_size exactly; otherwise buffer::end_of_buffer is thrown so that
// journal tools can detect the malformed entry instead of asserting.
// NOTE(review): how the padding bytes are consumed afterwards is on lines not
// visible here.
3136 void ENoOp::decode(bufferlist::const_iterator
&bl
)
3138 DECODE_START(2, bl
);
3139 decode(pad_size
, bl
);
3140 if (bl
.get_remaining() != pad_size
) {
3141 // This is spiritually an assertion, but expressing in a way that will let
3142 // journal debug tools catch it and recognise a malformed entry.
3143 throw buffer::end_of_buffer();
// Replaying a padding event changes no state; it only logs how many bytes
// of padding were skipped.
3151 void ENoOp::replay(MDSRank
*mds
)
3153 dout(4) << "ENoOp::replay, " << pad_size
<< " bytes skipped in journal" << dendl
;
3157 * If re-formatting an old journal that used absolute log position
3158 * references as segment sequence numbers, use this function to update
3162 * MDSRank instance, just used for logging
3164 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3167 * True if the event was modified.
3169 bool EMetaBlob::rewrite_truncate_finish(MDSRank
const *mds
,
3170 std::map
<LogSegment::seq_t
, LogSegment::seq_t
> const &old_to_new
)
3172 bool modified
= false;
3173 map
<inodeno_t
, LogSegment::seq_t
> new_trunc_finish
;
3174 for (const auto& p
: truncate_finish
) {
3175 auto q
= old_to_new
.find(p
.second
);
3176 if (q
!= old_to_new
.end()) {
3177 dout(20) << __func__
<< " applying segment seq mapping "
3178 << p
.second
<< " -> " << q
->second
<< dendl
;
3179 new_trunc_finish
.emplace(p
.first
, q
->second
);
3182 dout(20) << __func__
<< " no segment seq mapping found for "
3183 << p
.second
<< dendl
;
3184 new_trunc_finish
.insert(p
);
3187 truncate_finish
.swap(new_trunc_finish
);