1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
25 #include "events/EUpdate.h"
26 #include "events/ESlaveUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
29 #include "events/EPurged.h"
31 #include "events/EExport.h"
32 #include "events/EImportStart.h"
33 #include "events/EImportFinish.h"
34 #include "events/EFragment.h"
36 #include "events/ETableClient.h"
37 #include "events/ETableServer.h"
39 #include "include/stringify.h"
41 #include "LogSegment.h"
51 #include "MDSTableClient.h"
52 #include "MDSTableServer.h"
56 #define dout_context g_ceph_context
57 #define dout_subsys ceph_subsys_mds
59 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
62 // -----------------------
65 void LogSegment::try_to_expire(MDSRank
*mds
, MDSGatherBuilder
&gather_bld
, int op_prio
)
69 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire" << dendl
;
71 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 1);
74 for (elist
<CDir
*>::iterator p
= new_dirfrags
.begin(); !p
.end(); ++p
) {
75 dout(20) << " new_dirfrag " << **p
<< dendl
;
76 ceph_assert((*p
)->is_auth());
79 for (elist
<CDir
*>::iterator p
= dirty_dirfrags
.begin(); !p
.end(); ++p
) {
80 dout(20) << " dirty_dirfrag " << **p
<< dendl
;
81 ceph_assert((*p
)->is_auth());
84 for (elist
<CDentry
*>::iterator p
= dirty_dentries
.begin(); !p
.end(); ++p
) {
85 dout(20) << " dirty_dentry " << **p
<< dendl
;
86 ceph_assert((*p
)->is_auth());
87 commit
.insert((*p
)->get_dir());
89 for (elist
<CInode
*>::iterator p
= dirty_inodes
.begin(); !p
.end(); ++p
) {
90 dout(20) << " dirty_inode " << **p
<< dendl
;
91 ceph_assert((*p
)->is_auth());
92 if ((*p
)->is_base()) {
93 (*p
)->store(gather_bld
.new_sub());
95 commit
.insert((*p
)->get_parent_dn()->get_dir());
98 if (!commit
.empty()) {
99 for (set
<CDir
*>::iterator p
= commit
.begin();
103 ceph_assert(dir
->is_auth());
104 if (dir
->can_auth_pin()) {
105 dout(15) << "try_to_expire committing " << *dir
<< dendl
;
106 dir
->commit(0, gather_bld
.new_sub(), false, op_prio
);
108 dout(15) << "try_to_expire waiting for unfreeze on " << *dir
<< dendl
;
109 dir
->add_waiter(CDir::WAIT_UNFREEZE
, gather_bld
.new_sub());
114 // master ops with possibly uncommitted slaves
115 for (set
<metareqid_t
>::iterator p
= uncommitted_masters
.begin();
116 p
!= uncommitted_masters
.end();
118 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p
<< dendl
;
119 mds
->mdcache
->wait_for_uncommitted_master(*p
, gather_bld
.new_sub());
122 // uncommitted fragments
123 for (set
<dirfrag_t
>::iterator p
= uncommitted_fragments
.begin();
124 p
!= uncommitted_fragments
.end();
126 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p
<< dendl
;
127 mds
->mdcache
->wait_for_uncommitted_fragment(*p
, gather_bld
.new_sub());
130 // nudge scatterlocks
131 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dir
.begin(); !p
.end(); ++p
) {
133 dout(10) << "try_to_expire waiting for dirlock flush on " << *in
<< dendl
;
134 mds
->locker
->scatter_nudge(&in
->filelock
, gather_bld
.new_sub());
136 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dirfragtree
.begin(); !p
.end(); ++p
) {
138 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in
<< dendl
;
139 mds
->locker
->scatter_nudge(&in
->dirfragtreelock
, gather_bld
.new_sub());
141 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_nest
.begin(); !p
.end(); ++p
) {
143 dout(10) << "try_to_expire waiting for nest flush on " << *in
<< dendl
;
144 mds
->locker
->scatter_nudge(&in
->nestlock
, gather_bld
.new_sub());
147 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 2);
149 // open files and snap inodes
150 if (!open_files
.empty()) {
151 ceph_assert(!mds
->mdlog
->is_capped()); // hmm FIXME
153 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
154 ceph_assert(ls
!= this);
155 elist
<CInode
*>::iterator p
= open_files
.begin(member_offset(CInode
, item_open_file
));
159 if (in
->last
!= CEPH_NOSNAP
&& in
->is_auth() && !in
->client_snap_caps
.empty()) {
160 // journal snap inodes that need flush. This simplifies the mds failover handling
161 dout(20) << "try_to_expire requeueing snap needflush inode " << *in
<< dendl
;
163 le
= new EOpen(mds
->mdlog
);
164 mds
->mdlog
->start_entry(le
);
166 le
->add_clean_inode(in
);
167 ls
->open_files
.push_back(&in
->item_open_file
);
169 // open files are tracked by open file table, no need to journal them again
170 in
->item_open_file
.remove_myself();
174 mds
->mdlog
->submit_entry(le
);
175 mds
->mdlog
->wait_for_safe(gather_bld
.new_sub());
176 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl
;
180 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 3);
182 // backtraces to be stored/updated
183 for (elist
<CInode
*>::iterator p
= dirty_parent_inodes
.begin(); !p
.end(); ++p
) {
185 ceph_assert(in
->is_auth());
186 if (in
->can_auth_pin()) {
187 dout(15) << "try_to_expire waiting for storing backtrace on " << *in
<< dendl
;
188 in
->store_backtrace(gather_bld
.new_sub(), op_prio
);
190 dout(15) << "try_to_expire waiting for unfreeze on " << *in
<< dendl
;
191 in
->add_waiter(CInode::WAIT_UNFREEZE
, gather_bld
.new_sub());
195 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 4);
198 for (elist
<MDSlaveUpdate
*>::iterator p
= slave_updates
.begin(member_offset(MDSlaveUpdate
,
201 MDSlaveUpdate
*su
= *p
;
202 dout(10) << "try_to_expire waiting on slave update " << su
<< dendl
;
203 ceph_assert(su
->waiter
== 0);
204 su
->waiter
= gather_bld
.new_sub();
208 if (inotablev
> mds
->inotable
->get_committed_version()) {
209 dout(10) << "try_to_expire saving inotable table, need " << inotablev
210 << ", committed is " << mds
->inotable
->get_committed_version()
211 << " (" << mds
->inotable
->get_committing_version() << ")"
213 mds
->inotable
->save(gather_bld
.new_sub(), inotablev
);
217 if (sessionmapv
> mds
->sessionmap
.get_committed()) {
218 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
219 << ", committed is " << mds
->sessionmap
.get_committed()
220 << " (" << mds
->sessionmap
.get_committing() << ")"
222 mds
->sessionmap
.save(gather_bld
.new_sub(), sessionmapv
);
225 // updates to sessions for completed_requests
226 mds
->sessionmap
.save_if_dirty(touched_sessions
, &gather_bld
);
227 touched_sessions
.clear();
229 // pending commit atids
230 for (map
<int, ceph::unordered_set
<version_t
> >::iterator p
= pending_commit_tids
.begin();
231 p
!= pending_commit_tids
.end();
233 MDSTableClient
*client
= mds
->get_table_client(p
->first
);
235 for (ceph::unordered_set
<version_t
>::iterator q
= p
->second
.begin();
236 q
!= p
->second
.end();
238 dout(10) << "try_to_expire " << get_mdstable_name(p
->first
) << " transaction " << *q
239 << " pending commit (not yet acked), waiting" << dendl
;
240 ceph_assert(!client
->has_committed(*q
));
241 client
->wait_for_ack(*q
, gather_bld
.new_sub());
246 for (map
<int, version_t
>::iterator p
= tablev
.begin();
249 MDSTableServer
*server
= mds
->get_table_server(p
->first
);
251 if (p
->second
> server
->get_committed_version()) {
252 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p
->first
)
253 << " to save, need " << p
->second
<< dendl
;
254 server
->save(gather_bld
.new_sub());
259 for (set
<CInode
*>::iterator p
= truncating_inodes
.begin();
260 p
!= truncating_inodes
.end();
262 dout(10) << "try_to_expire waiting for truncate of " << **p
<< dendl
;
263 (*p
)->add_waiter(CInode::WAIT_TRUNC
, gather_bld
.new_sub());
266 dout(10) << "try_to_expire waiting for purge of " << purge_inodes
<< dendl
;
267 if (purge_inodes
.size())
268 set_purged_cb(gather_bld
.new_sub());
270 if (gather_bld
.has_subs()) {
271 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire waiting" << dendl
;
274 ceph_assert(g_conf()->mds_kill_journal_expire_at
!= 5);
275 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire success" << dendl
;
279 // -----------------------
282 void EMetaBlob::add_dir_context(CDir
*dir
, int mode
)
284 MDSRank
*mds
= dir
->cache
->mds
;
286 list
<CDentry
*> parents
;
288 // it may be okay not to include the maybe items, if
289 // - we journaled the maybe child inode in this segment
290 // - that subtree turns out to be unambiguously auth
291 list
<CDentry
*> maybe
;
292 bool maybenot
= false;
295 // already have this dir? (we must always add in order)
296 if (lump_map
.count(dir
->dirfrag())) {
297 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") have lump " << dir
->dirfrag() << dendl
;
301 // stop at root/stray
302 CInode
*diri
= dir
->get_inode();
303 CDentry
*parent
= diri
->get_projected_parent_dn();
305 if (mode
== TO_AUTH_SUBTREE_ROOT
) {
307 if (dir
->is_subtree_root()) {
308 // match logic in MDCache::create_subtree_map()
309 if (dir
->get_dir_auth().first
== mds
->get_nodeid()) {
310 mds_authority_t parent_auth
= parent
? parent
->authority() : CDIR_AUTH_UNDEF
;
311 if (parent_auth
.first
== dir
->get_dir_auth().first
) {
312 if (parent_auth
.second
== CDIR_AUTH_UNKNOWN
&&
313 !dir
->is_ambiguous_dir_auth() &&
314 !dir
->state_test(CDir::STATE_EXPORTBOUND
) &&
315 !dir
->state_test(CDir::STATE_AUXSUBTREE
) &&
316 !diri
->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
317 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir
<< dendl
;
320 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") ambiguous or transient subtree " << dendl
;
322 // it's an auth subtree, we don't need maybe (if any), and we're done.
323 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached unambig auth subtree, don't need " << maybe
324 << " at " << *dir
<< dendl
;
329 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached ambig or !auth subtree, need " << maybe
330 << " at " << *dir
<< dendl
;
331 // we need the maybe list after all!
332 parents
.splice(parents
.begin(), maybe
);
337 // was the inode journaled in this blob?
338 if (event_seq
&& diri
->last_journaled
== event_seq
) {
339 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri this blob " << *diri
<< dendl
;
343 // have we journaled this inode since the last subtree map?
344 if (!maybenot
&& last_subtree_map
&& diri
->last_journaled
>= last_subtree_map
) {
345 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri in this segment ("
346 << diri
->last_journaled
<< " >= " << last_subtree_map
<< "), setting maybenot flag "
356 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") maybe " << *parent
<< dendl
;
357 maybe
.push_front(parent
);
359 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") definitely " << *parent
<< dendl
;
360 parents
.push_front(parent
);
363 dir
= parent
->get_dir();
366 parents
.splice(parents
.begin(), maybe
);
368 dout(20) << "EMetaBlob::add_dir_context final: " << parents
<< dendl
;
369 for (const auto& dentry
: parents
) {
370 ceph_assert(dentry
->get_projected_linkage()->is_primary());
371 add_dentry(dentry
, false);
375 void EMetaBlob::update_segment(LogSegment
*ls
)
377 // dirty inode mtimes
378 // -> handled directly by Server.cc, replay()
380 // alloc table update?
382 ls
->inotablev
= inotablev
;
384 ls
->sessionmapv
= sessionmapv
;
387 // -> handled directly by Server.cc
390 // note the newest request per client
391 //if (!client_reqs.empty())
392 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
395 // EMetaBlob::fullbit
397 void EMetaBlob::fullbit::encode(bufferlist
& bl
, uint64_t features
) const {
398 ENCODE_START(8, 5, bl
);
403 encode(inode
, bl
, features
);
405 if (inode
.is_symlink())
407 if (inode
.is_dir()) {
408 encode(dirfragtree
, bl
);
412 if (old_inodes
.empty()) {
416 encode(old_inodes
, bl
, features
);
420 encode(oldest_snap
, bl
);
424 void EMetaBlob::fullbit::decode(bufferlist::const_iterator
&bl
) {
425 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
432 if (inode
.is_symlink())
434 if (inode
.is_dir()) {
435 decode(dirfragtree
, bl
);
437 if ((struct_v
== 2) || (struct_v
== 3)) {
438 bool dir_layout_exists
;
439 decode(dir_layout_exists
, bl
);
440 if (dir_layout_exists
) {
442 decode(dir_struct_v
, bl
); // default_file_layout version
443 decode(inode
.layout
, bl
); // and actual layout, that we care about
452 state
= dirty
? EMetaBlob::fullbit::STATE_DIRTY
: 0;
456 bool old_inodes_present
;
457 decode(old_inodes_present
, bl
);
458 if (old_inodes_present
) {
459 decode(old_inodes
, bl
);
462 if (!inode
.is_dir()) {
467 decode(oldest_snap
, bl
);
469 oldest_snap
= CEPH_NOSNAP
;
474 void EMetaBlob::fullbit::dump(Formatter
*f
) const
476 f
->dump_string("dentry", dn
);
477 f
->dump_stream("snapid.first") << dnfirst
;
478 f
->dump_stream("snapid.last") << dnlast
;
479 f
->dump_int("dentry version", dnv
);
480 f
->open_object_section("inode");
482 f
->close_section(); // inode
483 f
->open_object_section("xattrs");
484 for (const auto &p
: xattrs
) {
485 std::string
s(p
.second
.c_str(), p
.second
.length());
486 f
->dump_string(p
.first
.c_str(), s
);
488 f
->close_section(); // xattrs
489 if (inode
.is_symlink()) {
490 f
->dump_string("symlink", symlink
);
492 if (inode
.is_dir()) {
493 f
->dump_stream("frag tree") << dirfragtree
;
494 f
->dump_string("has_snapbl", snapbl
.length() ? "true" : "false");
495 if (inode
.has_layout()) {
496 f
->open_object_section("file layout policy");
498 f
->dump_string("layout", "the layout exists");
499 f
->close_section(); // file layout policy
502 f
->dump_string("state", state_string());
503 if (!old_inodes
.empty()) {
504 f
->open_array_section("old inodes");
505 for (const auto &p
: old_inodes
) {
506 f
->open_object_section("inode");
507 f
->dump_int("snapid", p
.first
);
509 f
->close_section(); // inode
511 f
->close_section(); // old inodes
515 void EMetaBlob::fullbit::generate_test_instances(std::list
<EMetaBlob::fullbit
*>& ls
)
517 CInode::mempool_inode inode
;
519 CInode::mempool_xattr_map empty_xattrs
;
520 bufferlist empty_snapbl
;
521 fullbit
*sample
= new fullbit("/testdn", 0, 0, 0,
522 inode
, fragtree
, empty_xattrs
, "", 0, empty_snapbl
,
524 ls
.push_back(sample
);
527 void EMetaBlob::fullbit::update_inode(MDSRank
*mds
, CInode
*in
)
531 in
->maybe_export_pin();
532 if (in
->inode
.is_dir()) {
533 if (!(in
->dirfragtree
== dirfragtree
)) {
534 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in
->dirfragtree
<< " -> "
535 << dirfragtree
<< " on " << *in
<< dendl
;
536 in
->dirfragtree
= dirfragtree
;
537 in
->force_dirfrags();
538 if (in
->get_num_dirfrags() && in
->authority() == CDIR_AUTH_UNDEF
) {
539 auto&& ls
= in
->get_nested_dirfrags();
540 for (const auto& dir
: ls
) {
541 if (dir
->get_num_any() == 0 &&
542 mds
->mdcache
->can_trim_non_auth_dirfrag(dir
)) {
543 dout(10) << " closing empty non-auth dirfrag " << *dir
<< dendl
;
544 in
->close_dirfrag(dir
->get_frag());
549 } else if (in
->inode
.is_symlink()) {
550 in
->symlink
= symlink
;
552 in
->old_inodes
= old_inodes
;
553 if (!in
->old_inodes
.empty()) {
554 snapid_t min_first
= in
->old_inodes
.rbegin()->first
+ 1;
555 if (min_first
> in
->first
)
556 in
->first
= min_first
;
560 * we can do this before linking the inode bc the split_at would
561 * be a no-op.. we have no children (namely open snaprealms) to
564 in
->oldest_snap
= oldest_snap
;
565 in
->decode_snap_blob(snapbl
);
568 * In case there was anything malformed in the journal that we are
569 * replaying, do sanity checks on the inodes we're replaying and
570 * go damaged instead of letting any trash into a live cache
573 // Files must have valid layouts with a pool set
574 if (in
->inode
.layout
.pool_id
== -1 || !in
->inode
.layout
.is_valid()) {
575 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
576 << ": " << in
->inode
.layout
<< dendl
;
577 std::ostringstream oss
;
578 oss
<< "Invalid layout for inode " << in
->ino() << " in journal";
579 mds
->clog
->error() << oss
.str();
581 ceph_abort(); // Should be unreachable because damaged() calls respawn()
586 // EMetaBlob::remotebit
588 void EMetaBlob::remotebit::encode(bufferlist
& bl
) const
590 ENCODE_START(2, 2, bl
);
601 void EMetaBlob::remotebit::decode(bufferlist::const_iterator
&bl
)
603 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
614 void EMetaBlob::remotebit::dump(Formatter
*f
) const
616 f
->dump_string("dentry", dn
);
617 f
->dump_int("snapid.first", dnfirst
);
618 f
->dump_int("snapid.last", dnlast
);
619 f
->dump_int("dentry version", dnv
);
620 f
->dump_int("inodeno", ino
);
621 uint32_t type
= DTTOIF(d_type
) & S_IFMT
; // convert to type entries
625 type_string
= "file"; break;
627 type_string
= "symlink"; break;
629 type_string
= "directory"; break;
631 type_string
= "fifo"; break;
633 type_string
= "chr"; break;
635 type_string
= "blk"; break;
637 type_string
= "sock"; break;
639 assert (0 == "unknown d_type!");
641 f
->dump_string("d_type", type_string
);
642 f
->dump_string("dirty", dirty
? "true" : "false");
645 void EMetaBlob::remotebit::
646 generate_test_instances(std::list
<EMetaBlob::remotebit
*>& ls
)
648 remotebit
*remote
= new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG
), false);
649 ls
.push_back(remote
);
652 // EMetaBlob::nullbit
654 void EMetaBlob::nullbit::encode(bufferlist
& bl
) const
656 ENCODE_START(2, 2, bl
);
665 void EMetaBlob::nullbit::decode(bufferlist::const_iterator
&bl
)
667 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
676 void EMetaBlob::nullbit::dump(Formatter
*f
) const
678 f
->dump_string("dentry", dn
);
679 f
->dump_int("snapid.first", dnfirst
);
680 f
->dump_int("snapid.last", dnlast
);
681 f
->dump_int("dentry version", dnv
);
682 f
->dump_string("dirty", dirty
? "true" : "false");
685 void EMetaBlob::nullbit::generate_test_instances(std::list
<nullbit
*>& ls
)
687 nullbit
*sample
= new nullbit("/test/dentry", 0, 10, 15, false);
688 nullbit
*sample2
= new nullbit("/test/dirty", 10, 20, 25, true);
689 ls
.push_back(sample
);
690 ls
.push_back(sample2
);
693 // EMetaBlob::dirlump
695 void EMetaBlob::dirlump::encode(bufferlist
& bl
, uint64_t features
) const
697 ENCODE_START(2, 2, bl
);
703 _encode_bits(features
);
708 void EMetaBlob::dirlump::decode(bufferlist::const_iterator
&bl
)
710 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
)
717 dn_decoded
= false; // don't decode bits unless we need them.
721 void EMetaBlob::dirlump::dump(Formatter
*f
) const
724 dirlump
*me
= const_cast<dirlump
*>(this);
727 f
->open_object_section("fnode");
729 f
->close_section(); // fnode
730 f
->dump_string("state", state_string());
731 f
->dump_int("nfull", nfull
);
732 f
->dump_int("nremote", nremote
);
733 f
->dump_int("nnull", nnull
);
735 f
->open_array_section("full bits");
736 for (const auto& iter
: dfull
) {
737 f
->open_object_section("fullbit");
739 f
->close_section(); // fullbit
741 f
->close_section(); // full bits
742 f
->open_array_section("remote bits");
743 for (const auto& iter
: dremote
) {
744 f
->open_object_section("remotebit");
746 f
->close_section(); // remotebit
748 f
->close_section(); // remote bits
749 f
->open_array_section("null bits");
750 for (const auto& iter
: dnull
) {
751 f
->open_object_section("null bit");
753 f
->close_section(); // null bit
755 f
->close_section(); // null bits
758 void EMetaBlob::dirlump::generate_test_instances(std::list
<dirlump
*>& ls
)
760 ls
.push_back(new dirlump());
766 void EMetaBlob::encode(bufferlist
& bl
, uint64_t features
) const
768 ENCODE_START(8, 5, bl
);
769 encode(lump_order
, bl
);
770 encode(lump_map
, bl
, features
);
771 encode(roots
, bl
, features
);
772 encode(table_tids
, bl
);
773 encode(opened_ino
, bl
);
774 encode(allocated_ino
, bl
);
775 encode(used_preallocated_ino
, bl
);
776 encode(preallocated_inos
, bl
);
777 encode(client_name
, bl
);
778 encode(inotablev
, bl
);
779 encode(sessionmapv
, bl
);
780 encode(truncate_start
, bl
);
781 encode(truncate_finish
, bl
);
782 encode(destroyed_inodes
, bl
);
783 encode(client_reqs
, bl
);
784 encode(renamed_dirino
, bl
);
785 encode(renamed_dir_frags
, bl
);
787 // make MDSRank use v6 format happy
793 encode(client_flushes
, bl
);
796 void EMetaBlob::decode(bufferlist::const_iterator
&bl
)
798 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl
);
799 decode(lump_order
, bl
);
800 decode(lump_map
, bl
);
806 if (rootbl
.length()) {
807 auto p
= rootbl
.cbegin();
808 roots
.emplace_back(p
);
811 decode(table_tids
, bl
);
812 decode(opened_ino
, bl
);
813 decode(allocated_ino
, bl
);
814 decode(used_preallocated_ino
, bl
);
815 decode(preallocated_inos
, bl
);
816 decode(client_name
, bl
);
817 decode(inotablev
, bl
);
818 decode(sessionmapv
, bl
);
819 decode(truncate_start
, bl
);
820 decode(truncate_finish
, bl
);
821 decode(destroyed_inodes
, bl
);
823 decode(client_reqs
, bl
);
828 client_reqs
.push_back(pair
<metareqid_t
,uint64_t>(r
.front(), 0));
833 decode(renamed_dirino
, bl
);
834 decode(renamed_dir_frags
, bl
);
844 decode(client_flushes
, bl
);
851 * Get all inodes touched by this metablob. Includes the 'bits' within
852 * dirlumps, and the inodes of the dirs themselves.
854 void EMetaBlob::get_inodes(
855 std::set
<inodeno_t
> &inodes
) const
857 // For all dirlumps in this metablob
858 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
859 // Record inode of dirlump
860 inodeno_t
const dir_ino
= i
->first
.ino
;
861 inodes
.insert(dir_ino
);
863 // Decode dirlump bits
864 dirlump
const &dl
= i
->second
;
867 // Record inodes of fullbits
868 for (const auto& iter
: dl
.get_dfull()) {
869 inodes
.insert(iter
.inode
.ino
);
872 // Record inodes of remotebits
873 for (const auto& iter
: dl
.get_dremote()) {
874 inodes
.insert(iter
.ino
);
881 * Get a map of dirfrag to set of dentries in that dirfrag which are
882 * touched in this operation.
884 void EMetaBlob::get_dentries(std::map
<dirfrag_t
, std::set
<std::string
> > &dentries
) const
886 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
887 dirlump
const &dl
= i
->second
;
888 dirfrag_t
const &df
= i
->first
;
893 // For all bits, store dentry
894 for (const auto& iter
: dl
.get_dfull()) {
895 dentries
[df
].insert(iter
.dn
);
897 for (const auto& iter
: dl
.get_dremote()) {
898 dentries
[df
].insert(iter
.dn
);
900 for (const auto& iter
: dl
.get_dnull()) {
901 dentries
[df
].insert(iter
.dn
);
909 * Calculate all paths that we can infer are touched by this metablob. Only uses
910 * information local to this metablob so it may only be the path within the
913 void EMetaBlob::get_paths(
914 std::vector
<std::string
> &paths
) const
916 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
917 typedef std::pair
<inodeno_t
, std::string
> Location
;
919 // Whenever we see a dentry within a dirlump, we remember it as a child of
920 // the dirlump's inode
921 std::map
<inodeno_t
, std::vector
<std::string
> > children
;
923 // Whenever we see a location for an inode, remember it: this allows us to
924 // build a path given an inode
925 std::map
<inodeno_t
, Location
> ino_locations
;
927 // Special case: operations on root inode populate roots but not dirlumps
928 if (lump_map
.empty() && !roots
.empty()) {
929 paths
.push_back("/");
935 // Build a tiny local metadata cache for the path structure in this metablob
936 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
937 inodeno_t
const dir_ino
= i
->first
.ino
;
938 dirlump
const &dl
= i
->second
;
941 for (const auto& iter
: dl
.get_dfull()) {
942 std::string_view dentry
= iter
.dn
;
943 children
[dir_ino
].emplace_back(dentry
);
944 ino_locations
[iter
.inode
.ino
] = Location(dir_ino
, dentry
);
947 for (const auto& iter
: dl
.get_dremote()) {
948 std::string_view dentry
= iter
.dn
;
949 children
[dir_ino
].emplace_back(dentry
);
952 for (const auto& iter
: dl
.get_dnull()) {
953 std::string_view dentry
= iter
.dn
;
954 children
[dir_ino
].emplace_back(dentry
);
958 std::vector
<Location
> leaf_locations
;
962 // Output paths for all childless nodes in the metablob
963 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
964 inodeno_t
const dir_ino
= i
->first
.ino
;
965 dirlump
const &dl
= i
->second
;
968 for (const auto& iter
: dl
.get_dfull()) {
969 std::string_view dentry
= iter
.dn
;
970 if (children
.find(iter
.inode
.ino
) == children
.end()) {
971 leaf_locations
.push_back(Location(dir_ino
, dentry
));
975 for (const auto& iter
: dl
.get_dremote()) {
976 std::string_view dentry
= iter
.dn
;
977 leaf_locations
.push_back(Location(dir_ino
, dentry
));
980 for (const auto& iter
: dl
.get_dnull()) {
981 std::string_view dentry
= iter
.dn
;
982 leaf_locations
.push_back(Location(dir_ino
, dentry
));
986 // For all the leaf locations identified, generate paths
987 for (std::vector
<Location
>::iterator i
= leaf_locations
.begin(); i
!= leaf_locations
.end(); ++i
) {
988 Location
const &loc
= *i
;
989 std::string path
= loc
.second
;
990 inodeno_t ino
= loc
.first
;
991 std::map
<inodeno_t
, Location
>::iterator iter
= ino_locations
.find(ino
);
992 while(iter
!= ino_locations
.end()) {
993 Location
const &loc
= iter
->second
;
995 path
= loc
.second
+ "/" + path
;
997 path
= loc
.second
+ path
;
999 iter
= ino_locations
.find(loc
.first
);
1002 paths
.push_back(path
);
// Dump this metablob to the formatter f, section by section: the dirlumps
// (visited in lump_order), roots, table client transactions, rename info,
// inotable/SessionMap versions, allocated and preallocated inos, the client
// name, truncate start/finish lists, destroyed inodes and client requests.
1007 void EMetaBlob::dump(Formatter
*f
) const
1009 f
->open_array_section("lumps");
1010 for (const auto& d
: lump_order
) {
1011 f
->open_object_section("lump");
1012 f
->open_object_section("dirfrag");
1013 f
->dump_stream("dirfrag") << d
;
1014 f
->close_section(); // dirfrag
1015 f
->open_object_section("dirlump");
1016 lump_map
.at(d
).dump(f
);
1017 f
->close_section(); // dirlump
1018 f
->close_section(); // lump
1020 f
->close_section(); // lumps
1022 f
->open_array_section("roots");
1023 for (const auto& iter
: roots
) {
1024 f
->open_object_section("root");
1026 f
->close_section(); // root
1028 f
->close_section(); // roots
// NOTE(review): the section name below is spelled "tranactions"; it is
// emitted at runtime, so it is deliberately left untouched.
1030 f
->open_array_section("tableclient tranactions");
1031 for (const auto& p
: table_tids
) {
1032 f
->open_object_section("transaction");
1033 f
->dump_int("tid", p
.first
);
1034 f
->dump_int("version", p
.second
);
1035 f
->close_section(); // transaction
1037 f
->close_section(); // tableclient transactions
1039 f
->dump_int("renamed directory inodeno", renamed_dirino
);
1041 f
->open_array_section("renamed directory fragments");
1042 for (const auto& p
: renamed_dir_frags
) {
1043 f
->dump_int("frag", p
);
1045 f
->close_section(); // renamed directory fragments
1047 f
->dump_int("inotable version", inotablev
);
1048 f
->dump_int("SessionMap version", sessionmapv
);
1049 f
->dump_int("allocated ino", allocated_ino
);
1051 f
->dump_stream("preallocated inos") << preallocated_inos
;
1052 f
->dump_int("used preallocated ino", used_preallocated_ino
);
1054 f
->open_object_section("client name");
1055 client_name
.dump(f
);
1056 f
->close_section(); // client name
1058 f
->open_array_section("inodes starting a truncate");
1059 for(const auto& ino
: truncate_start
) {
1060 f
->dump_int("inodeno", ino
);
1062 f
->close_section(); // truncate inodes
1063 f
->open_array_section("inodes finishing a truncated");
1064 for(const auto& p
: truncate_finish
) {
1065 f
->open_object_section("inode+segment");
1066 f
->dump_int("inodeno", p
.first
);
1067 f
->dump_int("truncate starting segment", p
.second
);
1068 f
->close_section(); // truncated inode
1070 f
->close_section(); // truncate finish inodes
1072 f
->open_array_section("destroyed inodes");
1073 for(vector
<inodeno_t
>::const_iterator i
= destroyed_inodes
.begin();
1074 i
!= destroyed_inodes
.end(); ++i
) {
1075 f
->dump_int("inodeno", *i
);
1077 f
->close_section(); // destroyed inodes
1079 f
->open_array_section("client requests");
1080 for(const auto& p
: client_reqs
) {
1081 f
->open_object_section("Client request");
1082 f
->dump_stream("request ID") << p
.first
;
1083 f
->dump_int("oldest request on client", p
.second
);
1084 f
->close_section(); // request
1086 f
->close_section(); // client requests
// Appends a single default-constructed, heap-allocated EMetaBlob to ls.
1089 void EMetaBlob::generate_test_instances(std::list
<EMetaBlob
*>& ls
)
1091 ls
.push_back(new EMetaBlob());
// Replay this metablob into the MDS cache during journal replay.
//
// In order, this handles: the root inodes, every dirlump in lump_order (the
// dirfrag itself, then its full, remote and null dentries), subtree
// adjustments for a renamed directory, cleanup of inodes that were unlinked
// and not relinked, table client transactions, the opened inode, inotable
// and sessionmap version bookkeeping, truncate start/finish records,
// destroyed inodes, and completed client requests and flushes. It finishes
// by calling update_segment(logseg).
//
// logseg must be non-null (asserted). slaveup is tested for null before most
// uses and asserted non-null in the auth-to-non-auth rename case.
// The ceph_assert(mds_kill_journal_replay_at != N) checks fail when that
// config option is set to N, at the corresponding point of the replay.
1094 void EMetaBlob::replay(MDSRank
*mds
, LogSegment
*logseg
, MDSlaveUpdate
*slaveup
)
1096 dout(10) << "EMetaBlob.replay " << lump_map
.size() << " dirlumps by " << client_name
<< dendl
;
1098 ceph_assert(logseg
);
1100 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 1);
// root inodes: looked up in the cache, updated from the journaled copy
1102 for (auto& p
: roots
) {
1103 CInode
*in
= mds
->mdcache
->get_inode(p
.inode
.ino
);
1104 bool isnew
= in
? false:true;
1106 in
= new CInode(mds
->mdcache
, false, 2, CEPH_NOSNAP
);
1107 p
.update_inode(mds
, in
);
1110 mds
->mdcache
->add_inode(in
);
1111 if (p
.is_dirty()) in
->_mark_dirty(logseg
);
1112 dout(10) << "EMetaBlob.replay " << (isnew
? " added root ":" updated root ") << *in
<< dendl
;
1115 CInode
*renamed_diri
= 0;
1117 if (renamed_dirino
) {
1118 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1120 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri
<< dendl
;
1122 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino
<< dendl
;
// accumulate the per-lump null dentry counts; at most one is allowed here
1125 for (const auto& lp
: lump_order
) {
1126 dirlump
&lump
= lump_map
[lp
];
1128 dout(10) << "EMetaBlob.replay found null dentry in dir " << lp
<< dendl
;
1129 nnull
+= lump
.nnull
;
1132 ceph_assert(nnull
<= 1);
1135 // keep track of any inodes we unlink and don't relink elsewhere
1136 map
<CInode
*, CDir
*> unlinked
;
1137 set
<CInode
*> linked
;
1139 // walk through my dirs (in order!)
1140 for (const auto& lp
: lump_order
) {
1141 dout(10) << "EMetaBlob.replay dir " << lp
<< dendl
;
1142 dirlump
&lump
= lump_map
[lp
];
1145 CDir
*dir
= mds
->mdcache
->get_force_dirfrag(lp
, true);
1147 // hmm. do i have the inode?
1148 CInode
*diri
= mds
->mdcache
->get_inode((lp
).ino
);
1150 if (MDS_INO_IS_MDSDIR(lp
.ino
)) {
1151 ceph_assert(MDS_INO_MDSDIR(mds
->get_nodeid()) != lp
.ino
);
1152 diri
= mds
->mdcache
->create_system_inode(lp
.ino
, S_IFDIR
|0755);
1153 diri
->state_clear(CInode::STATE_AUTH
);
1154 dout(10) << "EMetaBlob.replay created base " << *diri
<< dendl
;
1156 dout(0) << "EMetaBlob.replay missing dir ino " << lp
.ino
<< dendl
;
1157 mds
->clog
->error() << "failure replaying journal (EMetaBlob)";
1159 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1163 // create the dirfrag
1164 dir
= diri
->get_or_open_dirfrag(mds
->mdcache
, lp
.frag
);
1166 if (MDS_INO_IS_BASE(lp
.ino
))
1167 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1169 dout(10) << "EMetaBlob.replay added dir " << *dir
<< dendl
;
// bring the dirfrag's version and fnode in line with the journaled lump
1171 dir
->set_version( lump
.fnode
.version
);
1172 dir
->fnode
= lump
.fnode
;
1174 if (lump
.is_importing()) {
1175 dir
->state_set(CDir::STATE_AUTH
);
1176 dir
->state_clear(CDir::STATE_COMPLETE
);
1178 if (lump
.is_dirty()) {
1179 dir
->_mark_dirty(logseg
);
// rstat/fragstat that differ from their accounted_* counterparts mark the
// matching scatterlock as updated and queue the inode on the segment
1181 if (!(dir
->fnode
.rstat
== dir
->fnode
.accounted_rstat
)) {
1182 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir
<< dendl
;
1183 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->nestlock
);
1184 logseg
->dirty_dirfrag_nest
.push_back(&dir
->inode
->item_dirty_dirfrag_nest
);
1186 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir
<< dendl
;
1188 if (!(dir
->fnode
.fragstat
== dir
->fnode
.accounted_fragstat
)) {
1189 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir
<< dendl
;
1190 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->filelock
);
1191 logseg
->dirty_dirfrag_dir
.push_back(&dir
->inode
->item_dirty_dirfrag_dir
);
1193 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir
<< dendl
;
1196 if (lump
.is_dirty_dft()) {
1197 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir
<< dendl
;
1198 dir
->state_set(CDir::STATE_DIRTYDFT
);
1199 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->dirfragtreelock
);
1200 logseg
->dirty_dirfrag_dirfragtree
.push_back(&dir
->inode
->item_dirty_dirfrag_dirfragtree
);
1203 dir
->mark_new(logseg
);
1204 if (lump
.is_complete())
1205 dir
->mark_complete();
1207 dout(10) << "EMetaBlob.replay updated dir " << *dir
<< dendl
;
1210 lump
._decode_bits();
1212 // full dentry+inode pairs
1213 for (auto& fb
: lump
._get_dfull()) {
1214 CDentry
*dn
= dir
->lookup_exact_snap(fb
.dn
, fb
.dnlast
);
1216 dn
= dir
->add_null_dentry(fb
.dn
, fb
.dnfirst
, fb
.dnlast
);
1217 dn
->set_version(fb
.dnv
);
1218 if (fb
.is_dirty()) dn
->_mark_dirty(logseg
);
1219 dout(10) << "EMetaBlob.replay added (full) " << *dn
<< dendl
;
1221 dn
->set_version(fb
.dnv
);
1222 if (fb
.is_dirty()) dn
->_mark_dirty(logseg
);
1223 dout(10) << "EMetaBlob.replay for [" << fb
.dnfirst
<< "," << fb
.dnlast
<< "] had " << *dn
<< dendl
;
1224 dn
->first
= fb
.dnfirst
;
1225 ceph_assert(dn
->last
== fb
.dnlast
);
1227 if (lump
.is_importing())
1228 dn
->state_set(CDentry::STATE_AUTH
);
1230 CInode
*in
= mds
->mdcache
->get_inode(fb
.inode
.ino
, fb
.dnlast
);
1232 in
= new CInode(mds
->mdcache
, dn
->is_auth(), fb
.dnfirst
, fb
.dnlast
);
1233 fb
.update_inode(mds
, in
);
1234 mds
->mdcache
->add_inode(in
);
1235 if (!dn
->get_linkage()->is_null()) {
1236 if (dn
->get_linkage()->is_primary()) {
1237 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1239 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1240 << " " << *dn
->get_linkage()->get_inode() << " should be " << fb
.inode
.ino
;
1241 dout(0) << ss
.str() << dendl
;
1242 mds
->clog
->warn(ss
);
1244 dir
->unlink_inode(dn
, false);
1246 if (unlinked
.count(in
))
1248 dir
->link_primary_inode(dn
, in
);
1249 dout(10) << "EMetaBlob.replay added " << *in
<< dendl
;
1251 in
->first
= fb
.dnfirst
;
1252 fb
.update_inode(mds
, in
);
1253 if (dn
->get_linkage()->get_inode() != in
&& in
->get_parent_dn()) {
1254 dout(10) << "EMetaBlob.replay unlinking " << *in
<< dendl
;
1255 unlinked
[in
] = in
->get_parent_dir();
1256 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
1258 if (dn
->get_linkage()->get_inode() != in
) {
1259 if (!dn
->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1260 if (dn
->get_linkage()->is_primary()) {
1261 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1263 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1264 << " " << *dn
->get_linkage()->get_inode() << " should be " << fb
.inode
.ino
;
1265 dout(0) << ss
.str() << dendl
;
1266 mds
->clog
->warn(ss
);
1268 dir
->unlink_inode(dn
, false);
1270 if (unlinked
.count(in
))
1272 dir
->link_primary_inode(dn
, in
);
1273 dout(10) << "EMetaBlob.replay linked " << *in
<< dendl
;
1275 dout(10) << "EMetaBlob.replay for [" << fb
.dnfirst
<< "," << fb
.dnlast
<< "] had " << *in
<< dendl
;
1277 ceph_assert(in
->first
== fb
.dnfirst
||
1278 (in
->is_multiversion() && in
->first
> fb
.dnfirst
));
1281 in
->_mark_dirty(logseg
);
1282 if (fb
.is_dirty_parent())
1283 in
->mark_dirty_parent(logseg
, fb
.is_dirty_pool());
1284 if (fb
.need_snapflush())
1285 logseg
->open_files
.push_back(&in
->item_open_file
);
1287 in
->state_set(CInode::STATE_AUTH
);
1289 in
->state_clear(CInode::STATE_AUTH
);
1290 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 2);
// remote dentries
1294 for (const auto& rb
: lump
.get_dremote()) {
1295 CDentry
*dn
= dir
->lookup_exact_snap(rb
.dn
, rb
.dnlast
);
1297 dn
= dir
->add_remote_dentry(rb
.dn
, rb
.ino
, rb
.d_type
, rb
.dnfirst
, rb
.dnlast
);
1298 dn
->set_version(rb
.dnv
);
1299 if (rb
.dirty
) dn
->_mark_dirty(logseg
);
1300 dout(10) << "EMetaBlob.replay added " << *dn
<< dendl
;
1302 if (!dn
->get_linkage()->is_null()) {
1303 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1304 if (dn
->get_linkage()->is_primary()) {
1305 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1307 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1308 << " " << *dn
->get_linkage()->get_inode() << " should be remote " << rb
.ino
;
1309 dout(0) << ss
.str() << dendl
;
1311 dir
->unlink_inode(dn
, false);
1313 dir
->link_remote_inode(dn
, rb
.ino
, rb
.d_type
);
1314 dn
->set_version(rb
.dnv
);
1315 if (rb
.dirty
) dn
->_mark_dirty(logseg
);
1316 dout(10) << "EMetaBlob.replay for [" << rb
.dnfirst
<< "," << rb
.dnlast
<< "] had " << *dn
<< dendl
;
1317 dn
->first
= rb
.dnfirst
;
1318 ceph_assert(dn
->last
== rb
.dnlast
);
1320 if (lump
.is_importing())
1321 dn
->state_set(CDentry::STATE_AUTH
);
// null dentries
1325 for (const auto& nb
: lump
.get_dnull()) {
1326 CDentry
*dn
= dir
->lookup_exact_snap(nb
.dn
, nb
.dnlast
);
1328 dn
= dir
->add_null_dentry(nb
.dn
, nb
.dnfirst
, nb
.dnlast
);
1329 dn
->set_version(nb
.dnv
);
1330 if (nb
.dirty
) dn
->_mark_dirty(logseg
);
1331 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn
<< dendl
;
1333 dn
->first
= nb
.dnfirst
;
1334 if (!dn
->get_linkage()->is_null()) {
1335 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1336 CInode
*in
= dn
->get_linkage()->get_inode();
1337 // For a renamed inode, we may call CInode::force_dirfrag() later.
1338 // CInode::force_dirfrag() doesn't work well when the inode is detached
1339 // from the hierarchy.
1340 if (!renamed_diri
|| renamed_diri
!= in
) {
1341 if (dn
->get_linkage()->is_primary())
1343 dir
->unlink_inode(dn
);
1346 dn
->set_version(nb
.dnv
);
1347 if (nb
.dirty
) dn
->_mark_dirty(logseg
);
1348 dout(10) << "EMetaBlob.replay had " << *dn
<< dendl
;
1349 ceph_assert(dn
->last
== nb
.dnlast
);
1352 if (lump
.is_importing())
1353 dn
->state_set(CDentry::STATE_AUTH
);
1355 // Make null dentries the first things we trim
1356 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn
<< dendl
;
1360 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 3);
// directory rename: fix up subtree bounds around the renamed directory
1362 if (renamed_dirino
) {
1364 ceph_assert(unlinked
.count(renamed_diri
));
1365 ceph_assert(linked
.count(renamed_diri
));
1366 olddir
= unlinked
[renamed_diri
];
1368 // we imported a diri we haven't seen before
1369 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1370 ceph_assert(renamed_diri
); // it was in the metablob
1374 if (olddir
->authority() != CDIR_AUTH_UNDEF
&&
1375 renamed_diri
->authority() == CDIR_AUTH_UNDEF
) {
1376 ceph_assert(slaveup
); // auth to non-auth, must be slave prepare
1378 renamed_diri
->dirfragtree
.get_leaves(leaves
);
1379 for (const auto& leaf
: leaves
) {
1380 CDir
*dir
= renamed_diri
->get_dirfrag(leaf
);
1382 if (dir
->get_dir_auth() == CDIR_AUTH_UNDEF
)
1383 // preserve subtree bound until slave commit
1384 slaveup
->olddirs
.insert(dir
->inode
);
1386 dir
->state_set(CDir::STATE_AUTH
);
1390 mds
->mdcache
->adjust_subtree_after_rename(renamed_diri
, olddir
, false);
1392 // see if we can discard the subtree we renamed out of
1393 CDir
*root
= mds
->mdcache
->get_subtree_root(olddir
);
1394 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
1395 if (slaveup
) // preserve the old dir until slave commit
1396 slaveup
->olddirs
.insert(olddir
->inode
);
1398 mds
->mdcache
->try_trim_non_auth_subtree(root
);
1402 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1403 if (renamed_diri
->authority() != CDIR_AUTH_UNDEF
) {
1404 for (const auto& p
: renamed_dir_frags
) {
1405 CDir
*dir
= renamed_diri
->get_dirfrag(p
);
1407 // we already had the inode before, and we already adjusted this subtree accordingly.
1408 dout(10) << " already had+adjusted rename import bound " << *dir
<< dendl
;
1409 ceph_assert(olddir
);
1412 dir
= renamed_diri
->get_or_open_dirfrag(mds
->mdcache
, p
);
1413 dout(10) << " creating new rename import bound " << *dir
<< dendl
;
1414 dir
->state_clear(CDir::STATE_AUTH
);
1415 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1419 // rename may overwrite an empty directory and move it into stray dir.
1420 unlinked
.erase(renamed_diri
);
1421 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1422 if (!linked
.count(p
->first
))
1424 ceph_assert(p
->first
->is_dir());
1425 mds
->mdcache
->adjust_subtree_after_rename(p
->first
, p
->second
, false);
// inodes still recorded in 'unlinked' are handed to slaveup or removed
1429 if (!unlinked
.empty()) {
1430 for (set
<CInode
*>::iterator p
= linked
.begin(); p
!= linked
.end(); ++p
)
1432 dout(10) << " unlinked set contains " << unlinked
<< dendl
;
1433 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1434 CInode
*in
= p
->first
;
1435 if (slaveup
) { // preserve unlinked inodes until slave commit
1436 slaveup
->unlinked
.insert(in
);
1438 in
->snaprealm
->adjust_parent();
1440 mds
->mdcache
->remove_inode_recursive(in
);
1444 // table client transactions
1445 for (const auto& p
: table_tids
) {
1446 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p
.first
)
1447 << " transaction " << p
.second
<< dendl
;
1448 MDSTableClient
*client
= mds
->get_table_client(p
.first
);
1450 client
->got_journaled_agree(p
.second
, logseg
);
// opened inode: queued on the segment's open_files list
1455 CInode
*in
= mds
->mdcache
->get_inode(opened_ino
);
1457 dout(10) << "EMetaBlob.replay noting opened inode " << *in
<< dendl
;
1458 logseg
->open_files
.push_back(&in
->item_open_file
);
// inotable: replay the allocations unless the table is already at or past inotablev
1463 if (mds
->inotable
->get_version() >= inotablev
) {
1464 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1465 << " <= table " << mds
->inotable
->get_version() << dendl
;
1467 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1468 << " - 1 == table " << mds
->inotable
->get_version()
1469 << " allocated+used " << allocated_ino
1470 << " prealloc " << preallocated_inos
1473 mds
->inotable
->replay_alloc_id(allocated_ino
);
1474 if (preallocated_inos
.size())
1475 mds
->inotable
->replay_alloc_ids(preallocated_inos
);
1477 // [repair bad inotable updates]
1478 if (inotablev
> mds
->inotable
->get_version()) {
1479 mds
->clog
->error() << "journal replay inotablev mismatch "
1480 << mds
->inotable
->get_version() << " -> " << inotablev
;
1481 mds
->inotable
->force_replay_version(inotablev
);
1484 ceph_assert(inotablev
== mds
->inotable
->get_version());
// sessionmap: 'diff' is the number of version steps this event accounts for
// (2 when an ino was used and new ones were preallocated, otherwise 1)
1488 unsigned diff
= (used_preallocated_ino
&& !preallocated_inos
.empty()) ? 2 : 1;
1489 if (mds
->sessionmap
.get_version() >= sessionmapv
) {
1490 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1491 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
1492 } else if (mds
->sessionmap
.get_version() + diff
== sessionmapv
) {
1493 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1494 << " - " << diff
<< " == table " << mds
->sessionmap
.get_version()
1495 << " prealloc " << preallocated_inos
1496 << " used " << used_preallocated_ino
1498 Session
*session
= mds
->sessionmap
.get_session(client_name
);
1500 dout(20) << " (session prealloc " << session
->info
.prealloc_inos
<< ")" << dendl
;
1501 if (used_preallocated_ino
) {
1502 if (!session
->info
.prealloc_inos
.empty()) {
1503 inodeno_t i
= session
->take_ino(used_preallocated_ino
);
1504 ceph_assert(i
== used_preallocated_ino
);
1505 session
->info
.used_inos
.clear();
1507 mds
->sessionmap
.replay_dirty_session(session
);
1509 if (!preallocated_inos
.empty()) {
1510 session
->info
.prealloc_inos
.insert(preallocated_inos
);
1511 mds
->sessionmap
.replay_dirty_session(session
);
1515 dout(10) << "EMetaBlob.replay no session for " << client_name
<< dendl
;
1516 if (used_preallocated_ino
)
1517 mds
->sessionmap
.replay_advance_version();
1519 if (!preallocated_inos
.empty())
1520 mds
->sessionmap
.replay_advance_version();
1522 ceph_assert(sessionmapv
== mds
->sessionmap
.get_version());
1524 mds
->clog
->error() << "EMetaBlob.replay sessionmap v " << sessionmapv
1525 << " - " << diff
<< " > table " << mds
->sessionmap
.get_version();
1526 ceph_assert(g_conf()->mds_wipe_sessions
);
1527 mds
->sessionmap
.wipe();
1528 mds
->sessionmap
.set_version(sessionmapv
);
1532 // truncating inodes
1533 for (const auto& ino
: truncate_start
) {
1534 CInode
*in
= mds
->mdcache
->get_inode(ino
);
1536 mds
->mdcache
->add_recovered_truncate(in
, logseg
);
1538 for (const auto& p
: truncate_finish
) {
1539 LogSegment
*ls
= mds
->mdlog
->get_segment(p
.second
);
1541 CInode
*in
= mds
->mdcache
->get_inode(p
.first
);
1543 mds
->mdcache
->remove_recovered_truncate(in
, ls
);
// destroyed inodes: dropped from the cache, then reported to the open file table
1548 if (!destroyed_inodes
.empty()) {
1549 for (vector
<inodeno_t
>::iterator p
= destroyed_inodes
.begin();
1550 p
!= destroyed_inodes
.end();
1552 CInode
*in
= mds
->mdcache
->get_inode(*p
);
1554 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", dropping " << *in
<< dendl
;
1555 CDentry
*parent
= in
->get_parent_dn();
1556 mds
->mdcache
->remove_inode(in
);
1558 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent
<< dendl
;
1559 ceph_assert(parent
->get_linkage()->is_null());
1562 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", not in cache" << dendl
;
1565 mds
->mdcache
->open_file_table
.note_destroyed_inos(logseg
->seq
, destroyed_inodes
);
// completed client requests
1569 for (const auto& p
: client_reqs
) {
1570 if (p
.first
.name
.is_client()) {
1571 dout(10) << "EMetaBlob.replay request " << p
.first
<< " trim_to " << p
.second
<< dendl
;
1572 inodeno_t created
= allocated_ino
? allocated_ino
: used_preallocated_ino
;
1573 // if we allocated an inode, there should be exactly one client request id.
1574 ceph_assert(created
== inodeno_t() || client_reqs
.size() == 1);
1576 Session
*session
= mds
->sessionmap
.get_session(p
.first
.name
);
1578 session
->add_completed_request(p
.first
.tid
, created
);
1580 session
->trim_completed_requests(p
.second
);
// completed client flushes
1586 for (const auto& p
: client_flushes
) {
1587 if (p
.first
.name
.is_client()) {
1588 dout(10) << "EMetaBlob.replay flush " << p
.first
<< " trim_to " << p
.second
<< dendl
;
1589 Session
*session
= mds
->sessionmap
.get_session(p
.first
.name
);
1591 session
->add_completed_flush(p
.first
.tid
);
1593 session
->trim_completed_flushes(p
.second
);
1599 update_segment(logseg
);
1601 ceph_assert(g_conf()->mds_kill_journal_replay_at
!= 4);
1604 // -----------------------
// Records this event's inotablev on the current segment, but only when inos
// is non-empty and inotablev is non-zero.
1606 void EPurged::update_segment()
1608 if (inos
.size() && inotablev
)
1609 get_segment()->inotablev
= inotablev
;
// Replays a purge event: subtracts inos from the purge_inodes set of the
// segment identified by seq, then brings the inotable up to inotablev.
// If the table is already at or past inotablev this is a no-op; otherwise
// the ids are released and the resulting table version must equal inotablev.
1613 void EPurged::replay(MDSRank
*mds
)
1616 LogSegment
*ls
= mds
->mdlog
->get_segment(seq
);
1618 ls
->purge_inodes
.subtract(inos
);
1620 if (mds
->inotable
->get_version() >= inotablev
) {
1621 dout(10) << "EPurged.replay inotable " << mds
->inotable
->get_version()
1622 << " >= " << inotablev
<< ", noop" << dendl
;
1624 dout(10) << "EPurged.replay inotable " << mds
->inotable
->get_version()
1625 << " < " << inotablev
<< " " << dendl
;
1626 mds
->inotable
->replay_release_ids(inos
);
// ceph_assert rather than plain assert, matching the same check in
// ESession::replay; plain assert is compiled out when NDEBUG is defined.
1627 ceph_assert(mds
->inotable
->get_version() == inotablev
);
// Serializes this event into bl inside an ENCODE_START(1, 1, bl) frame;
// inotablev is among the encoded fields.
1633 void EPurged::encode(bufferlist
& bl
, uint64_t features
) const
1635 ENCODE_START(1, 1, bl
);
1637 encode(inotablev
, bl
);
// Deserializes this event from bl inside a DECODE_START(1, bl) frame,
// the counterpart of EPurged::encode().
1642 void EPurged::decode(bufferlist::const_iterator
& bl
)
1644 DECODE_START(1, bl
);
1646 decode(inotablev
, bl
);
// Dumps the purged inos, the inotable version and the segment seq to f.
1651 void EPurged::dump(Formatter
*f
) const
1653 f
->dump_stream("inos") << inos
;
1654 f
->dump_int("inotable version", inotablev
);
1655 f
->dump_int("segment seq", seq
);
1658 // -----------------------
// Records cmapv as the current segment's sessionmapv, and additionally
// records inotablev when inos is non-empty and inotablev is non-zero.
1661 void ESession::update_segment()
1663 get_segment()->sessionmapv
= cmapv
;
1664 if (inos
.size() && inotablev
)
1665 get_segment()->inotablev
= inotablev
;
// Replays a session open/close event.
//
// Any purge_inos are added to the segment's purge_inodes. The sessionmap is
// then compared against cmapv:
//   - already at or past cmapv: nothing to do;
//   - exactly one version behind: the open or close is applied and the
//     resulting version is asserted to equal cmapv;
//   - otherwise: an error is logged, mds_wipe_sessions must be set, and the
//     sessionmap is wiped and forced to cmapv.
// Finally, when inos is non-empty and inotablev is non-zero, the ids are
// released from the inotable unless it is already at or past inotablev.
1668 void ESession::replay(MDSRank
*mds
)
1670 if (purge_inos
.size())
1671 get_segment()->purge_inodes
.insert(purge_inos
);
1673 if (mds
->sessionmap
.get_version() >= cmapv
) {
1674 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1675 << " >= " << cmapv
<< ", noop" << dendl
;
1676 } else if (mds
->sessionmap
.get_version() + 1 == cmapv
) {
1677 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1678 << " < " << cmapv
<< " " << (open
? "open":"close") << " " << client_inst
<< dendl
;
1681 session
= mds
->sessionmap
.get_or_add_session(client_inst
);
1682 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1683 session
->set_client_metadata(client_metadata
);
1684 dout(10) << " opened session " << session
->info
.inst
<< dendl
;
1686 session
= mds
->sessionmap
.get_session(client_inst
.name
);
1687 if (session
) { // there always should be a session, but there's a bug
1688 if (session
->get_connection() == NULL
) {
1689 dout(10) << " removed session " << session
->info
.inst
<< dendl
;
1690 mds
->sessionmap
.remove_session(session
);
1693 session
->clear(); // the client has reconnected; keep the Session, but reset
1694 dout(10) << " reset session " << session
->info
.inst
<< " (they reconnected)" << dendl
;
1697 mds
->clog
->error() << "replayed stray Session close event for " << client_inst
1698 << " from time " << stamp
<< ", ignoring";
1702 mds
->sessionmap
.replay_dirty_session(session
);
1704 mds
->sessionmap
.replay_advance_version();
1706 ceph_assert(mds
->sessionmap
.get_version() == cmapv
);
1708 mds
->clog
->error() << "ESession.replay sessionmap v " << cmapv
1709 << " - 1 > table " << mds
->sessionmap
.get_version();
1710 ceph_assert(g_conf()->mds_wipe_sessions
);
1711 mds
->sessionmap
.wipe();
1712 mds
->sessionmap
.set_version(cmapv
);
1715 if (inos
.size() && inotablev
) {
1716 if (mds
->inotable
->get_version() >= inotablev
) {
1717 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1718 << " >= " << inotablev
<< ", noop" << dendl
;
1720 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1721 << " < " << inotablev
<< " " << (open
? "add":"remove") << dendl
;
1722 ceph_assert(!open
); // for now
1723 mds
->inotable
->replay_release_ids(inos
);
1724 ceph_assert(mds
->inotable
->get_version() == inotablev
);
// Serializes this event into bl inside an ENCODE_START(6, 5, bl) frame.
// client_inst is encoded with the caller's feature bits; inotablev,
// client_metadata and purge_inos follow in that order.
1731 void ESession::encode(bufferlist
&bl
, uint64_t features
) const
1733 ENCODE_START(6, 5, bl
);
1735 encode(client_inst
, bl
, features
);
1739 encode(inotablev
, bl
);
1740 encode(client_metadata
, bl
);
1741 encode(purge_inos
, bl
);
// Deserializes this event from bl, accepting older encodings via
// DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl).
1745 void ESession::decode(bufferlist::const_iterator
&bl
)
1747 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl
);
1750 decode(client_inst
, bl
);
1754 decode(inotablev
, bl
);
// struct_v 4 carried only the metadata key/value map; 5 and later carry
// the whole client_metadata object.
1755 if (struct_v
== 4) {
1756 decode(client_metadata
.kv_map
, bl
);
1757 } else if (struct_v
>= 5) {
1758 decode(client_metadata
, bl
);
1761 decode(purge_inos
, bl
);
// Dumps the client instance, the open flag (as the string "true"/"false"),
// the client map version, inos, the inotable version and the client
// metadata to f.
1767 void ESession::dump(Formatter
*f
) const
1769 f
->dump_stream("client instance") << client_inst
;
1770 f
->dump_string("open", open
? "true" : "false");
1771 f
->dump_int("client map version", cmapv
);
1772 f
->dump_stream("inos") << inos
;
1773 f
->dump_int("inotable version", inotablev
);
1774 f
->open_object_section("client_metadata");
1775 client_metadata
.dump(f
);
1776 f
->close_section(); // client_metadata
// Appends a single default-constructed, heap-allocated ESession to ls.
1779 void ESession::generate_test_instances(std::list
<ESession
*>& ls
)
1781 ls
.push_back(new ESession
);
1784 // -----------------------
// Serializes this event into bl inside an ENCODE_START(2, 1, bl) frame.
// client_map is encoded with the caller's feature bits; client_metadata_map
// is encoded after it.
1787 void ESessions::encode(bufferlist
&bl
, uint64_t features
) const
1789 ENCODE_START(2, 1, bl
);
1790 encode(client_map
, bl
, features
);
1793 encode(client_metadata_map
, bl
);
// Decodes the older on-disk form of this event; unlike decode_new() the
// client_map is read without a DECODE_START frame around it.
1797 void ESessions::decode_old(bufferlist::const_iterator
&bl
)
1800 decode(client_map
, bl
);
// Decodes the framed form of this event (DECODE_START(2, bl)): client_map
// first, client_metadata_map after it.
1806 void ESessions::decode_new(bufferlist::const_iterator
&bl
)
1808 DECODE_START(2, bl
);
1809 decode(client_map
, bl
);
1813 decode(client_metadata_map
, bl
);
// Dumps the client map version and then one "client" object (id and entity)
// per entry of client_map to f.
1817 void ESessions::dump(Formatter
*f
) const
1819 f
->dump_int("client map version", cmapv
);
1821 f
->open_array_section("client map");
1822 for (map
<client_t
,entity_inst_t
>::const_iterator i
= client_map
.begin();
1823 i
!= client_map
.end(); ++i
) {
1824 f
->open_object_section("client");
1825 f
->dump_int("client id", i
->first
.v
);
1826 f
->dump_stream("client entity") << i
->second
;
1827 f
->close_section(); // client
1829 f
->close_section(); // client map
// Appends a single default-constructed, heap-allocated ESessions to ls.
1832 void ESessions::generate_test_instances(std::list
<ESessions
*>& ls
)
1834 ls
.push_back(new ESessions());
// Records cmapv as the current segment's sessionmapv.
1837 void ESessions::update_segment()
1839 get_segment()->sessionmapv
= cmapv
;
// Replays a bulk session-open event: a no-op when the sessionmap is already
// at or past cmapv, otherwise the journaled client_map and
// client_metadata_map are handed to SessionMap::replay_open_sessions().
1842 void ESessions::replay(MDSRank
*mds
)
1844 if (mds
->sessionmap
.get_version() >= cmapv
) {
1845 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1846 << " >= " << cmapv
<< ", noop" << dendl
;
1848 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1849 << " < " << cmapv
<< dendl
;
1850 mds
->sessionmap
.replay_open_sessions(cmapv
, client_map
, client_metadata_map
);
1856 // -----------------------
// Serializes this event into bl inside an ENCODE_START(3, 3, bl) frame;
// mutation and version are among the encoded fields.
1859 void ETableServer::encode(bufferlist
& bl
, uint64_t features
) const
1861 ENCODE_START(3, 3, bl
);
1867 encode(mutation
, bl
);
1869 encode(version
, bl
);
// Deserializes this event from bl via
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl), the counterpart of
// ETableServer::encode().
1873 void ETableServer::decode(bufferlist::const_iterator
&bl
)
1875 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
1882 decode(mutation
, bl
);
1884 decode(version
, bl
);
// Dumps the table id, op, request id, originating mds, tid and version to f,
// each as an integer.
1888 void ETableServer::dump(Formatter
*f
) const
1890 f
->dump_int("table id", table
);
1891 f
->dump_int("op", op
);
1892 f
->dump_int("request id", reqid
);
1893 f
->dump_int("by mds", bymds
);
1894 f
->dump_int("tid", tid
);
1895 f
->dump_int("version", version
);
// Appends a single default-constructed, heap-allocated ETableServer to ls.
1898 void ETableServer::generate_test_instances(std::list
<ETableServer
*>& ls
)
1900 ls
.push_back(new ETableServer());
// Records this event's version in the current segment's tablev map, keyed
// by table.
1904 void ETableServer::update_segment()
1906 get_segment()->tablev
[table
] = version
;
// Replays a table server operation against the server for `table`.
//
// Nothing is applied when the server is already at or past `version`.
// Otherwise the server must be exactly one version behind (asserted), the
// operation selected by `op` (prepare, commit, rollback or server update) is
// applied, and the server version is asserted to equal `version` afterwards.
// An unrecognised op is logged as an error.
1909 void ETableServer::replay(MDSRank
*mds
)
1911 MDSTableServer
*server
= mds
->get_table_server(table
);
1915 if (server
->get_version() >= version
) {
1916 dout(10) << "ETableServer.replay " << get_mdstable_name(table
)
1917 << " " << get_mdstableserver_opname(op
)
1918 << " event " << version
1919 << " <= table " << server
->get_version() << dendl
;
1923 dout(10) << " ETableServer.replay " << get_mdstable_name(table
)
1924 << " " << get_mdstableserver_opname(op
)
1925 << " event " << version
<< " - 1 == table " << server
->get_version() << dendl
;
1926 ceph_assert(version
-1 == server
->get_version());
1929 case TABLESERVER_OP_PREPARE
: {
1930 server
->_note_prepare(bymds
, reqid
, true);
1932 server
->_prepare(mutation
, reqid
, bymds
, out
);
// keep the output of _prepare() as this event's mutation
1933 mutation
= std::move(out
);
1936 case TABLESERVER_OP_COMMIT
:
1937 server
->_commit(tid
, ref_t
<MMDSTableRequest
>());
1938 server
->_note_commit(tid
, true);
1940 case TABLESERVER_OP_ROLLBACK
:
1941 server
->_rollback(tid
);
1942 server
->_note_rollback(tid
, true);
1944 case TABLESERVER_OP_SERVER_UPDATE
:
1945 server
->_server_update(mutation
);
1946 server
->_note_server_update(mutation
, true);
1949 mds
->clog
->error() << "invalid tableserver op in ETableServer";
1951 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1954 ceph_assert(version
== server
->get_version());
1959 // ---------------------
// Serializes this event into bl inside an ENCODE_START(3, 3, bl) frame.
1962 void ETableClient::encode(bufferlist
& bl
, uint64_t features
) const
1964 ENCODE_START(3, 3, bl
);
// Deserializes this event from bl via
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl).
1972 void ETableClient::decode(bufferlist::const_iterator
&bl
)
1974 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
// Dumps the table, op and tid to f, each as an integer.
1983 void ETableClient::dump(Formatter
*f
) const
1985 f
->dump_int("table", table
);
1986 f
->dump_int("op", op
);
1987 f
->dump_int("tid", tid
);
// Appends a single default-constructed, heap-allocated ETableClient to ls.
1990 void ETableClient::generate_test_instances(std::list
<ETableClient
*>& ls
)
1992 ls
.push_back(new ETableClient());
// Replays a table client event: looks up the client for `table`, asserts
// that op is TABLESERVER_OP_ACK, and notifies the client that the ack for
// tid has been journaled.
1995 void ETableClient::replay(MDSRank
*mds
)
1997 dout(10) << " ETableClient.replay " << get_mdstable_name(table
)
1998 << " op " << get_mdstableserver_opname(op
)
1999 << " tid " << tid
<< dendl
;
2001 MDSTableClient
*client
= mds
->get_table_client(table
);
2005 ceph_assert(op
== TABLESERVER_OP_ACK
);
2006 client
->got_journaled_ack(tid
);
2010 // -----------------------
// Records this event's version in the current segment's tablev map under
// TABLE_SNAP.
2013 void ESnap::update_segment()
2015 get_segment()->tablev[TABLE_SNAP] = version;
// Replays a snapshot table change. Nothing is applied when the snaptable is
// already at or past `version`; otherwise the table must be exactly one
// version behind (asserted), the snapshot is created or removed, and the
// table version is asserted to equal `version` afterwards. On creation the
// returned snapid must match the journaled snap.snapid.
2018 void ESnap::replay(MDSRank *mds)
2020 if (mds->snaptable->get_version() >= version) {
2021 dout(10) << "ESnap.replay event " << version
2022 << " <= table " << mds->snaptable->get_version() << dendl;
2026 dout(10) << " ESnap.replay event " << version
2027 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2028 ceph_assert(version-1 == mds->snaptable->get_version());
2032 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2033 ceph_assert(s == snap.snapid);
2035 mds->snaptable->remove(snap.snapid);
2038 ceph_assert(version == mds->snaptable->get_version());
2044 // -----------------------
// Serializes this event into bl inside an ENCODE_START(4, 4, bl) frame.
// The metablob is encoded with the caller's feature bits; client_map and
// had_slaves are encoded after it.
2047 void EUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2049 ENCODE_START(4, 4, bl
);
2052 encode(metablob
, bl
, features
);
2053 encode(client_map
, bl
);
2056 encode(had_slaves
, bl
);
// Deserializes this event from bl via
// DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); metablob, client_map and
// had_slaves are read in the same order EUpdate::encode() writes them.
2060 void EUpdate::decode(bufferlist::const_iterator
&bl
)
2062 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl
);
2066 decode(metablob
, bl
);
2067 decode(client_map
, bl
);
2071 decode(had_slaves
, bl
);
// Dumps a "metablob" section followed by the update type, the encoded
// client map's length, the client map version, the reqid and the had_slaves
// flag (as the string "true"/"false") to f.
2075 void EUpdate::dump(Formatter
*f
) const
2077 f
->open_object_section("metablob");
2079 f
->close_section(); // metablob
2081 f
->dump_string("type", type
);
2082 f
->dump_int("client map length", client_map
.length());
2083 f
->dump_int("client map version", cmapv
);
2084 f
->dump_stream("reqid") << reqid
;
2085 f
->dump_string("had slaves", had_slaves
? "true" : "false");
// Appends a single default-constructed, heap-allocated EUpdate to ls.
2088 void EUpdate::generate_test_instances(std::list
<EUpdate
*>& ls
)
2090 ls
.push_back(new EUpdate());
// Propagates this update to the current segment: the metablob updates the
// segment first, then cmapv is recorded as sessionmapv when a client map is
// attached, and reqid is added to the segment's uncommitted_masters set.
2094 void EUpdate::update_segment()
2096 auto&& segment
= get_segment();
2097 metablob
.update_segment(segment
);
2099 if (client_map
.length())
2100 segment
->sessionmapv
= cmapv
;
2103 segment
->uncommitted_masters
.insert(reqid
);
// Replays an update event: the metablob is replayed into the current
// segment, reqid is registered as an uncommitted master (the log message
// describes this as the had-slaves case, awaiting a matching ECommitted),
// and, when a client map is attached and the sessionmap is older than cmapv,
// the client sessions are reopened via replay_open_sessions().
2106 void EUpdate::replay(MDSRank
*mds
)
2108 auto&& segment
= get_segment();
2109 metablob
.replay(mds
, segment
);
2112 dout(10) << "EUpdate.replay " << reqid
<< " had slaves, expecting a matching ECommitted" << dendl
;
2113 segment
->uncommitted_masters
.insert(reqid
);
// an empty slave set is passed along with the reqid
2114 set
<mds_rank_t
> slaves
;
2115 mds
->mdcache
->add_uncommitted_master(reqid
, segment
, slaves
, true);
2118 if (client_map
.length()) {
2119 if (mds
->sessionmap
.get_version() >= cmapv
) {
2120 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2121 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
2123 dout(10) << "EUpdate.replay sessionmap " << mds
->sessionmap
.get_version()
2124 << " < " << cmapv
<< dendl
;
2125 // open client sessions?
2126 map
<client_t
,entity_inst_t
> cm
;
2127 map
<client_t
,client_metadata_t
> cmm
;
2128 auto blp
= client_map
.cbegin();
2133 mds
->sessionmap
.replay_open_sessions(cmapv
, cm
, cmm
);
2140 // ------------------------
// Serialize this EOpen into `bl` inside an ENCODE_START(4, 3, ...) envelope.
// The metablob is encoded with `features`; snap_inos is encoded without them.
2143 void EOpen::encode(bufferlist
&bl
, uint64_t features
) const {
2144 ENCODE_START(4, 3, bl
);
2146 encode(metablob
, bl
, features
);
2148 encode(snap_inos
, bl
);
// Deserialize an EOpen from `bl`: opens the envelope, then reads the metablob
// and snap_inos.
// NOTE(review): EOpen::encode() opens with ENCODE_START(4, 3, ...) while this
// decoder passes 3 as its own version to DECODE_START_LEGACY_COMPAT_LEN --
// confirm the mismatch is intentional.
2152 void EOpen::decode(bufferlist::const_iterator
&bl
) {
2153 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2156 decode(metablob
, bl
);
2159 decode(snap_inos
, bl
);
// Describe this event through Formatter `f`: a "metablob" object section
// followed by an "inos involved" array with one "ino" integer per entry of
// `inos`.
2163 void EOpen::dump(Formatter
*f
) const
2165 f
->open_object_section("metablob");
2167 f
->close_section(); // metablob
2168 f
->open_array_section("inos involved");
2169 for (vector
<inodeno_t
>::const_iterator i
= inos
.begin();
2170 i
!= inos
.end(); ++i
) {
2171 f
->dump_int("ino", *i
);
2173 f
->close_section(); // inos
// Append two EOpen instances to `ls`: a default-constructed one, and a second
// one on which add_ino(0) has been called.
2176 void EOpen::generate_test_instances(std::list
<EOpen
*>& ls
)
2178 ls
.push_back(new EOpen());
2179 ls
.push_back(new EOpen());
2180 ls
.back()->add_ino(0);
// Segment-update hook for EOpen.
// NOTE(review): no body statements are visible in this view.
2183 void EOpen::update_segment()
// Replay an EOpen: replay the metablob into this event's segment, then for
// every ino in `inos` and every vino in `snap_inos` look the inode up in
// mdcache and append its item_open_file to segment->open_files.
// A level-0 "not in metablob" message exists for each loop.
// NOTE(review): the condition that triggers that message (and whether the
// entry is then skipped) is not visible in this view -- confirm.
2188 void EOpen::replay(MDSRank
*mds
)
2190 dout(10) << "EOpen.replay " << dendl
;
2191 auto&& segment
= get_segment();
2192 metablob
.replay(mds
, segment
);
2194 // note which segments inodes belong to, so we don't have to start rejournaling them
2195 for (const auto &ino
: inos
) {
2196 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2198 dout(0) << "EOpen.replay ino " << ino
<< " not in metablob" << dendl
;
2201 segment
->open_files
.push_back(&in
->item_open_file
);
// Same bookkeeping for the snapshotted inodes listed in snap_inos.
2203 for (const auto &vino
: snap_inos
) {
2204 CInode
*in
= mds
->mdcache
->get_inode(vino
);
2206 dout(0) << "EOpen.replay ino " << vino
<< " not in metablob" << dendl
;
2209 segment
->open_files
.push_back(&in
->item_open_file
);
2214 // -----------------------
// Replay an ECommitted: if mdcache still tracks `reqid` as an uncommitted
// master, erase it from the recorded log segment's uncommitted_masters set
// and from mdcache's map. A separate message is logged for the case where the
// original op was not seen.
2217 void ECommitted::replay(MDSRank
*mds
)
2219 if (mds
->mdcache
->uncommitted_masters
.count(reqid
)) {
2220 dout(10) << "ECommitted.replay " << reqid
<< dendl
;
// Drop reqid from the segment that recorded it before forgetting the entry.
2221 mds
->mdcache
->uncommitted_masters
[reqid
].ls
->uncommitted_masters
.erase(reqid
);
2222 mds
->mdcache
->uncommitted_masters
.erase(reqid
);
2224 dout(10) << "ECommitted.replay " << reqid
<< " -- didn't see original op" << dendl
;
// Serialize this ECommitted into `bl`; the envelope is opened with
// ENCODE_START(3, 3, ...).
// NOTE(review): the individual field encodes are not visible in this view.
2228 void ECommitted::encode(bufferlist
& bl
, uint64_t features
) const
2230 ENCODE_START(3, 3, bl
);
// Deserialize an ECommitted from `bl`; the envelope is opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, ...).
// NOTE(review): the individual field decodes are not visible in this view.
2236 void ECommitted::decode(bufferlist::const_iterator
& bl
)
2238 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
// Describe this event through Formatter `f` as two streamed fields,
// "stamp" and "reqid".
2245 void ECommitted::dump(Formatter
*f
) const {
2246 f
->dump_stream("stamp") << stamp
;
2247 f
->dump_stream("reqid") << reqid
;
// Append two ECommitted instances to `ls`: a default one, and a second whose
// stamp is utime_t(1, 2) and whose reqid is client 123 / tid 456.
2250 void ECommitted::generate_test_instances(std::list
<ECommitted
*>& ls
)
2252 ls
.push_back(new ECommitted
);
2253 ls
.push_back(new ECommitted
);
2254 ls
.back()->stamp
= utime_t(1, 2);
2255 ls
.back()->reqid
= metareqid_t(entity_name_t::CLIENT(123), 456);
2258 // -----------------------
// Serialize link rollback state into `bl` inside an ENCODE_START(3, 2, ...)
// envelope. Fields visible here, in order: was_inc, old_ctime, old_dir_mtime,
// old_dir_rctime.
2261 void link_rollback::encode(bufferlist
&bl
) const
2263 ENCODE_START(3, 2, bl
);
2266 encode(was_inc
, bl
);
2267 encode(old_ctime
, bl
);
2268 encode(old_dir_mtime
, bl
);
2269 encode(old_dir_rctime
, bl
);
// Deserialize link rollback state from `bl`
// (DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, ...)). Fields visible here are
// read in the same relative order encode() writes them: was_inc, old_ctime,
// old_dir_mtime, old_dir_rctime.
2274 void link_rollback::decode(bufferlist::const_iterator
&bl
)
2276 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
2279 decode(was_inc
, bl
);
2280 decode(old_ctime
, bl
);
2281 decode(old_dir_mtime
, bl
);
2282 decode(old_dir_rctime
, bl
);
// Describe the rollback record through Formatter `f`: "metareqid", "ino",
// "was incremented" (as the string "true"/"false"), "old_ctime",
// "old_dir_mtime" and "old_dir_rctime".
2288 void link_rollback::dump(Formatter
*f
) const
2290 f
->dump_stream("metareqid") << reqid
;
2291 f
->dump_int("ino", ino
);
2292 f
->dump_string("was incremented", was_inc
? "true" : "false");
2293 f
->dump_stream("old_ctime") << old_ctime
;
2294 f
->dump_stream("old_dir_mtime") << old_dir_mtime
;
2295 f
->dump_stream("old_dir_rctime") << old_dir_rctime
;
// Append one default-constructed link_rollback to `ls`.
2298 void link_rollback::generate_test_instances(std::list
<link_rollback
*>& ls
)
2300 ls
.push_back(new link_rollback());
// Serialize rmdir rollback state into `bl` inside an ENCODE_START(3, 2, ...)
// envelope. Fields visible here, in order: src_dir, src_dname, dest_dir,
// dest_dname.
2303 void rmdir_rollback::encode(bufferlist
& bl
) const
2305 ENCODE_START(3, 2, bl
);
2307 encode(src_dir
, bl
);
2308 encode(src_dname
, bl
);
2309 encode(dest_dir
, bl
);
2310 encode(dest_dname
, bl
);
// Deserialize rmdir rollback state from `bl`
// (DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, ...)). Fields visible here are
// read in encode() order: src_dir, src_dname, dest_dir, dest_dname.
2315 void rmdir_rollback::decode(bufferlist::const_iterator
& bl
)
2317 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
2319 decode(src_dir
, bl
);
2320 decode(src_dname
, bl
);
2321 decode(dest_dir
, bl
);
2322 decode(dest_dname
, bl
);
// Describe the rollback record through Formatter `f`: "metareqid",
// "source directory", "source dname", "destination directory" and
// "destination dname".
2328 void rmdir_rollback::dump(Formatter
*f
) const
2330 f
->dump_stream("metareqid") << reqid
;
2331 f
->dump_stream("source directory") << src_dir
;
2332 f
->dump_string("source dname", src_dname
);
2333 f
->dump_stream("destination directory") << dest_dir
;
2334 f
->dump_string("destination dname", dest_dname
);
// Append one default-constructed rmdir_rollback to `ls`.
2337 void rmdir_rollback::generate_test_instances(std::list
<rmdir_rollback
*>& ls
)
2339 ls
.push_back(new rmdir_rollback());
// Serialize one rename dentry record into `bl` inside an
// ENCODE_START(2, 2, ...) envelope. Fields visible here, in order: dirfrag,
// dirfrag_old_mtime, dirfrag_old_rctime, remote_ino, remote_d_type,
// old_ctime.
2342 void rename_rollback::drec::encode(bufferlist
&bl
) const
2344 ENCODE_START(2, 2, bl
);
2345 encode(dirfrag
, bl
);
2346 encode(dirfrag_old_mtime
, bl
);
2347 encode(dirfrag_old_rctime
, bl
);
2349 encode(remote_ino
, bl
);
2351 encode(remote_d_type
, bl
);
2352 encode(old_ctime
, bl
);
// Deserialize one rename dentry record from `bl`
// (DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, ...)). Fields visible here are
// read in encode() order: dirfrag, dirfrag_old_mtime, dirfrag_old_rctime,
// remote_ino, remote_d_type, old_ctime.
2356 void rename_rollback::drec::decode(bufferlist::const_iterator
&bl
)
2358 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2359 decode(dirfrag
, bl
);
2360 decode(dirfrag_old_mtime
, bl
);
2361 decode(dirfrag_old_rctime
, bl
);
2363 decode(remote_ino
, bl
);
2365 decode(remote_d_type
, bl
);
2366 decode(old_ctime
, bl
);
// Describe one rename dentry record through Formatter `f`: directory
// fragment and its old mtime/rctime, "ino", "remote ino", "dname", a textual
// "remote dtype", and "old ctime".
// remote_d_type is converted with DTTOIF() and masked with S_IFMT; the
// resulting type_string is one of "file", "symlink", "directory", or
// "UNKNOWN-<n>".
// NOTE(review): the case labels selecting each string are not visible in this
// view -- confirm which S_IF* value maps to which string.
2370 void rename_rollback::drec::dump(Formatter
*f
) const
2372 f
->dump_stream("directory fragment") << dirfrag
;
2373 f
->dump_stream("directory old mtime") << dirfrag_old_mtime
;
2374 f
->dump_stream("directory old rctime") << dirfrag_old_rctime
;
2375 f
->dump_int("ino", ino
);
2376 f
->dump_int("remote ino", remote_ino
);
2377 f
->dump_string("dname", dname
);
2378 uint32_t type
= DTTOIF(remote_d_type
) & S_IFMT
; // convert to type entries
2382 type_string
= "file"; break;
2384 type_string
= "symlink"; break;
2386 type_string
= "directory"; break;
2388 type_string
= "UNKNOWN-" + stringify((int)type
); break;
2390 f
->dump_string("remote dtype", type_string
);
2391 f
->dump_stream("old ctime") << old_ctime
;
// Append one drec to `ls` with remote_d_type set to IFTODT(S_IFREG).
2394 void rename_rollback::drec::generate_test_instances(std::list
<drec
*>& ls
)
2396 ls
.push_back(new drec());
2397 ls
.back()->remote_d_type
= IFTODT(S_IFREG
);
// Serialize rename rollback state into `bl` inside an
// ENCODE_START(3, 2, ...) envelope. Fields visible here, in order: orig_src,
// orig_dest, srci_snapbl, desti_snapbl.
2400 void rename_rollback::encode(bufferlist
&bl
) const
2402 ENCODE_START(3, 2, bl
);
2404 encode(orig_src
, bl
);
2405 encode(orig_dest
, bl
);
2408 encode(srci_snapbl
, bl
);
2409 encode(desti_snapbl
, bl
);
// Deserialize rename rollback state from `bl`
// (DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, ...)). orig_src and orig_dest are
// always read; srci_snapbl and desti_snapbl are read only when the encoded
// struct version is at least 3.
2413 void rename_rollback::decode(bufferlist::const_iterator
&bl
)
2415 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl
);
2417 decode(orig_src
, bl
);
2418 decode(orig_dest
, bl
);
// The snap blobs are only read from encodings of version 3 or later.
2421 if (struct_v
>= 3) {
2422 decode(srci_snapbl
, bl
);
2423 decode(desti_snapbl
, bl
);
// Describe the rollback record through Formatter `f`: "request id", three
// object sections ("original src drec", "original dest drec", "stray drec"),
// and "ctime".
// NOTE(review): the calls that fill each drec section are not visible in this
// view.
2428 void rename_rollback::dump(Formatter
*f
) const
2430 f
->dump_stream("request id") << reqid
;
2431 f
->open_object_section("original src drec");
2433 f
->close_section(); // original src drec
2434 f
->open_object_section("original dest drec");
2436 f
->close_section(); // original dest drec
2437 f
->open_object_section("stray drec");
2439 f
->close_section(); // stray drec
2440 f
->dump_stream("ctime") << ctime
;
// Append one rename_rollback to `ls` whose orig_src, orig_dest and stray
// records all have remote_d_type set to IFTODT(S_IFREG).
2443 void rename_rollback::generate_test_instances(std::list
<rename_rollback
*>& ls
)
2445 ls
.push_back(new rename_rollback());
2446 ls
.back()->orig_src
.remote_d_type
= IFTODT(S_IFREG
);
2447 ls
.back()->orig_dest
.remote_d_type
= IFTODT(S_IFREG
);
2448 ls
.back()->stray
.remote_d_type
= IFTODT(S_IFREG
);
// Serialize this ESlaveUpdate into `bl` inside an ENCODE_START(3, 3, ...)
// envelope. The `commit` blob is encoded with `features`; `rollback` is
// encoded without them.
2451 void ESlaveUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2453 ENCODE_START(3, 3, bl
);
2460 encode(commit
, bl
, features
);
2461 encode(rollback
, bl
);
// Deserialize an ESlaveUpdate from `bl`
// (DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, ...)); `rollback` is the only
// field decode visible in this view.
2465 void ESlaveUpdate::decode(bufferlist::const_iterator
&bl
)
2467 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2476 decode(rollback
, bl
);
// Describe this event through Formatter `f`: a "metablob" object section,
// then "rollback length", "type", "metareqid", "master", "op" and
// "original op".
2480 void ESlaveUpdate::dump(Formatter
*f
) const
2482 f
->open_object_section("metablob");
2484 f
->close_section(); // metablob
2486 f
->dump_int("rollback length", rollback
.length());
2487 f
->dump_string("type", type
);
2488 f
->dump_stream("metareqid") << reqid
;
2489 f
->dump_int("master", master
);
2490 f
->dump_int("op", op
);
2491 f
->dump_int("original op", origop
);
// Append one default-constructed ESlaveUpdate to `ls`.
2494 void ESlaveUpdate::generate_test_instances(std::list
<ESlaveUpdate
*>& ls
)
2496 ls
.push_back(new ESlaveUpdate());
// Replay an ESlaveUpdate according to its op:
//  - OP_PREPARE:  build an MDSlaveUpdate holding the rollback blob, replay
//                 the commit blob with it, and register it with mdcache as an
//                 uncommitted slave update for (reqid, master).
//  - OP_COMMIT:   look up the saved slave update; finish it, or log that the
//                 commit is ignored because no prepare was saved.
//  - OP_ROLLBACK: replay the commit blob (which carries the rollback), look
//                 up the saved slave update and finish it.
//  - otherwise:   report "invalid op in ESlaveUpdate" to the cluster log; the
//                 trailing ceph_abort() is documented as unreachable.
// NOTE(review): the switch statement, the null checks on `su` and the call
// preceding ceph_abort() are not visible in this view -- confirm.
2500 void ESlaveUpdate::replay(MDSRank
*mds
)
2503 auto&& segment
= get_segment();
2505 case ESlaveUpdate::OP_PREPARE
:
2506 dout(10) << "ESlaveUpdate.replay prepare " << reqid
<< " for mds." << master
2507 << ": applying commit, saving rollback info" << dendl
;
2508 su
= new MDSlaveUpdate(origop
, rollback
, segment
->slave_updates
);
2509 commit
.replay(mds
, segment
, su
);
2510 mds
->mdcache
->add_uncommitted_slave_update(reqid
, master
, su
);
2513 case ESlaveUpdate::OP_COMMIT
:
2514 su
= mds
->mdcache
->get_uncommitted_slave_update(reqid
, master
);
2516 dout(10) << "ESlaveUpdate.replay commit " << reqid
<< " for mds." << master
<< dendl
;
2517 mds
->mdcache
->finish_uncommitted_slave_update(reqid
, master
);
2519 dout(10) << "ESlaveUpdate.replay commit " << reqid
<< " for mds." << master
2520 << ": ignoring, no previously saved prepare" << dendl
;
2524 case ESlaveUpdate::OP_ROLLBACK
:
2525 dout(10) << "ESlaveUpdate.replay abort " << reqid
<< " for mds." << master
2526 << ": applying rollback commit blob" << dendl
;
2527 commit
.replay(mds
, segment
);
2528 su
= mds
->mdcache
->get_uncommitted_slave_update(reqid
, master
);
2530 mds
->mdcache
->finish_uncommitted_slave_update(reqid
, master
);
2534 mds
->clog
->error() << "invalid op in ESlaveUpdate";
2536 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2541 // -----------------------
// Serialize this ESubtreeMap into `bl` inside an ENCODE_START(6, 5, ...)
// envelope. Fields visible here, in order: metablob (with `features`),
// subtrees, ambiguous_subtrees, expire_pos, event_seq.
2544 void ESubtreeMap::encode(bufferlist
& bl
, uint64_t features
) const
2546 ENCODE_START(6, 5, bl
);
2548 encode(metablob
, bl
, features
);
2549 encode(subtrees
, bl
);
2550 encode(ambiguous_subtrees
, bl
);
2551 encode(expire_pos
, bl
);
2552 encode(event_seq
, bl
);
// Deserialize an ESubtreeMap from `bl`
// (DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, ...)). Fields visible here are
// read in encode() order: metablob, subtrees, ambiguous_subtrees, expire_pos,
// event_seq.
// NOTE(review): any struct_v guards around the later fields are not visible
// in this view.
2556 void ESubtreeMap::decode(bufferlist::const_iterator
&bl
)
2558 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl
);
2561 decode(metablob
, bl
);
2562 decode(subtrees
, bl
);
2564 decode(ambiguous_subtrees
, bl
);
2566 decode(expire_pos
, bl
);
2568 decode(event_seq
, bl
);
// Describe this event through Formatter `f`:
//  - a "metablob" object section;
//  - a "subtrees" array with one "tree" object per entry, holding the
//    "root dirfrag" and a "bound dirfrag" for each bound;
//  - an "ambiguous subtrees" array of "dirfrag" entries;
//  - the integer "expire position".
2572 void ESubtreeMap::dump(Formatter
*f
) const
2574 f
->open_object_section("metablob");
2576 f
->close_section(); // metablob
2578 f
->open_array_section("subtrees");
2579 for(map
<dirfrag_t
,vector
<dirfrag_t
> >::const_iterator i
= subtrees
.begin();
2580 i
!= subtrees
.end(); ++i
) {
2581 f
->open_object_section("tree");
2582 f
->dump_stream("root dirfrag") << i
->first
;
2583 for (vector
<dirfrag_t
>::const_iterator j
= i
->second
.begin();
2584 j
!= i
->second
.end(); ++j
) {
2585 f
->dump_stream("bound dirfrag") << *j
;
2587 f
->close_section(); // tree
2589 f
->close_section(); // subtrees
2591 f
->open_array_section("ambiguous subtrees");
2592 for(set
<dirfrag_t
>::const_iterator i
= ambiguous_subtrees
.begin();
2593 i
!= ambiguous_subtrees
.end(); ++i
) {
2594 f
->dump_stream("dirfrag") << *i
;
2596 f
->close_section(); // ambiguous subtrees
2598 f
->dump_int("expire position", expire_pos
);
// Append one default-constructed ESubtreeMap to `ls`.
2601 void ESubtreeMap::generate_test_instances(std::list
<ESubtreeMap
*>& ls
)
2603 ls
.push_back(new ESubtreeMap());
// Replay an ESubtreeMap.
//
// First the journaler's expire position is advanced when this event carries a
// later one. Then there are two paths:
//  - mdcache already has subtrees: the journaled map is verified against the
//    cache. Each mismatch (root missing, not a subtree, not owned by this
//    rank, bound missing / not a bound / extra bound, ambiguity disagreement,
//    cache subtree absent from the journal) is reported to the cluster log.
//    The journaled maps and cache subtrees are dumped, and the function
//    asserts that no errors were found when mds_debug_subtrees is set.
//  - otherwise: the metablob is replayed and the subtree auth map is rebuilt
//    from `subtrees` / `ambiguous_subtrees`, followed by recalc_auth_bits().
// NOTE(review): the error counter updates, `continue`/`return` statements and
// the declaration of `bounds` are not visible in this view -- confirm.
2606 void ESubtreeMap::replay(MDSRank
*mds
)
// Only ever move the journaler's expire position forward.
2608 if (expire_pos
&& expire_pos
> mds
->mdlog
->journaler
->get_expire_pos())
2609 mds
->mdlog
->journaler
->set_expire_pos(expire_pos
);
2611 // suck up the subtree map?
2612 if (mds
->mdcache
->is_subtrees()) {
2613 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl
;
// Check every journaled subtree root, and its bounds, against the cache.
2616 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2617 p
!= subtrees
.end();
2619 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2621 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2622 << " subtree root " << p
->first
<< " not in cache";
2627 if (!mds
->mdcache
->is_subtree(dir
)) {
2628 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2629 << " subtree root " << p
->first
<< " not a subtree in cache";
2633 if (dir
->get_dir_auth().first
!= mds
->get_nodeid()) {
2634 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2635 << " subtree root " << p
->first
2636 << " is not mine in cache (it's " << dir
->get_dir_auth() << ")";
2641 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
2642 mds
->mdcache
->get_force_dirfrag(*q
, true);
2645 mds
->mdcache
->get_subtree_bounds(dir
, bounds
);
2646 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
2647 CDir
*b
= mds
->mdcache
->get_dirfrag(*q
);
2649 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2650 << " subtree " << p
->first
<< " bound " << *q
<< " not in cache";
2654 if (bounds
.count(b
) == 0) {
2655 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2656 << " subtree " << p
->first
<< " bound " << *q
<< " not a bound in cache";
// Report each cache bound that appears in the "extra bound" message.
2662 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
) {
2663 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2664 << " subtree " << p
->first
<< " has extra bound in cache " << (*q
)->dirfrag();
// The journal and the cache must agree on whether the subtree is ambiguous.
2668 if (ambiguous_subtrees
.count(p
->first
)) {
2669 if (!mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2670 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2671 << " subtree " << p
->first
<< " is ambiguous but is not in our cache";
2675 if (mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2676 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2677 << " subtree " << p
->first
<< " is not ambiguous but is in our cache";
// Reverse check: subtrees in the cache that the journaled map omits.
2683 std::vector
<CDir
*> dirs
;
2684 mds
->mdcache
->get_subtrees(dirs
);
2685 for (const auto& dir
: dirs
) {
2686 if (dir
->get_dir_auth().first
!= mds
->get_nodeid())
2688 if (subtrees
.count(dir
->dirfrag()) == 0) {
2689 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2690 << " does not include cache subtree " << dir
->dirfrag();
// Dump both views, then fail hard only when mds_debug_subtrees is enabled.
2696 dout(0) << "journal subtrees: " << subtrees
<< dendl
;
2697 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees
<< dendl
;
2698 mds
->mdcache
->show_subtrees();
2699 ceph_assert(!g_conf()->mds_debug_subtrees
|| errors
== 0);
2704 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl
;
2706 // first, stick the spanning tree in my cache
2707 //metablob.print(*_dout);
2708 metablob
.replay(mds
, get_segment());
2710 // restore import/export maps
2711 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2712 p
!= subtrees
.end();
2714 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2716 if (ambiguous_subtrees
.count(p
->first
)) {
2718 mds
->mdcache
->add_ambiguous_import(p
->first
, p
->second
);
2719 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
,
2720 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2723 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
, mds
->get_nodeid());
2727 mds
->mdcache
->recalc_auth_bits(true);
2729 mds
->mdcache
->show_subtrees();
2734 // -----------------------
// Replay an EFragment (directory fragmentation) event for (ino, basefrag).
//
// Calls visible here:
//  - add_uncommitted_fragment() followed by adjust_dir_fragments() with
//    `bits`;
//  - a rollback path that collects the current leaves under basefrag, then
//    either undoes the split with adjust_dir_fragments(-bits) (old-format
//    events with no orig_frags) or forces each original fragment back, and
//    calls rollback_uncommitted_fragment();
//  - finish_uncommitted_fragment();
//  - finally the metablob is replayed and, with mds_debug_frag set and the
//    inode present, the inode's dirfrags are verified.
// NOTE(review): the switch over `op` and the null checks on `in` that select
// between these paths are not visible in this view -- confirm.
2737 void EFragment::replay(MDSRank
*mds
)
2739 dout(10) << "EFragment.replay " << op_name(op
) << " " << ino
<< " " << basefrag
<< " by " << bits
<< dendl
;
2741 std::vector
<CDir
*> resultfrags
;
2742 MDSContext::vec waiters
;
2744 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2745 // it will be once we replay the metablob , but first we need to
2746 // refragment anything we already have in the cache.
2747 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2749 auto&& segment
= get_segment();
2752 mds
->mdcache
->add_uncommitted_fragment(dirfrag_t(ino
, basefrag
), bits
, orig_frags
, segment
, &rollback
);
2755 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, bits
, &resultfrags
, waiters
, true);
2759 frag_vec_t old_frags
;
2761 in
->dirfragtree
.get_leaves_under(basefrag
, old_frags
);
2762 if (orig_frags
.empty()) {
2763 // old format EFragment
2764 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, -bits
, &resultfrags
, waiters
, true);
2766 for (const auto& fg
: orig_frags
)
2767 mds
->mdcache
->force_dir_fragment(in
, fg
);
2770 mds
->mdcache
->rollback_uncommitted_fragment(dirfrag_t(ino
, basefrag
), std::move(old_frags
));
2776 mds
->mdcache
->finish_uncommitted_fragment(dirfrag_t(ino
, basefrag
), op
);
2783 metablob
.replay(mds
, segment
);
2784 if (in
&& g_conf()->mds_debug_frag
)
2785 in
->verify_dirfrags();
// Serialize this EFragment into `bl` inside an ENCODE_START(5, 4, ...)
// envelope. Fields visible here, in order: basefrag, metablob (with
// `features`), orig_frags, rollback.
2788 void EFragment::encode(bufferlist
&bl
, uint64_t features
) const {
2789 ENCODE_START(5, 4, bl
);
2793 encode(basefrag
, bl
);
2795 encode(metablob
, bl
, features
);
2796 encode(orig_frags
, bl
);
2797 encode(rollback
, bl
);
// Deserialize an EFragment from `bl`
// (DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, ...)). basefrag and metablob are
// always read; orig_frags and rollback are read only when the encoded struct
// version is at least 5.
2801 void EFragment::decode(bufferlist::const_iterator
&bl
) {
2802 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl
);
2808 decode(basefrag
, bl
);
2810 decode(metablob
, bl
);
// orig_frags and rollback are only read from encodings of version 5 or later.
2811 if (struct_v
>= 5) {
2812 decode(orig_frags
, bl
);
2813 decode(rollback
, bl
);
// Describe this event through Formatter `f`: "op" (via op_name()), "ino",
// "base frag" and "bits". Dumping of the metablob is commented out.
2818 void EFragment::dump(Formatter
*f
) const
2820 /*f->open_object_section("Metablob");
2821 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2822 f->close_section();*/
2823 f
->dump_string("op", op_name(op
));
2824 f
->dump_stream("ino") << ino
;
2825 f
->dump_stream("base frag") << basefrag
;
2826 f
->dump_int("bits", bits
);
// Append two EFragment instances to `ls`: a default one, and a second with
// op set to OP_PREPARE and bits set to 5.
// NOTE(review): other field assignments on the second instance may exist but
// are not visible in this view.
2829 void EFragment::generate_test_instances(std::list
<EFragment
*>& ls
)
2831 ls
.push_back(new EFragment
);
2832 ls
.push_back(new EFragment
);
2833 ls
.back()->op
= OP_PREPARE
;
2835 ls
.back()->bits
= 5;
// Serialize dirfrag rollback state into `bl`; the envelope is opened with
// ENCODE_START(1, 1, ...).
// NOTE(review): the field encodes are not visible in this view.
2838 void dirfrag_rollback::encode(bufferlist
&bl
) const
2840 ENCODE_START(1, 1, bl
);
// Deserialize dirfrag rollback state from `bl`; the envelope is opened with
// DECODE_START(1, ...).
// NOTE(review): the field decodes are not visible in this view.
2845 void dirfrag_rollback::decode(bufferlist::const_iterator
&bl
)
2847 DECODE_START(1, bl
);
2854 // =========================================================================
2856 // -----------------------
// Replay an EExport: replay the metablob, resolve the exported base dirfrag
// and each bound dirfrag in mdcache, mark the bounded subtree's authority as
// CDIR_AUTH_UNDEF, and try to trim the now non-auth subtree from the cache.
// NOTE(review): the loop header's remaining clauses and any null checks on
// `dir` / `bd` are not visible in this view.
2859 void EExport::replay(MDSRank
*mds
)
2861 dout(10) << "EExport.replay " << base
<< dendl
;
2862 auto&& segment
= get_segment();
2863 metablob
.replay(mds
, segment
);
2865 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
// Translate the journaled bound dirfrag ids into cached CDir pointers.
2868 set
<CDir
*> realbounds
;
2869 for (set
<dirfrag_t
>::iterator p
= bounds
.begin();
2872 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2874 realbounds
.insert(bd
);
2878 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
, CDIR_AUTH_UNDEF
);
2880 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
// Serialize this EExport into `bl` inside an ENCODE_START(4, 3, ...)
// envelope; the metablob is encoded with `features`.
// NOTE(review): the remaining field encodes are not visible in this view.
2883 void EExport::encode(bufferlist
& bl
, uint64_t features
) const
2885 ENCODE_START(4, 3, bl
);
2887 encode(metablob
, bl
, features
);
// Deserialize an EExport from `bl`; the metablob decode is the only field
// read visible in this view.
// NOTE(review): EExport::encode() opens with ENCODE_START(4, 3, ...) while
// this decoder passes 3 as its own version to DECODE_START_LEGACY_COMPAT_LEN
// -- confirm the mismatch is intentional.
2894 void EExport::decode(bufferlist::const_iterator
&bl
)
2896 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2899 decode(metablob
, bl
);
// Describe this event through Formatter `f`: "stamp" as a double,
// "base dirfrag", and a "bounds dirfrags" array with one "dirfrag" entry per
// bound. Dumping of the metablob is commented out.
2907 void EExport::dump(Formatter
*f
) const
2909 f
->dump_float("stamp", (double)stamp
);
2910 /*f->open_object_section("Metablob");
2911 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2912 f->close_section();*/
2913 f
->dump_stream("base dirfrag") << base
;
2914 f
->open_array_section("bounds dirfrags");
2915 for (set
<dirfrag_t
>::const_iterator i
= bounds
.begin();
2916 i
!= bounds
.end(); ++i
) {
2917 f
->dump_stream("dirfrag") << *i
;
2919 f
->close_section(); // bounds dirfrags
// Append one default-constructed EExport to `ls`.
2922 void EExport::generate_test_instances(std::list
<EExport
*>& ls
)
2924 EExport
*sample
= new EExport();
2925 ls
.push_back(sample
);
2929 // -----------------------
// Record this event's client-map version (cmapv) as the owning log segment's
// sessionmapv.
2932 void EImportStart::update_segment()
2934 get_segment()->sessionmapv
= cmapv
;
// Replay an EImportStart:
//  - replay the metablob into this event's segment;
//  - register (base, bounds) as an ambiguous import in mdcache;
//  - resolve the base and bound dirfrags, clearing STATE_AUTH on bounds that
//    are not subtree roots, and set the bounded subtree's authority to
//    (this rank, this rank);
//  - if the sessionmap is older than cmapv, call replay_open_sessions() with
//    the client maps; otherwise log a no-op.
// NOTE(review): the loop header's remaining clauses, null checks, and the
// decoding of `cm`/`cmm` from `blp` are not visible in this view.
2937 void EImportStart::replay(MDSRank
*mds
)
2939 dout(10) << "EImportStart.replay " << base
<< " bounds " << bounds
<< dendl
;
2940 //metablob.print(*_dout);
2941 auto&& segment
= get_segment();
2942 metablob
.replay(mds
, segment
);
2944 // put in ambiguous import list
2945 mds
->mdcache
->add_ambiguous_import(base
, bounds
);
2947 // set auth partially to us so we don't trim it
2948 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2951 set
<CDir
*> realbounds
;
2952 for (vector
<dirfrag_t
>::iterator p
= bounds
.begin();
2955 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2957 if (!bd
->is_subtree_root())
2958 bd
->state_clear(CDir::STATE_AUTH
);
2959 realbounds
.insert(bd
);
2962 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
,
2963 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2965 // open client sessions?
2966 if (mds
->sessionmap
.get_version() >= cmapv
) {
2967 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2968 << " >= " << cmapv
<< ", noop" << dendl
;
2970 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2971 << " < " << cmapv
<< dendl
;
2972 map
<client_t
,entity_inst_t
> cm
;
2973 map
<client_t
,client_metadata_t
> cmm
;
2974 auto blp
= client_map
.cbegin();
2979 mds
->sessionmap
.replay_open_sessions(cmapv
, cm
, cmm
);
// Serialize this EImportStart into `bl` inside an ENCODE_START(4, 3, ...)
// envelope. The metablob is encoded with `features`; client_map is encoded
// without them.
2984 void EImportStart::encode(bufferlist
&bl
, uint64_t features
) const {
2985 ENCODE_START(4, 3, bl
);
2988 encode(metablob
, bl
, features
);
2991 encode(client_map
, bl
);
// Deserialize an EImportStart from `bl`: opens the envelope, then reads the
// metablob and client_map.
// NOTE(review): EImportStart::encode() opens with ENCODE_START(4, 3, ...)
// while this decoder passes 3 as its own version to
// DECODE_START_LEGACY_COMPAT_LEN -- confirm the mismatch is intentional.
2996 void EImportStart::decode(bufferlist::const_iterator
&bl
) {
2997 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
3001 decode(metablob
, bl
);
3004 decode(client_map
, bl
);
// Describe this event through Formatter `f`: "base dirfrag" and a
// "boundary dirfrags" array with one "frag" entry per bound.
3010 void EImportStart::dump(Formatter
*f
) const
3012 f
->dump_stream("base dirfrag") << base
;
3013 f
->open_array_section("boundary dirfrags");
3014 for (vector
<dirfrag_t
>::const_iterator iter
= bounds
.begin();
3015 iter
!= bounds
.end(); ++iter
) {
3016 f
->dump_stream("frag") << *iter
;
// Append one default-constructed EImportStart to `ls`.
3021 void EImportStart::generate_test_instances(std::list
<EImportStart
*>& ls
)
3023 ls
.push_back(new EImportStart
);
3026 // -----------------------
// Replay an EImportFinish for `base`.
//
// When mdcache has an ambiguous import for `base`, two resolutions are
// visible: finish_ambiguous_import(), or a cancel path that fetches the
// import's bounds, marks the bounded subtree CDIR_AUTH_UNDEF, cancels the
// ambiguous import and tries to trim the non-auth subtree.
// When there is no ambiguous import, the event is logged as unexpected, an
// error is sent to the cluster log, and the trailing ceph_abort() is
// documented as unreachable.
// NOTE(review): the test on `success` that chooses between finish and cancel,
// and the call preceding ceph_abort(), are not visible in this view.
3029 void EImportFinish::replay(MDSRank
*mds
)
3031 if (mds
->mdcache
->have_ambiguous_import(base
)) {
3032 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
<< dendl
;
3034 mds
->mdcache
->finish_ambiguous_import(base
);
3036 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
3038 vector
<dirfrag_t
> bounds
;
3039 mds
->mdcache
->get_ambiguous_import_bounds(base
, bounds
);
3040 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, bounds
, CDIR_AUTH_UNDEF
);
3041 mds
->mdcache
->cancel_ambiguous_import(dir
);
3042 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
3045 // this shouldn't happen unless this is an old journal
3046 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
3047 << " on subtree not marked as ambiguous"
3049 mds
->clog
->error() << "failure replaying journal (EImportFinish)";
3051 ceph_abort(); // Should be unreachable because damaged() calls respawn()
// Serialize this EImportFinish into `bl` inside an ENCODE_START(3, 3, ...)
// envelope; `success` is the field encode visible in this view.
3055 void EImportFinish::encode(bufferlist
& bl
, uint64_t features
) const
3057 ENCODE_START(3, 3, bl
);
3060 encode(success
, bl
);
// Deserialize an EImportFinish from `bl`
// (DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, ...)); `success` is the field
// decode visible in this view.
3064 void EImportFinish::decode(bufferlist::const_iterator
&bl
)
3066 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
3070 decode(success
, bl
);
// Describe this event through Formatter `f`: "base dirfrag" and "success"
// (rendered as the string "true" or "false").
3074 void EImportFinish::dump(Formatter
*f
) const
3076 f
->dump_stream("base dirfrag") << base
;
3077 f
->dump_string("success", success
? "true" : "false");
// Append two EImportFinish instances to `ls`: a default one, and a second
// with success set to true.
3079 void EImportFinish::generate_test_instances(std::list
<EImportFinish
*>& ls
)
3081 ls
.push_back(new EImportFinish
);
3082 ls
.push_back(new EImportFinish
);
3083 ls
.back()->success
= true;
3087 // ------------------------
// Serialize this EResetJournal into `bl`; the envelope is opened with
// ENCODE_START(2, 2, ...).
// NOTE(review): the field encodes are not visible in this view.
3090 void EResetJournal::encode(bufferlist
& bl
, uint64_t features
) const
3092 ENCODE_START(2, 2, bl
);
// Deserialize an EResetJournal from `bl`; the envelope is opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, ...).
// NOTE(review): the field decodes are not visible in this view.
3097 void EResetJournal::decode(bufferlist::const_iterator
&bl
)
3099 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
// Describe this event through Formatter `f` as a single streamed "timestamp"
// field taken from `stamp`.
3104 void EResetJournal::dump(Formatter
*f
) const
3106 f
->dump_stream("timestamp") << stamp
;
// Append one default-constructed EResetJournal to `ls`.
3109 void EResetJournal::generate_test_instances(std::list
<EResetJournal
*>& ls
)
3111 ls
.push_back(new EResetJournal());
// Replay a journal reset: wipe the session map, reset the inode table's
// replay state, then re-establish this rank as authority for its subtrees.
// When this rank is the mdsmap's root rank, the root inode's dirfrag is
// opened and made an auth subtree; the same is done for the rank's own
// "my" inode dirfrag. Finally the auth bits are recalculated and the subtree
// map is shown.
3114 void EResetJournal::replay(MDSRank
*mds
)
3116 dout(1) << "EResetJournal" << dendl
;
3118 mds
->sessionmap
.wipe();
3119 mds
->inotable
->replay_reset();
// Only the rank that holds the filesystem root claims the root dirfrag.
3121 if (mds
->mdsmap
->get_root() == mds
->get_nodeid()) {
3122 CDir
*rootdir
= mds
->mdcache
->get_root()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3123 mds
->mdcache
->adjust_subtree_auth(rootdir
, mds
->get_nodeid());
3126 CDir
*mydir
= mds
->mdcache
->get_myin()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3127 mds
->mdcache
->adjust_subtree_auth(mydir
, mds
->get_nodeid());
3129 mds
->mdcache
->recalc_auth_bits(true);
3131 mds
->mdcache
->show_subtrees();
// Serialize this ENoOp into `bl` inside an ENCODE_START(2, 2, ...) envelope:
// pad_size is encoded first, then a loop runs pad_size times with a 0xff pad
// byte in scope.
// NOTE(review): the loop body is not visible in this view; presumably it
// appends `pad` once per iteration -- confirm.
3135 void ENoOp::encode(bufferlist
&bl
, uint64_t features
) const
3137 ENCODE_START(2, 2, bl
);
3138 encode(pad_size
, bl
);
3139 uint8_t const pad
= 0xff;
3140 for (unsigned int i
= 0; i
< pad_size
; ++i
) {
// Deserialize an ENoOp from `bl`: read pad_size, then require that exactly
// pad_size bytes remain in the iterator. A mismatch throws
// buffer::end_of_buffer rather than asserting, so that journal debug tools
// can flag the entry as malformed.
3147 void ENoOp::decode(bufferlist::const_iterator
&bl
)
3149 DECODE_START(2, bl
);
3150 decode(pad_size
, bl
);
3151 if (bl
.get_remaining() != pad_size
) {
3152 // This is spiritually an assertion, but expressing in a way that will let
3153 // journal debug tools catch it and recognise a malformed entry.
3154 throw buffer::end_of_buffer();
// Replaying an ENoOp only logs (at level 4) how many padding bytes were
// skipped; no other statement is visible here.
3162 void ENoOp::replay(MDSRank
*mds
)
3164 dout(4) << "ENoOp::replay, " << pad_size
<< " bytes skipped in journal" << dendl
;
3168 * If re-formatting an old journal that used absolute log position
3169 * references as segment sequence numbers, use this function to update
3173 * MDSRank instance, just used for logging
3175 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3178 * True if the event was modified.
// Rebuild truncate_finish using a mapping from old to new segment sequence
// numbers.
//
// For each (ino, seq) entry: when `seq` has a mapping in `old_to_new`, the
// entry is re-emitted with the mapped sequence number; an entry with no
// mapping is copied unchanged. The rebuilt map is then swapped into
// truncate_finish.
//
// `mds` is used by the logging macros here; `modified` starts out false.
// NOTE(review): the statement that sets `modified` and the function's return
// are not visible in this view.
3180 bool EMetaBlob::rewrite_truncate_finish(MDSRank
const *mds
,
3181 std::map
<LogSegment::seq_t
, LogSegment::seq_t
> const &old_to_new
)
3183 bool modified
= false;
3184 map
<inodeno_t
, LogSegment::seq_t
> new_trunc_finish
;
3185 for (const auto& p
: truncate_finish
) {
3186 auto q
= old_to_new
.find(p
.second
);
3187 if (q
!= old_to_new
.end()) {
3188 dout(20) << __func__
<< " applying segment seq mapping "
3189 << p
.second
<< " -> " << q
->second
<< dendl
;
3190 new_trunc_finish
.emplace(p
.first
, q
->second
);
3193 dout(20) << __func__
<< " no segment seq mapping found for "
3194 << p
.second
<< dendl
;
3195 new_trunc_finish
.insert(p
);
3198 truncate_finish
.swap(new_trunc_finish
);