1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
25 #include "events/EUpdate.h"
26 #include "events/ESlaveUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
30 #include "events/EExport.h"
31 #include "events/EImportStart.h"
32 #include "events/EImportFinish.h"
33 #include "events/EFragment.h"
35 #include "events/ETableClient.h"
36 #include "events/ETableServer.h"
38 #include "include/stringify.h"
40 #include "LogSegment.h"
50 #include "MDSTableClient.h"
51 #include "MDSTableServer.h"
55 #define dout_context g_ceph_context
56 #define dout_subsys ceph_subsys_mds
58 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
61 // -----------------------
// Try to expire this log segment. For each kind of outstanding state handled
// below (dirty dirfrags, dentries and inodes, uncommitted masters and
// fragments, scatterlocks, open files, backtraces, slave updates, table
// versions, pending table commits, truncating inodes) a sub-context is taken
// from gather_bld and handed to whatever has to finish first.
//   mds        - rank whose mdcache, locker, mdlog, inotable and sessionmap
//                are consulted.
//   gather_bld - builder collecting one sub-context per outstanding wait; the
//                function logs "waiting" when it has subs and "success"
//                otherwise.
//   op_prio    - forwarded to CDir::commit() and CInode::store_backtrace().
// NOTE(review): this listing is missing lines (braces, some declarations and
// loop bodies); the comments describe only the code that is visible.
64 void LogSegment::try_to_expire(MDSRank
*mds
, MDSGatherBuilder
&gather_bld
, int op_prio
)
68 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire" << dendl
;
// mds_kill_journal_expire_at values 1 through 5 each trip an assert at a
// successive point in this function (presumably for failure injection).
70 assert(g_conf
->mds_kill_journal_expire_at
!= 1);
// New and dirty dirfrags must be auth on this rank.
73 for (elist
<CDir
*>::iterator p
= new_dirfrags
.begin(); !p
.end(); ++p
) {
74 dout(20) << " new_dirfrag " << **p
<< dendl
;
75 assert((*p
)->is_auth());
78 for (elist
<CDir
*>::iterator p
= dirty_dirfrags
.begin(); !p
.end(); ++p
) {
79 dout(20) << " dirty_dirfrag " << **p
<< dendl
;
80 assert((*p
)->is_auth());
// Dirty dentries: the containing dir is added to the commit set.
83 for (elist
<CDentry
*>::iterator p
= dirty_dentries
.begin(); !p
.end(); ++p
) {
84 dout(20) << " dirty_dentry " << **p
<< dendl
;
85 assert((*p
)->is_auth());
86 commit
.insert((*p
)->get_dir());
// Dirty inodes: base inodes are written with store(); the visible code also
// adds the dir of the parent dentry to the commit set (presumably in an else
// branch that is not shown here).
88 for (elist
<CInode
*>::iterator p
= dirty_inodes
.begin(); !p
.end(); ++p
) {
89 dout(20) << " dirty_inode " << **p
<< dendl
;
90 assert((*p
)->is_auth());
91 if ((*p
)->is_base()) {
92 (*p
)->store(gather_bld
.new_sub());
94 commit
.insert((*p
)->get_parent_dn()->get_dir());
// Each collected dir is committed when it can be auth pinned; the other
// visible path registers a WAIT_UNFREEZE waiter instead.
97 if (!commit
.empty()) {
98 for (set
<CDir
*>::iterator p
= commit
.begin();
102 assert(dir
->is_auth());
103 if (dir
->can_auth_pin()) {
104 dout(15) << "try_to_expire committing " << *dir
<< dendl
;
105 dir
->commit(0, gather_bld
.new_sub(), false, op_prio
);
107 dout(15) << "try_to_expire waiting for unfreeze on " << *dir
<< dendl
;
108 dir
->add_waiter(CDir::WAIT_UNFREEZE
, gather_bld
.new_sub());
113 // master ops with possibly uncommitted slaves
114 for (set
<metareqid_t
>::iterator p
= uncommitted_masters
.begin();
115 p
!= uncommitted_masters
.end();
117 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p
<< dendl
;
118 mds
->mdcache
->wait_for_uncommitted_master(*p
, gather_bld
.new_sub());
121 // uncommitted fragments
122 for (set
<dirfrag_t
>::iterator p
= uncommitted_fragments
.begin();
123 p
!= uncommitted_fragments
.end();
125 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p
<< dendl
;
126 mds
->mdcache
->wait_for_uncommitted_fragment(*p
, gather_bld
.new_sub());
129 // nudge scatterlocks
130 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dir
.begin(); !p
.end(); ++p
) {
132 dout(10) << "try_to_expire waiting for dirlock flush on " << *in
<< dendl
;
133 mds
->locker
->scatter_nudge(&in
->filelock
, gather_bld
.new_sub());
135 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dirfragtree
.begin(); !p
.end(); ++p
) {
137 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in
<< dendl
;
138 mds
->locker
->scatter_nudge(&in
->dirfragtreelock
, gather_bld
.new_sub());
140 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_nest
.begin(); !p
.end(); ++p
) {
142 dout(10) << "try_to_expire waiting for nest flush on " << *in
<< dendl
;
143 mds
->locker
->scatter_nudge(&in
->nestlock
, gather_bld
.new_sub());
146 assert(g_conf
->mds_kill_journal_expire_at
!= 2);
148 // open files and snap inodes
149 if (!open_files
.empty()) {
150 assert(!mds
->mdlog
->is_capped()); // hmm FIXME
// ls is the current log segment; inodes that are rejournaled below are
// pushed onto ls->open_files.
152 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
154 elist
<CInode
*>::iterator p
= open_files
.begin(member_offset(CInode
, item_open_file
));
158 if (in
->last
== CEPH_NOSNAP
&& in
->is_auth() &&
159 !in
->is_ambiguous_auth() && in
->is_any_caps()) {
160 if (in
->is_any_caps_wanted()) {
161 dout(20) << "try_to_expire requeueing open file " << *in
<< dendl
;
163 le
= new EOpen(mds
->mdlog
);
164 mds
->mdlog
->start_entry(le
);
166 le
->add_clean_inode(in
);
167 ls
->open_files
.push_back(&in
->item_open_file
);
169 // drop inodes that aren't wanted
170 dout(20) << "try_to_expire not requeueing and delisting unwanted file " << *in
<< dendl
;
171 in
->item_open_file
.remove_myself();
173 } else if (in
->last
!= CEPH_NOSNAP
&& !in
->client_snap_caps
.empty()) {
174 // journal snap inodes that need flush. This simplifies the mds failover handling
175 dout(20) << "try_to_expire requeueing snap needflush inode " << *in
<< dendl
;
177 le
= new EOpen(mds
->mdlog
);
178 mds
->mdlog
->start_entry(le
);
180 le
->add_clean_inode(in
);
181 ls
->open_files
.push_back(&in
->item_open_file
);
184 * we can get a capless inode here if we replay an open file, the client fails to
185 * reconnect it, but does REPLAY an open request (that adds it to the logseg). AFAICS
186 * it's ok for the client to replay an open on a file it doesn't have in it's cache
189 * this makes the mds less sensitive to strict open_file consistency, although it does
190 * make it easier to miss subtle problems.
192 dout(20) << "try_to_expire not requeueing and delisting capless file " << *in
<< dendl
;
193 in
->item_open_file
.remove_myself();
197 mds
->mdlog
->submit_entry(le
);
198 mds
->mdlog
->wait_for_safe(gather_bld
.new_sub());
199 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl
;
203 assert(g_conf
->mds_kill_journal_expire_at
!= 3);
205 // backtraces to be stored/updated
206 for (elist
<CInode
*>::iterator p
= dirty_parent_inodes
.begin(); !p
.end(); ++p
) {
208 assert(in
->is_auth());
209 if (in
->can_auth_pin()) {
210 dout(15) << "try_to_expire waiting for storing backtrace on " << *in
<< dendl
;
211 in
->store_backtrace(gather_bld
.new_sub(), op_prio
);
213 dout(15) << "try_to_expire waiting for unfreeze on " << *in
<< dendl
;
214 in
->add_waiter(CInode::WAIT_UNFREEZE
, gather_bld
.new_sub());
218 assert(g_conf
->mds_kill_journal_expire_at
!= 4);
221 for (elist
<MDSlaveUpdate
*>::iterator p
= slave_updates
.begin(member_offset(MDSlaveUpdate
,
224 MDSlaveUpdate
*su
= *p
;
225 dout(10) << "try_to_expire waiting on slave update " << su
<< dendl
;
226 assert(su
->waiter
== 0);
227 su
->waiter
= gather_bld
.new_sub();
231 if (inotablev
> mds
->inotable
->get_committed_version()) {
232 dout(10) << "try_to_expire saving inotable table, need " << inotablev
233 << ", committed is " << mds
->inotable
->get_committed_version()
234 << " (" << mds
->inotable
->get_committing_version() << ")"
236 mds
->inotable
->save(gather_bld
.new_sub(), inotablev
);
240 if (sessionmapv
> mds
->sessionmap
.get_committed()) {
241 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
242 << ", committed is " << mds
->sessionmap
.get_committed()
243 << " (" << mds
->sessionmap
.get_committing() << ")"
245 mds
->sessionmap
.save(gather_bld
.new_sub(), sessionmapv
);
248 // updates to sessions for completed_requests
249 mds
->sessionmap
.save_if_dirty(touched_sessions
, &gather_bld
);
250 touched_sessions
.clear();
252 // pending commit atids
253 for (map
<int, ceph::unordered_set
<version_t
> >::iterator p
= pending_commit_tids
.begin();
254 p
!= pending_commit_tids
.end();
256 MDSTableClient
*client
= mds
->get_table_client(p
->first
);
258 for (ceph::unordered_set
<version_t
>::iterator q
= p
->second
.begin();
259 q
!= p
->second
.end();
261 dout(10) << "try_to_expire " << get_mdstable_name(p
->first
) << " transaction " << *q
262 << " pending commit (not yet acked), waiting" << dendl
;
263 assert(!client
->has_committed(*q
));
264 client
->wait_for_ack(*q
, gather_bld
.new_sub());
269 for (map
<int, version_t
>::iterator p
= tablev
.begin();
272 MDSTableServer
*server
= mds
->get_table_server(p
->first
);
274 if (p
->second
> server
->get_committed_version()) {
275 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p
->first
)
276 << " to save, need " << p
->second
<< dendl
;
277 server
->save(gather_bld
.new_sub());
282 for (set
<CInode
*>::iterator p
= truncating_inodes
.begin();
283 p
!= truncating_inodes
.end();
285 dout(10) << "try_to_expire waiting for truncate of " << **p
<< dendl
;
286 (*p
)->add_waiter(CInode::WAIT_TRUNC
, gather_bld
.new_sub());
289 if (gather_bld
.has_subs()) {
290 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire waiting" << dendl
;
293 assert(g_conf
->mds_kill_journal_expire_at
!= 5);
294 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire success" << dendl
;
299 // -----------------------
// Construct an empty metablob: every counter and version field named in the
// initializer list starts at zero. The mdlog argument is not referenced by
// the visible initializers (the constructor body is not shown in this view).
302 EMetaBlob::EMetaBlob(MDLog
*mdlog
) : opened_ino(0), renamed_dirino(0),
303 inotablev(0), sessionmapv(0), allocated_ino(0),
304 last_subtree_map(0), event_seq(0)
// Collect the ancestor dentries that must be journaled ahead of dir and add
// them to this blob. The code walks upward (dir = parent->get_dir()) through
// projected parent dentries, placing each one on either the parents list
// (definitely needed) or the maybe list (needed only if the walk ends at an
// ambiguous or non-auth subtree), then calls add_dentry() on every entry of
// the final parents list.
//   dir  - directory fragment whose context is being recorded.
//   mode - TO_AUTH_SUBTREE_ROOT enables the subtree-root check below; the
//          handling of other modes is not visible in this view.
// NOTE(review): this listing is missing lines (braces, loop header, returns);
// the comments describe only the code that is visible.
307 void EMetaBlob::add_dir_context(CDir
*dir
, int mode
)
309 MDSRank
*mds
= dir
->cache
->mds
;
311 list
<CDentry
*> parents
;
313 // it may be okay not to include the maybe items, if
314 // - we journaled the maybe child inode in this segment
315 // - that subtree turns out to be unambiguously auth
316 list
<CDentry
*> maybe
;
317 bool maybenot
= false;
320 // already have this dir? (we must always add in order)
321 if (lump_map
.count(dir
->dirfrag())) {
322 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") have lump " << dir
->dirfrag() << dendl
;
326 // stop at root/stray
327 CInode
*diri
= dir
->get_inode();
328 CDentry
*parent
= diri
->get_projected_parent_dn();
330 if (mode
== TO_AUTH_SUBTREE_ROOT
) {
332 if (dir
->is_subtree_root()) {
333 // match logic in MDCache::create_subtree_map()
334 if (dir
->get_dir_auth().first
== mds
->get_nodeid()) {
335 mds_authority_t parent_auth
= parent
? parent
->authority() : CDIR_AUTH_UNDEF
;
336 if (parent_auth
.first
== dir
->get_dir_auth().first
) {
337 if (parent_auth
.second
== CDIR_AUTH_UNKNOWN
&&
338 !dir
->is_ambiguous_dir_auth() &&
339 !dir
->state_test(CDir::STATE_EXPORTBOUND
) &&
340 !dir
->state_test(CDir::STATE_AUXSUBTREE
) &&
341 !diri
->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
342 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir
<< dendl
;
345 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") ambiguous or transient subtree " << dendl
;
347 // it's an auth subtree, we don't need maybe (if any), and we're done.
348 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached unambig auth subtree, don't need " << maybe
349 << " at " << *dir
<< dendl
;
354 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached ambig or !auth subtree, need " << maybe
355 << " at " << *dir
<< dendl
;
356 // we need the maybe list after all!
357 parents
.splice(parents
.begin(), maybe
);
362 // was the inode journaled in this blob?
363 if (event_seq
&& diri
->last_journaled
== event_seq
) {
364 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri this blob " << *diri
<< dendl
;
368 // have we journaled this inode since the last subtree map?
369 if (!maybenot
&& last_subtree_map
&& diri
->last_journaled
>= last_subtree_map
) {
370 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri in this segment ("
371 << diri
->last_journaled
<< " >= " << last_subtree_map
<< "), setting maybenot flag "
381 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") maybe " << *parent
<< dendl
;
382 maybe
.push_front(parent
);
384 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") definitely " << *parent
<< dendl
;
385 parents
.push_front(parent
);
// step up to the directory that contains the parent dentry
388 dir
= parent
->get_dir();
// whatever is still on the maybe list is prepended to parents
391 parents
.splice(parents
.begin(), maybe
);
393 dout(20) << "EMetaBlob::add_dir_context final: " << parents
<< dendl
;
// every collected dentry must have a primary projected linkage; each is added
// with false as the second add_dentry() argument (presumably the dirty flag)
394 for (list
<CDentry
*>::iterator p
= parents
.begin(); p
!= parents
.end(); ++p
) {
395 assert((*p
)->get_projected_linkage()->is_primary());
396 add_dentry(*p
, false);
// Record this blob's table versions on log segment ls: the visible
// assignments copy inotablev and sessionmapv onto the segment.
// NOTE(review): any conditions guarding the two assignments are on lines
// that are not shown in this view.
400 void EMetaBlob::update_segment(LogSegment
*ls
)
402 // dirty inode mtimes
403 // -> handled directly by Server.cc, replay()
405 // alloc table update?
407 ls
->inotablev
= inotablev
;
409 ls
->sessionmapv
= sessionmapv
;
412 // -> handled directly by Server.cc
415 // note the newest request per client
416 //if (!client_reqs.empty())
417 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
420 // EMetaBlob::fullbit
// Serialize this fullbit into bl, opened with ENCODE_START(8, 5, bl).
// The visible calls write dnfirst, dnlast, inode, xattrs, the symlink target
// (symlinks only), dirfragtree and snapbl (directories only), old_inodes,
// snapbl and oldest_snap.
//   features - forwarded to the inode and old_inodes encoders.
// NOTE(review): some encode calls and branch bodies are on lines not shown.
422 void EMetaBlob::fullbit::encode(bufferlist
& bl
, uint64_t features
) const {
423 ENCODE_START(8, 5, bl
);
425 ::encode(dnfirst
, bl
);
426 ::encode(dnlast
, bl
);
428 ::encode(inode
, bl
, features
);
429 ::encode(xattrs
, bl
);
430 if (inode
.is_symlink())
431 ::encode(symlink
, bl
);
432 if (inode
.is_dir()) {
433 ::encode(dirfragtree
, bl
);
434 ::encode(snapbl
, bl
);
437 if (old_inodes
.empty()) {
441 ::encode(old_inodes
, bl
, features
);
444 ::encode(snapbl
, bl
);
445 ::encode(oldest_snap
, bl
);
// Decode a fullbit from bl, opened with
// DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl).
// For directories, struct_v 2 and 3 carry a dir_layout_exists flag; when it
// is set, a layout version and then inode.layout are read.
// The dirty flag is translated into STATE_DIRTY (or 0) in state, old_inodes
// are read only when old_inodes_present is set, and the visible code assigns
// CEPH_NOSNAP to oldest_snap (presumably as the fallback when the field is
// absent from the encoding).
// NOTE(review): several decode calls and version checks are on lines that
// are not shown in this view.
449 void EMetaBlob::fullbit::decode(bufferlist::iterator
&bl
) {
450 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
452 ::decode(dnfirst
, bl
);
453 ::decode(dnlast
, bl
);
456 ::decode(xattrs
, bl
);
457 if (inode
.is_symlink())
458 ::decode(symlink
, bl
);
459 if (inode
.is_dir()) {
460 ::decode(dirfragtree
, bl
);
461 ::decode(snapbl
, bl
);
462 if ((struct_v
== 2) || (struct_v
== 3)) {
463 bool dir_layout_exists
;
464 ::decode(dir_layout_exists
, bl
);
465 if (dir_layout_exists
) {
467 ::decode(dir_struct_v
, bl
); // default_file_layout version
468 ::decode(inode
.layout
, bl
); // and actual layout, that we care about
477 state
= dirty
? EMetaBlob::fullbit::STATE_DIRTY
: 0;
481 bool old_inodes_present
;
482 ::decode(old_inodes_present
, bl
);
483 if (old_inodes_present
) {
484 ::decode(old_inodes
, bl
);
487 if (!inode
.is_dir()) {
489 ::decode(snapbl
, bl
);
492 ::decode(oldest_snap
, bl
);
494 oldest_snap
= CEPH_NOSNAP
;
// Write this fullbit to Formatter f: dentry name, first/last snapid, dentry
// version, an inode section, an xattrs section, the symlink target (symlinks
// only), directory-only fields (frag tree, has_snapbl, layout policy), the
// state string, and an array of old inodes when there are any.
// NOTE(review): the calls that fill the inode sections are on lines that are
// not shown in this view.
499 void EMetaBlob::fullbit::dump(Formatter
*f
) const
501 f
->dump_string("dentry", dn
);
502 f
->dump_stream("snapid.first") << dnfirst
;
503 f
->dump_stream("snapid.last") << dnlast
;
504 f
->dump_int("dentry version", dnv
);
505 f
->open_object_section("inode");
507 f
->close_section(); // inode
508 f
->open_object_section("xattrs");
// each xattr value is copied into a std::string using its explicit length
509 for (const auto &p
: xattrs
) {
510 std::string
s(p
.second
.c_str(), p
.second
.length());
511 f
->dump_string(p
.first
.c_str(), s
);
513 f
->close_section(); // xattrs
514 if (inode
.is_symlink()) {
515 f
->dump_string("symlink", symlink
);
517 if (inode
.is_dir()) {
518 f
->dump_stream("frag tree") << dirfragtree
;
519 f
->dump_string("has_snapbl", snapbl
.length() ? "true" : "false");
520 if (inode
.has_layout()) {
521 f
->open_object_section("file layout policy");
523 f
->dump_string("layout", "the layout exists");
524 f
->close_section(); // file layout policy
527 f
->dump_string("state", state_string());
528 if (!old_inodes
.empty()) {
529 f
->open_array_section("old inodes");
530 for (const auto &p
: old_inodes
) {
531 f
->open_object_section("inode");
532 f
->dump_int("snapid", p
.first
);
534 f
->close_section(); // inode
536 f
->close_section(); // old inodes
// Append one heap-allocated sample fullbit (dentry name /testdn, built from
// a default inode, empty xattrs and an empty snap blob) to ls.
// The raw pointer is handed to the list; presumably the caller is
// responsible for freeing it - TODO confirm.
540 void EMetaBlob::fullbit::generate_test_instances(list
<EMetaBlob::fullbit
*>& ls
)
542 CInode::mempool_inode inode
;
544 CInode::mempool_xattr_map empty_xattrs
;
545 bufferlist empty_snapbl
;
546 fullbit
*sample
= new fullbit("/testdn", 0, 0, 0,
547 inode
, fragtree
, empty_xattrs
, "", 0, empty_snapbl
,
549 ls
.push_back(sample
);
// Apply the state recorded in this fullbit to the in-memory inode in:
// dirfragtree (directories), symlink target (symlinks), old_inodes,
// oldest_snap and the snap blob, followed by a layout sanity check.
//   mds - rank used for the dirfrag trim check and for cluster-log errors.
//   in  - inode being updated.
// NOTE(review): this listing is missing lines (for example the statements
// between the signature and maybe_export_pin(), and the call that precedes
// ceph_abort()); the comments describe only the code that is visible.
552 void EMetaBlob::fullbit::update_inode(MDSRank
*mds
, CInode
*in
)
556 in
->maybe_export_pin();
557 if (in
->inode
.is_dir()) {
558 if (!(in
->dirfragtree
== dirfragtree
)) {
559 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in
->dirfragtree
<< " -> "
560 << dirfragtree
<< " on " << *in
<< dendl
;
561 in
->dirfragtree
= dirfragtree
;
562 in
->force_dirfrags();
// when the inode has dirfrags and its authority is CDIR_AUTH_UNDEF, empty
// dirfrags that the cache allows to be trimmed are closed
563 if (in
->has_dirfrags() && in
->authority() == CDIR_AUTH_UNDEF
) {
565 in
->get_nested_dirfrags(ls
);
566 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
568 if (dir
->get_num_any() == 0 &&
569 mds
->mdcache
->can_trim_non_auth_dirfrag(dir
)) {
570 dout(10) << " closing empty non-auth dirfrag " << *dir
<< dendl
;
571 in
->close_dirfrag(dir
->get_frag());
576 } else if (in
->inode
.is_symlink()) {
577 in
->symlink
= mempool::mds_co::string(boost::string_view(symlink
));
579 in
->old_inodes
= old_inodes
;
// raise in->first to one past the newest old_inode snapid when it is lower
580 if (!in
->old_inodes
.empty()) {
581 snapid_t min_first
= in
->old_inodes
.rbegin()->first
+ 1;
582 if (min_first
> in
->first
)
583 in
->first
= min_first
;
587 * we can do this before linking hte inode bc the split_at would
588 * be a no-op.. we have no children (namely open snaprealms) to
591 in
->oldest_snap
= oldest_snap
;
592 in
->decode_snap_blob(snapbl
);
595 * In case there was anything malformed in the journal that we are
596 * replaying, do sanity checks on the inodes we're replaying and
597 * go damaged instead of letting any trash into a live cache
600 // Files must have valid layouts with a pool set
601 if (in
->inode
.layout
.pool_id
== -1 || !in
->inode
.layout
.is_valid()) {
602 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
603 << ": " << in
->inode
.layout
<< dendl
;
604 std::ostringstream oss
;
605 oss
<< "Invalid layout for inode 0x" << std::hex
<< in
->inode
.ino
606 << std::dec
<< " in journal";
607 mds
->clog
->error() << oss
.str();
609 ceph_abort(); // Should be unreachable because damaged() calls respawn()
614 // EMetaBlob::remotebit
// Serialize this remotebit into bl, opened with ENCODE_START(2, 2, bl).
// The visible calls write dnfirst, dnlast and d_type.
// NOTE(review): further encode calls may sit on lines that are not shown.
616 void EMetaBlob::remotebit::encode(bufferlist
& bl
) const
618 ENCODE_START(2, 2, bl
);
620 ::encode(dnfirst
, bl
);
621 ::encode(dnlast
, bl
);
624 ::encode(d_type
, bl
);
// Decode a remotebit from bl, opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl).
// The visible calls read dnfirst, dnlast and d_type.
// NOTE(review): further decode calls may sit on lines that are not shown.
629 void EMetaBlob::remotebit::decode(bufferlist::iterator
&bl
)
631 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
633 ::decode(dnfirst
, bl
);
634 ::decode(dnlast
, bl
);
637 ::decode(d_type
, bl
);
// Write this remotebit to Formatter f: dentry name, first/last snapid,
// dentry version, target inode number, a textual d_type and the dirty flag.
642 void EMetaBlob::remotebit::dump(Formatter
*f
) const
644 f
->dump_string("dentry", dn
);
645 f
->dump_int("snapid.first", dnfirst
);
646 f
->dump_int("snapid.last", dnlast
);
647 f
->dump_int("dentry version", dnv
);
648 f
->dump_int("inodeno", ino
);
649 uint32_t type
= DTTOIF(d_type
) & S_IFMT
; // convert to type entries
// type_string receives a short label for the file type (presumably chosen by
// a switch on type whose case labels are not shown); the assert below fires
// for an unrecognised value
653 type_string
= "file"; break;
655 type_string
= "symlink"; break;
657 type_string
= "directory"; break;
659 type_string
= "fifo"; break;
661 type_string
= "chr"; break;
663 type_string
= "blk"; break;
665 type_string
= "sock"; break;
667 assert (0 == "unknown d_type!");
669 f
->dump_string("d_type", type_string
);
670 f
->dump_string("dirty", dirty
? "true" : "false");
// Append one heap-allocated sample remotebit (dentry name /test/dn, regular
// file d_type, not dirty) to ls. The raw pointer is handed to the list;
// presumably the caller frees it - TODO confirm.
673 void EMetaBlob::remotebit::
674 generate_test_instances(list
<EMetaBlob::remotebit
*>& ls
)
676 remotebit
*remote
= new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG
), false);
677 ls
.push_back(remote
);
680 // EMetaBlob::nullbit
// Serialize this nullbit into bl, opened with ENCODE_START(2, 2, bl).
// The visible calls write dnfirst and dnlast.
// NOTE(review): further encode calls may sit on lines that are not shown.
682 void EMetaBlob::nullbit::encode(bufferlist
& bl
) const
684 ENCODE_START(2, 2, bl
);
686 ::encode(dnfirst
, bl
);
687 ::encode(dnlast
, bl
);
// Decode a nullbit from bl, opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl).
// The visible calls read dnfirst and dnlast.
// NOTE(review): further decode calls may sit on lines that are not shown.
693 void EMetaBlob::nullbit::decode(bufferlist::iterator
&bl
)
695 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
697 ::decode(dnfirst
, bl
);
698 ::decode(dnlast
, bl
);
// Write this nullbit to Formatter f: dentry name, first/last snapid, dentry
// version, and the dirty flag rendered as the string true or false.
704 void EMetaBlob::nullbit::dump(Formatter
*f
) const
706 f
->dump_string("dentry", dn
);
707 f
->dump_int("snapid.first", dnfirst
);
708 f
->dump_int("snapid.last", dnlast
);
709 f
->dump_int("dentry version", dnv
);
710 f
->dump_string("dirty", dirty
? "true" : "false");
// Append two heap-allocated sample nullbits to ls: one constructed with a
// final argument of false and one with true (which dump() reports as the
// dirty flag). The raw pointers are handed to the list; presumably the
// caller frees them - TODO confirm.
713 void EMetaBlob::nullbit::generate_test_instances(list
<nullbit
*>& ls
)
715 nullbit
*sample
= new nullbit("/test/dentry", 0, 10, 15, false);
716 nullbit
*sample2
= new nullbit("/test/dirty", 10, 20, 25, true);
717 ls
.push_back(sample
);
718 ls
.push_back(sample2
);
721 // EMetaBlob::dirlump
// Serialize this dirlump into bl, opened with ENCODE_START(2, 2, bl).
// The visible code writes nremote and calls _encode_bits(features).
//   features - forwarded to _encode_bits().
// NOTE(review): further encode calls may sit on lines that are not shown.
723 void EMetaBlob::dirlump::encode(bufferlist
& bl
, uint64_t features
) const
725 ENCODE_START(2, 2, bl
);
729 ::encode(nremote
, bl
);
731 _encode_bits(features
);
// Decode a dirlump from bl, opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl).
// The visible code reads nremote and clears dn_decoded so that the dentry
// bits are decoded only when they are needed.
// NOTE(review): further decode calls may sit on lines that are not shown.
736 void EMetaBlob::dirlump::decode(bufferlist::iterator
&bl
)
738 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
)
742 ::decode(nremote
, bl
);
745 dn_decoded
= false; // don't decode bits unless we need them.
// Write this dirlump to Formatter f: an fnode section, the state string, the
// nfull/nremote/nnull counters, and one array each for the full, remote and
// null bits.
// NOTE(review): the calls that fill each per-bit section are on lines that
// are not shown in this view.
749 void EMetaBlob::dirlump::dump(Formatter
*f
) const
// dump() is const, so a non-const alias is obtained through const_cast
// (presumably so that the dentry bits can be decoded on demand - TODO confirm)
752 dirlump
*me
= const_cast<dirlump
*>(this);
755 f
->open_object_section("fnode");
757 f
->close_section(); // fnode
758 f
->dump_string("state", state_string());
759 f
->dump_int("nfull", nfull
);
760 f
->dump_int("nremote", nremote
);
761 f
->dump_int("nnull", nnull
);
763 f
->open_array_section("full bits");
764 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
765 iter
= dfull
.begin(); iter
!= dfull
.end(); ++iter
) {
766 f
->open_object_section("fullbit");
768 f
->close_section(); // fullbit
770 f
->close_section(); // full bits
771 f
->open_array_section("remote bits");
772 for (list
<remotebit
>::const_iterator
773 iter
= dremote
.begin(); iter
!= dremote
.end(); ++iter
) {
774 f
->open_object_section("remotebit");
776 f
->close_section(); // remotebit
778 f
->close_section(); // remote bits
779 f
->open_array_section("null bits");
780 for (list
<nullbit
>::const_iterator
781 iter
= dnull
.begin(); iter
!= dnull
.end(); ++iter
) {
782 f
->open_object_section("null bit");
784 f
->close_section(); // null bit
786 f
->close_section(); // null bits
// Append one default-constructed, heap-allocated dirlump to ls. The raw
// pointer is handed to the list; presumably the caller frees it - TODO
// confirm.
789 void EMetaBlob::dirlump::generate_test_instances(list
<dirlump
*>& ls
)
791 ls
.push_back(new dirlump());
// Serialize the whole metablob into bl, opened with ENCODE_START(8, 5, bl).
// Fields are written in the fixed order shown below.
//   features - forwarded to the lump_map and roots encoders.
// NOTE(review): the lines between the v6 comment and the client_flushes
// encode are not shown in this view.
797 void EMetaBlob::encode(bufferlist
& bl
, uint64_t features
) const
799 ENCODE_START(8, 5, bl
);
800 ::encode(lump_order
, bl
);
801 ::encode(lump_map
, bl
, features
);
802 ::encode(roots
, bl
, features
);
803 ::encode(table_tids
, bl
);
804 ::encode(opened_ino
, bl
);
805 ::encode(allocated_ino
, bl
);
806 ::encode(used_preallocated_ino
, bl
);
807 ::encode(preallocated_inos
, bl
);
808 ::encode(client_name
, bl
);
809 ::encode(inotablev
, bl
);
810 ::encode(sessionmapv
, bl
);
811 ::encode(truncate_start
, bl
);
812 ::encode(truncate_finish
, bl
);
813 ::encode(destroyed_inodes
, bl
);
814 ::encode(client_reqs
, bl
);
815 ::encode(renamed_dirino
, bl
);
816 ::encode(renamed_dir_frags
, bl
);
818 // make MDSRank use v6 format happy
824 ::encode(client_flushes
, bl
);
// Decode a metablob from bl, opened with
// DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl).
// One visible path reads a rootbl buffer and, when it is non-empty, decodes a
// single fullbit from it and appends that to roots. Another builds
// client_reqs entries from r.front() with 0 as the second member (presumably
// the legacy form that carried request ids only - TODO confirm).
// NOTE(review): the struct_v checks that select between these paths are on
// lines that are not shown in this view.
827 void EMetaBlob::decode(bufferlist::iterator
&bl
)
829 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
830 ::decode(lump_order
, bl
);
831 ::decode(lump_map
, bl
);
836 ::decode(rootbl
, bl
);
837 if (rootbl
.length()) {
838 bufferlist::iterator p
= rootbl
.begin();
839 roots
.push_back(ceph::shared_ptr
<fullbit
>(new fullbit(p
)));
842 ::decode(table_tids
, bl
);
843 ::decode(opened_ino
, bl
);
844 ::decode(allocated_ino
, bl
);
845 ::decode(used_preallocated_ino
, bl
);
846 ::decode(preallocated_inos
, bl
);
847 ::decode(client_name
, bl
);
848 ::decode(inotablev
, bl
);
849 ::decode(sessionmapv
, bl
);
850 ::decode(truncate_start
, bl
);
851 ::decode(truncate_finish
, bl
);
852 ::decode(destroyed_inodes
, bl
);
854 ::decode(client_reqs
, bl
);
859 client_reqs
.push_back(pair
<metareqid_t
,uint64_t>(r
.front(), 0));
864 ::decode(renamed_dirino
, bl
);
865 ::decode(renamed_dir_frags
, bl
);
875 ::decode(client_flushes
, bl
);
882 * Get all inodes touched by this metablob. Includes the 'bits' within
883 * dirlumps, and the inodes of the dirs themselves.
// Insert into inodes the inode number of every dirfrag key in lump_map, plus
// the inode numbers carried by the fullbits and remotebits of each dirlump.
//   inodes - output set; entries are added and nothing is removed.
// NOTE(review): the statement that follows the Decode dirlump bits comment
// is on a line that is not shown in this view.
885 void EMetaBlob::get_inodes(
886 std::set
<inodeno_t
> &inodes
) const
888 // For all dirlumps in this metablob
889 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
890 // Record inode of dirlump
891 inodeno_t
const dir_ino
= i
->first
.ino
;
892 inodes
.insert(dir_ino
);
894 // Decode dirlump bits
895 dirlump
const &dl
= i
->second
;
898 // Record inodes of fullbits
899 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
900 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
901 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
902 inodes
.insert((*iter
)->inode
.ino
);
905 // Record inodes of remotebits
906 list
<remotebit
> const &rb_list
= dl
.get_dremote();
907 for (list
<remotebit
>::const_iterator
908 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
909 inodes
.insert(iter
->ino
);
916 * Get a map of dirfrag to set of dentries in that dirfrag which are
917 * touched in this operation.
// For every dirfrag in lump_map, insert the dentry names of its full, null
// and remote bits into dentries[dirfrag].
//   dentries - output map; operator[] creates the per-dirfrag set on first
//              use, so a dirfrag whose lump has no bits gets no entry.
919 void EMetaBlob::get_dentries(std::map
<dirfrag_t
, std::set
<std::string
> > &dentries
) const
921 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
922 dirlump
const &dl
= i
->second
;
923 dirfrag_t
const &df
= i
->first
;
927 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
928 list
<nullbit
> const &nb_list
= dl
.get_dnull();
929 list
<remotebit
> const &rb_list
= dl
.get_dremote();
931 // For all bits, store dentry
932 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
933 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
934 dentries
[df
].insert((*iter
)->dn
);
937 for (list
<nullbit
>::const_iterator
938 iter
= nb_list
.begin(); iter
!= nb_list
.end(); ++iter
) {
939 dentries
[df
].insert(iter
->dn
);
941 for (list
<remotebit
>::const_iterator
942 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
943 dentries
[df
].insert(iter
->dn
);
951 * Calculate all paths that we can infer are touched by this metablob. Only uses
952 * information local to this metablob so it may only be the path within the
// Append to paths every path that can be inferred from this metablob alone.
// A first pass over lump_map records, per directory inode, the dentry names
// seen under it (children) and, per fullbit inode, its parent inode and
// dentry name (ino_locations). A second pass collects leaf locations: every
// null and remote dentry, and every full dentry whose inode has no entry in
// children. Each leaf is then expanded by prefixing ancestor dentry names
// found in ino_locations.
//   paths - output vector; "/" is pushed when there are roots but no lumps.
// NOTE(review): this listing is missing lines (braces, the statement that
// advances ino inside the while loop, any early return); the comments
// describe only the code that is visible.
955 void EMetaBlob::get_paths(
956 std::vector
<std::string
> &paths
) const
958 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
959 typedef std::pair
<inodeno_t
, std::string
> Location
;
961 // Whenever we see a dentry within a dirlump, we remember it as a child of
962 // the dirlump's inode
963 std::map
<inodeno_t
, std::list
<std::string
> > children
;
965 // Whenever we see a location for an inode, remember it: this allows us to
966 // build a path given an inode
967 std::map
<inodeno_t
, Location
> ino_locations
;
969 // Special case: operations on root inode populate roots but not dirlumps
970 if (lump_map
.empty() && !roots
.empty()) {
971 paths
.push_back("/");
977 // Build a tiny local metadata cache for the path structure in this metablob
978 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
979 inodeno_t
const dir_ino
= i
->first
.ino
;
980 dirlump
const &dl
= i
->second
;
983 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
984 list
<nullbit
> const &nb_list
= dl
.get_dnull();
985 list
<remotebit
> const &rb_list
= dl
.get_dremote();
987 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
988 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
989 boost::string_view dentry
= (*iter
)->dn
;
990 children
[dir_ino
].emplace_back(dentry
);
991 ino_locations
[(*iter
)->inode
.ino
] = Location(dir_ino
, std::string(dentry
));
994 for (list
<nullbit
>::const_iterator
995 iter
= nb_list
.begin(); iter
!= nb_list
.end(); ++iter
) {
996 boost::string_view dentry
= iter
->dn
;
997 children
[dir_ino
].emplace_back(dentry
);
1000 for (list
<remotebit
>::const_iterator
1001 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
1002 boost::string_view dentry
= iter
->dn
;
1003 children
[dir_ino
].emplace_back(dentry
);
1007 std::vector
<Location
> leaf_locations
;
1011 // Output paths for all childless nodes in the metablob
1012 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
1013 inodeno_t
const dir_ino
= i
->first
.ino
;
1014 dirlump
const &dl
= i
->second
;
// NOTE(review): the fullbit loop below pushes to children and assigns
// ino_locations again, although the first pass already recorded the same
// entries; children therefore holds each full dentry name twice.
1017 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
1018 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
1019 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
1020 std::string
dentry((*iter
)->dn
);
1021 children
[dir_ino
].push_back(dentry
);
1022 ino_locations
[(*iter
)->inode
.ino
] = Location(dir_ino
, std::string(dentry
));
1023 if (children
.find((*iter
)->inode
.ino
) == children
.end()) {
1024 leaf_locations
.push_back(Location(dir_ino
, std::string(dentry
)));
1029 list
<nullbit
> const &nb_list
= dl
.get_dnull();
1030 for (list
<nullbit
>::const_iterator
1031 iter
= nb_list
.begin(); iter
!= nb_list
.end(); ++iter
) {
1032 boost::string_view dentry
= iter
->dn
;
1033 leaf_locations
.push_back(Location(dir_ino
, std::string(dentry
)));
1036 list
<remotebit
> const &rb_list
= dl
.get_dremote();
1037 for (list
<remotebit
>::const_iterator
1038 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
1039 boost::string_view dentry
= iter
->dn
;
1040 leaf_locations
.push_back(Location(dir_ino
, std::string(dentry
)));
1044 // For all the leaf locations identified, generate paths
1045 for (std::vector
<Location
>::iterator i
= leaf_locations
.begin(); i
!= leaf_locations
.end(); ++i
) {
1046 Location
const &loc
= *i
;
1047 std::string path
= loc
.second
;
1048 inodeno_t ino
= loc
.first
;
// while the current inode has a recorded location, prefix its dentry name;
// the inner loc shadows the outer one
1049 while(ino_locations
.find(ino
) != ino_locations
.end()) {
1050 Location
const &loc
= ino_locations
[ino
];
1051 if (!path
.empty()) {
1052 path
= loc
.second
+ "/" + path
;
1054 path
= loc
.second
+ path
;
1059 paths
.push_back(path
);
// Write this metablob's fields to the Formatter `f`: the dirlumps (walked in
// lump_order), the roots, table-client transactions, renamed-directory info,
// inotable/SessionMap versions, allocated and preallocated inos, the client
// name, truncate start/finish lists, destroyed inodes and client requests.
1064 void EMetaBlob::dump(Formatter
*f
) const
// One "lump" object per dirfrag: the dirfrag id followed by the dirlump's
// own dump (looked up with lump_map.at()).
1066 f
->open_array_section("lumps");
1067 for (list
<dirfrag_t
>::const_iterator i
= lump_order
.begin();
1068 i
!= lump_order
.end(); ++i
) {
1069 f
->open_object_section("lump");
1070 f
->open_object_section("dirfrag");
1071 f
->dump_stream("dirfrag") << *i
;
1072 f
->close_section(); // dirfrag
1073 f
->open_object_section("dirlump");
1074 lump_map
.at(*i
).dump(f
);
1075 f
->close_section(); // dirlump
1076 f
->close_section(); // lump
1078 f
->close_section(); // lumps
// One "root" object per entry in roots.
1080 f
->open_array_section("roots");
1081 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator i
= roots
.begin();
1082 i
!= roots
.end(); ++i
) {
1083 f
->open_object_section("root");
1085 f
->close_section(); // root
1087 f
->close_section(); // roots
// NOTE(review): the section name below is spelled "tranactions" while the
// closing comment says "transactions" -- confirm whether any consumer of the
// dump depends on the current key before correcting it.
1089 f
->open_array_section("tableclient tranactions");
1090 for (list
<pair
<__u8
,version_t
> >::const_iterator i
= table_tids
.begin();
1091 i
!= table_tids
.end(); ++i
) {
1092 f
->open_object_section("transaction");
1093 f
->dump_int("tid", i
->first
);
1094 f
->dump_int("version", i
->second
);
1095 f
->close_section(); // transaction
1097 f
->close_section(); // tableclient transactions
// Renamed directory: its inode number and the list of its fragments.
1099 f
->dump_int("renamed directory inodeno", renamed_dirino
);
1101 f
->open_array_section("renamed directory fragments");
1102 for (list
<frag_t
>::const_iterator i
= renamed_dir_frags
.begin();
1103 i
!= renamed_dir_frags
.end(); ++i
) {
1104 f
->dump_int("frag", *i
);
1106 f
->close_section(); // renamed directory fragments
// Table versions and inode-number allocation state.
1108 f
->dump_int("inotable version", inotablev
);
1109 f
->dump_int("SessionMap version", sessionmapv
);
1110 f
->dump_int("allocated ino", allocated_ino
);
1112 f
->dump_stream("preallocated inos") << preallocated_inos
;
1113 f
->dump_int("used preallocated ino", used_preallocated_ino
);
1115 f
->open_object_section("client name");
1116 client_name
.dump(f
);
1117 f
->close_section(); // client name
// Inodes whose truncate starts in this blob.
1119 f
->open_array_section("inodes starting a truncate");
1120 for(list
<inodeno_t
>::const_iterator i
= truncate_start
.begin();
1121 i
!= truncate_start
.end(); ++i
) {
1122 f
->dump_int("inodeno", *i
);
1124 f
->close_section(); // truncate inodes
// Inodes whose truncate finishes here; the mapped value is dumped as
// "truncate starting segment".
1125 f
->open_array_section("inodes finishing a truncated");
1126 for(map
<inodeno_t
,uint64_t>::const_iterator i
= truncate_finish
.begin();
1127 i
!= truncate_finish
.end(); ++i
) {
1128 f
->open_object_section("inode+segment");
1129 f
->dump_int("inodeno", i
->first
);
1130 f
->dump_int("truncate starting segment", i
->second
);
1131 f
->close_section(); // truncated inode
1133 f
->close_section(); // truncate finish inodes
1135 f
->open_array_section("destroyed inodes");
1136 for(vector
<inodeno_t
>::const_iterator i
= destroyed_inodes
.begin();
1137 i
!= destroyed_inodes
.end(); ++i
) {
1138 f
->dump_int("inodeno", *i
);
1140 f
->close_section(); // destroyed inodes
// Client requests: the request id and the value paired with it, dumped as
// "oldest request on client".
1142 f
->open_array_section("client requests");
1143 for(list
<pair
<metareqid_t
,uint64_t> >::const_iterator i
= client_reqs
.begin();
1144 i
!= client_reqs
.end(); ++i
) {
1145 f
->open_object_section("Client request");
1146 f
->dump_stream("request ID") << i
->first
;
1147 f
->dump_int("oldest request on client", i
->second
);
1148 f
->close_section(); // request
1150 f
->close_section(); // client requests
// Append one default-constructed EMetaBlob to `ls`. The object is allocated
// with new; presumably the caller takes ownership -- confirm against callers.
1153 void EMetaBlob::generate_test_instances(list
<EMetaBlob
*>& ls
)
1155 ls
.push_back(new EMetaBlob());
// Replay this metablob into the MDS cache (mds->mdcache).
//
// In order: roots; each dirlump in lump_order (dirfrag state, then its full,
// remote and null dentries); rename subtree fix-ups; table-client
// transactions; the opened inode; inotable and sessionmap version
// reconciliation; truncate start/finish bookkeeping; destroyed inodes;
// completed client requests and flushes; and finally update_segment(logseg).
//
//   mds     - rank whose cache, tables, sessionmap and log are updated
//   logseg  - log segment that dirty items are attached to
//   slaveup - tested for null before use in the unlink/olddir paths and
//             asserted non-null in the auth-to-non-auth rename case
1158 void EMetaBlob::replay(MDSRank
*mds
, LogSegment
*logseg
, MDSlaveUpdate
*slaveup
)
1160 dout(10) << "EMetaBlob.replay " << lump_map
.size() << " dirlumps by " << client_name
<< dendl
;
// NOTE(review): mds_kill_journal_replay_at looks like a fault-injection
// setting -- each numbered assert in this function fires when the option
// equals that stage number. Confirm against the option's documentation.
1164 assert(g_conf
->mds_kill_journal_replay_at
!= 1);
// Roots: look the inode up by number, update it from the journaled fullbit,
// add it to the cache and mark it dirty if the fullbit is dirty. `isnew`
// records whether the inode was missing from the cache.
1166 for (list
<ceph::shared_ptr
<fullbit
> >::iterator p
= roots
.begin(); p
!= roots
.end(); ++p
) {
1167 CInode
*in
= mds
->mdcache
->get_inode((*p
)->inode
.ino
);
1168 bool isnew
= in
? false:true;
1170 in
= new CInode(mds
->mdcache
, false);
1171 (*p
)->update_inode(mds
, in
);
1174 mds
->mdcache
->add_inode(in
);
1175 if ((*p
)->is_dirty()) in
->_mark_dirty(logseg
);
1176 dout(10) << "EMetaBlob.replay " << (isnew
? " added root ":" updated root ") << *in
<< dendl
;
// If the blob records a renamed directory, try to find its inode in the
// cache now; it is looked up again later if it is not present yet.
1179 CInode
*renamed_diri
= 0;
1181 if (renamed_dirino
) {
1182 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1184 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri
<< dendl
;
1186 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino
<< dendl
;
// Pre-scan of the lumps that accumulates lump.nnull into nnull.
1189 for (list
<dirfrag_t
>::iterator lp
= lump_order
.begin(); lp
!= lump_order
.end(); ++lp
) {
1190 dirlump
&lump
= lump_map
[*lp
];
1192 dout(10) << "EMetaBlob.replay found null dentry in dir " << *lp
<< dendl
;
1193 nnull
+= lump
.nnull
;
1199 // keep track of any inodes we unlink and don't relink elsewhere
1200 map
<CInode
*, CDir
*> unlinked
;
1201 set
<CInode
*> linked
;
1203 // walk through my dirs (in order!)
1204 for (list
<dirfrag_t
>::iterator lp
= lump_order
.begin();
1205 lp
!= lump_order
.end();
1207 dout(10) << "EMetaBlob.replay dir " << *lp
<< dendl
;
1208 dirlump
&lump
= lump_map
[*lp
];
// Look the dirfrag up in the cache.
1211 CDir
*dir
= mds
->mdcache
->get_force_dirfrag(*lp
, true);
1213 // hmm. do i have the inode?
1214 CInode
*diri
= mds
->mdcache
->get_inode((*lp
).ino
);
// An mdsdir inode is created locally as a non-auth system inode; it must
// not be this rank's own mdsdir (asserted).
1216 if (MDS_INO_IS_MDSDIR(lp
->ino
)) {
1217 assert(MDS_INO_MDSDIR(mds
->get_nodeid()) != lp
->ino
);
1218 diri
= mds
->mdcache
->create_system_inode(lp
->ino
, S_IFDIR
|0755);
1219 diri
->state_clear(CInode::STATE_AUTH
);
1220 dout(10) << "EMetaBlob.replay created base " << *diri
<< dendl
;
// A missing directory inode is logged and reported to the cluster log as a
// journal replay failure.
1222 dout(0) << "EMetaBlob.replay missing dir ino " << (*lp
).ino
<< dendl
;
1223 mds
->clog
->error() << "failure replaying journal (EMetaBlob)";
1225 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1229 // create the dirfrag
1230 dir
= diri
->get_or_open_dirfrag(mds
->mdcache
, (*lp
).frag
);
// Dirfrags of base inodes get their subtree auth set to CDIR_AUTH_UNDEF.
1232 if (MDS_INO_IS_BASE(lp
->ino
))
1233 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1235 dout(10) << "EMetaBlob.replay added dir " << *dir
<< dendl
;
// Adopt the journaled fnode and its version for this dirfrag.
1237 dir
->set_version( lump
.fnode
.version
);
1238 dir
->fnode
= lump
.fnode
;
// An importing lump marks the dirfrag auth and clears its COMPLETE state.
1240 if (lump
.is_importing()) {
1241 dir
->state_set(CDir::STATE_AUTH
);
1242 dir
->state_clear(CDir::STATE_COMPLETE
);
// A dirty lump marks the dirfrag dirty. When rstat or fragstat differs from
// its accounted counterpart, the matching scatterlock (nestlock / filelock)
// is marked updated and the inode is queued on the segment's dirty list.
1244 if (lump
.is_dirty()) {
1245 dir
->_mark_dirty(logseg
);
1247 if (!(dir
->fnode
.rstat
== dir
->fnode
.accounted_rstat
)) {
1248 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir
<< dendl
;
1249 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->nestlock
);
1250 logseg
->dirty_dirfrag_nest
.push_back(&dir
->inode
->item_dirty_dirfrag_nest
);
1252 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir
<< dendl
;
1254 if (!(dir
->fnode
.fragstat
== dir
->fnode
.accounted_fragstat
)) {
1255 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir
<< dendl
;
1256 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->filelock
);
1257 logseg
->dirty_dirfrag_dir
.push_back(&dir
->inode
->item_dirty_dirfrag_dir
);
1259 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir
<< dendl
;
// Dirty dirfragtree: flag the dirfrag, mark dirfragtreelock updated and
// queue the inode on the segment's dirfragtree dirty list.
1262 if (lump
.is_dirty_dft()) {
1263 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir
<< dendl
;
1264 dir
->state_set(CDir::STATE_DIRTYDFT
);
1265 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->dirfragtreelock
);
1266 logseg
->dirty_dirfrag_dirfragtree
.push_back(&dir
->inode
->item_dirty_dirfrag_dirfragtree
);
1269 dir
->mark_new(logseg
);
1270 if (lump
.is_complete())
1271 dir
->mark_complete();
1273 dout(10) << "EMetaBlob.replay updated dir " << *dir
<< dendl
;
// Decode the lump's dentry bits before walking the three dentry lists.
1276 lump
._decode_bits();
1278 // full dentry+inode pairs
1279 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator pp
= lump
.get_dfull().begin();
1280 pp
!= lump
.get_dfull().end();
1282 ceph::shared_ptr
<fullbit
> p
= *pp
;
// Look the dentry up by name and snap (dnlast); a null dentry is added for
// [dnfirst, dnlast] when needed, otherwise the existing dentry's version,
// dirty state and `first` are refreshed and its `last` must match.
1283 CDentry
*dn
= dir
->lookup_exact_snap(p
->dn
, p
->dnlast
);
1285 dn
= dir
->add_null_dentry(p
->dn
, p
->dnfirst
, p
->dnlast
);
1286 dn
->set_version(p
->dnv
);
1287 if (p
->is_dirty()) dn
->_mark_dirty(logseg
);
1288 dout(10) << "EMetaBlob.replay added (full) " << *dn
<< dendl
;
1290 dn
->set_version(p
->dnv
);
1291 if (p
->is_dirty()) dn
->_mark_dirty(logseg
);
1292 dout(10) << "EMetaBlob.replay for [" << p
->dnfirst
<< "," << p
->dnlast
<< "] had " << *dn
<< dendl
;
1293 dn
->first
= p
->dnfirst
;
1294 assert(dn
->last
== p
->dnlast
);
1296 if (lump
.is_importing())
1297 dn
->state_set(CDentry::STATE_AUTH
);
// Look the inode up by (ino, dnlast). A new CInode is built from the fullbit
// and added to the cache; any inode already linked at this dentry is
// unlinked first, and a displaced primary inode is remembered in `unlinked`.
1299 CInode
*in
= mds
->mdcache
->get_inode(p
->inode
.ino
, p
->dnlast
);
1301 in
= new CInode(mds
->mdcache
, dn
->is_auth(), p
->dnfirst
, p
->dnlast
);
1302 p
->update_inode(mds
, in
);
1303 mds
->mdcache
->add_inode(in
);
1304 if (!dn
->get_linkage()->is_null()) {
1305 if (dn
->get_linkage()->is_primary()) {
1306 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1308 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1309 << " " << *dn
->get_linkage()->get_inode() << " should be " << p
->inode
.ino
;
1310 dout(0) << ss
.str() << dendl
;
1311 mds
->clog
->warn(ss
);
1313 dir
->unlink_inode(dn
, false);
1315 if (unlinked
.count(in
))
1317 dir
->link_primary_inode(dn
, in
);
1318 dout(10) << "EMetaBlob.replay added " << *in
<< dendl
;
// Existing inode: refresh it from the fullbit. If it is linked under a
// different dentry it is unlinked from its old parent (recorded in
// `unlinked`) before being linked here.
1320 in
->first
= p
->dnfirst
;
1321 p
->update_inode(mds
, in
);
1322 if (dn
->get_linkage()->get_inode() != in
&& in
->get_parent_dn()) {
1323 dout(10) << "EMetaBlob.replay unlinking " << *in
<< dendl
;
1324 unlinked
[in
] = in
->get_parent_dir();
1325 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
1327 if (dn
->get_linkage()->get_inode() != in
) {
1328 if (!dn
->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1329 if (dn
->get_linkage()->is_primary()) {
1330 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1332 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1333 << " " << *dn
->get_linkage()->get_inode() << " should be " << p
->inode
.ino
;
1334 dout(0) << ss
.str() << dendl
;
1335 mds
->clog
->warn(ss
);
1337 dir
->unlink_inode(dn
, false);
1339 if (unlinked
.count(in
))
1341 dir
->link_primary_inode(dn
, in
);
1342 dout(10) << "EMetaBlob.replay linked " << *in
<< dendl
;
1344 dout(10) << "EMetaBlob.replay for [" << p
->dnfirst
<< "," << p
->dnlast
<< "] had " << *in
<< dendl
;
// The inode's `first` must equal dnfirst, or be later for a multiversion
// inode.
1346 assert(in
->first
== p
->dnfirst
||
1347 (in
->is_multiversion() && in
->first
> p
->dnfirst
));
// Carry the fullbit's dirty / dirty-parent / snapflush state onto the inode
// and the segment, then set or clear the inode's AUTH state.
1350 in
->_mark_dirty(logseg
);
1351 if (p
->is_dirty_parent())
1352 in
->_mark_dirty_parent(logseg
, p
->is_dirty_pool());
1353 if (p
->need_snapflush())
1354 logseg
->open_files
.push_back(&in
->item_open_file
);
1356 in
->state_set(CInode::STATE_AUTH
);
1358 in
->state_clear(CInode::STATE_AUTH
);
1359 assert(g_conf
->mds_kill_journal_replay_at
!= 2);
// Remote dentries: add a remote dentry, or relink an existing dentry as
// remote after unlinking whatever it pointed at (a displaced primary inode
// is remembered in `unlinked`).
1363 for (list
<remotebit
>::const_iterator p
= lump
.get_dremote().begin();
1364 p
!= lump
.get_dremote().end();
1366 CDentry
*dn
= dir
->lookup_exact_snap(p
->dn
, p
->dnlast
);
1368 dn
= dir
->add_remote_dentry(p
->dn
, p
->ino
, p
->d_type
, p
->dnfirst
, p
->dnlast
);
1369 dn
->set_version(p
->dnv
);
1370 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1371 dout(10) << "EMetaBlob.replay added " << *dn
<< dendl
;
1373 if (!dn
->get_linkage()->is_null()) {
1374 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1375 if (dn
->get_linkage()->is_primary()) {
1376 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1378 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1379 << " " << *dn
->get_linkage()->get_inode() << " should be remote " << p
->ino
;
1380 dout(0) << ss
.str() << dendl
;
1382 dir
->unlink_inode(dn
, false);
1384 dir
->link_remote_inode(dn
, p
->ino
, p
->d_type
);
1385 dn
->set_version(p
->dnv
);
1386 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1387 dout(10) << "EMetaBlob.replay for [" << p
->dnfirst
<< "," << p
->dnlast
<< "] had " << *dn
<< dendl
;
1388 dn
->first
= p
->dnfirst
;
1389 assert(dn
->last
== p
->dnlast
);
1391 if (lump
.is_importing())
1392 dn
->state_set(CDentry::STATE_AUTH
);
// Null dentries: add a null dentry, or unlink whatever an existing dentry
// points at (see the renamed-inode caveat below).
1396 for (list
<nullbit
>::const_iterator p
= lump
.get_dnull().begin();
1397 p
!= lump
.get_dnull().end();
1399 CDentry
*dn
= dir
->lookup_exact_snap(p
->dn
, p
->dnlast
);
1401 dn
= dir
->add_null_dentry(p
->dn
, p
->dnfirst
, p
->dnlast
);
1402 dn
->set_version(p
->dnv
);
1403 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1404 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn
<< dendl
;
1406 dn
->first
= p
->dnfirst
;
1407 if (!dn
->get_linkage()->is_null()) {
1408 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1409 CInode
*in
= dn
->get_linkage()->get_inode();
1410 // For renamed inode, We may call CInode::force_dirfrag() later.
1411 // CInode::force_dirfrag() doesn't work well when inode is detached
1412 // from the hierarchy.
1413 if (!renamed_diri
|| renamed_diri
!= in
) {
1414 if (dn
->get_linkage()->is_primary())
1416 dir
->unlink_inode(dn
);
1419 dn
->set_version(p
->dnv
);
1420 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1421 dout(10) << "EMetaBlob.replay had " << *dn
<< dendl
;
1422 assert(dn
->last
== p
->dnlast
);
1425 if (lump
.is_importing())
1426 dn
->state_set(CDentry::STATE_AUTH
);
1428 // Make null dentries the first things we trim
1429 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn
<< dendl
;
1433 assert(g_conf
->mds_kill_journal_replay_at
!= 3);
// Rename fix-up. When the renamed inode was already known it must appear in
// both `unlinked` and `linked`, and its old parent dirfrag is taken from
// `unlinked`.
1435 if (renamed_dirino
) {
1437 assert(unlinked
.count(renamed_diri
));
1438 assert(linked
.count(renamed_diri
));
1439 olddir
= unlinked
[renamed_diri
];
1441 // we imported a diri we haven't seen before
1442 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1443 assert(renamed_diri
); // it was in the metablob
// Moving from a dir with defined authority to an inode with undefined
// authority: walk the leaf dirfrags of the renamed inode.
1447 if (olddir
->authority() != CDIR_AUTH_UNDEF
&&
1448 renamed_diri
->authority() == CDIR_AUTH_UNDEF
) {
1449 assert(slaveup
); // auth to non-auth, must be slave prepare
1450 list
<frag_t
> leaves
;
1451 renamed_diri
->dirfragtree
.get_leaves(leaves
);
1452 for (list
<frag_t
>::iterator p
= leaves
.begin(); p
!= leaves
.end(); ++p
) {
1453 CDir
*dir
= renamed_diri
->get_dirfrag(*p
);
1455 if (dir
->get_dir_auth() == CDIR_AUTH_UNDEF
)
1456 // preserve subtree bound until slave commit
1457 slaveup
->olddirs
.insert(dir
->inode
);
1459 dir
->state_set(CDir::STATE_AUTH
);
1463 mds
->mdcache
->adjust_subtree_after_rename(renamed_diri
, olddir
, false);
1465 // see if we can discard the subtree we renamed out of
1466 CDir
*root
= mds
->mdcache
->get_subtree_root(olddir
);
1467 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
1468 if (slaveup
) // preserve the old dir until slave commit
1469 slaveup
->olddirs
.insert(olddir
->inode
);
1471 mds
->mdcache
->try_trim_non_auth_subtree(root
);
1475 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1476 if (renamed_diri
->authority() != CDIR_AUTH_UNDEF
) {
1477 for (list
<frag_t
>::iterator p
= renamed_dir_frags
.begin(); p
!= renamed_dir_frags
.end(); ++p
) {
1478 CDir
*dir
= renamed_diri
->get_dirfrag(*p
);
1480 // we already had the inode before, and we already adjusted this subtree accordingly.
1481 dout(10) << " already had+adjusted rename import bound " << *dir
<< dendl
;
1485 dir
= renamed_diri
->get_or_open_dirfrag(mds
->mdcache
, *p
);
1486 dout(10) << " creating new rename import bound " << *dir
<< dendl
;
1487 dir
->state_clear(CDir::STATE_AUTH
);
1488 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1492 // rename may overwrite an empty directory and move it into stray dir.
1493 unlinked
.erase(renamed_diri
);
1494 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1495 if (!linked
.count(p
->first
))
1497 assert(p
->first
->is_dir());
1498 mds
->mdcache
->adjust_subtree_after_rename(p
->first
, p
->second
, false);
// Whatever is still in `unlinked` is handed to slaveup->unlinked or removed
// from the cache recursively.
1502 if (!unlinked
.empty()) {
1503 for (set
<CInode
*>::iterator p
= linked
.begin(); p
!= linked
.end(); ++p
)
1505 dout(10) << " unlinked set contains " << unlinked
<< dendl
;
1506 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1507 if (slaveup
) // preserve unlinked inodes until slave commit
1508 slaveup
->unlinked
.insert(p
->first
);
1510 mds
->mdcache
->remove_inode_recursive(p
->first
);
1514 // table client transactions
1515 for (list
<pair
<__u8
,version_t
> >::iterator p
= table_tids
.begin();
1516 p
!= table_tids
.end();
1518 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p
->first
)
1519 << " transaction " << p
->second
<< dendl
;
1520 MDSTableClient
*client
= mds
->get_table_client(p
->first
);
1522 client
->got_journaled_agree(p
->second
, logseg
);
// Opened inode: queue it on the segment's open_files list.
1527 CInode
*in
= mds
->mdcache
->get_inode(opened_ino
);
1529 dout(10) << "EMetaBlob.replay noting opened inode " << *in
<< dendl
;
1530 logseg
->open_files
.push_back(&in
->item_open_file
);
// Inotable: nothing to replay when the table is already at or beyond
// inotablev; otherwise replay the allocations and expect the table version
// to end up equal to inotablev.
1535 if (mds
->inotable
->get_version() >= inotablev
) {
1536 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1537 << " <= table " << mds
->inotable
->get_version() << dendl
;
1539 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1540 << " - 1 == table " << mds
->inotable
->get_version()
1541 << " allocated+used " << allocated_ino
1542 << " prealloc " << preallocated_inos
1545 mds
->inotable
->replay_alloc_id(allocated_ino
);
1546 if (preallocated_inos
.size())
1547 mds
->inotable
->replay_alloc_ids(preallocated_inos
);
1549 // [repair bad inotable updates]
1550 if (inotablev
> mds
->inotable
->get_version()) {
1551 mds
->clog
->error() << "journal replay inotablev mismatch "
1552 << mds
->inotable
->get_version() << " -> " << inotablev
;
1553 mds
->inotable
->force_replay_version(inotablev
);
1556 assert(inotablev
== mds
->inotable
->get_version());
// Sessionmap: no-op when already at or beyond sessionmapv; when within two
// versions, replay the session's ino usage; otherwise report the gap and
// wipe the sessionmap (asserting mds_wipe_sessions).
1560 if (mds
->sessionmap
.get_version() >= sessionmapv
) {
1561 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1562 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
1563 } else if (mds
->sessionmap
.get_version() + 2 >= sessionmapv
) {
1564 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1565 << " -(1|2) == table " << mds
->sessionmap
.get_version()
1566 << " prealloc " << preallocated_inos
1567 << " used " << used_preallocated_ino
1569 Session
*session
= mds
->sessionmap
.get_session(client_name
);
1571 dout(20) << " (session prealloc " << session
->info
.prealloc_inos
<< ")" << dendl
;
1572 if (used_preallocated_ino
) {
1573 if (!session
->info
.prealloc_inos
.empty()) {
1574 inodeno_t next
= session
->next_ino();
1575 inodeno_t i
= session
->take_ino(used_preallocated_ino
);
1577 mds
->clog
->warn() << " replayed op " << client_reqs
<< " used ino " << i
1578 << " but session next is " << next
;
1579 assert(i
== used_preallocated_ino
);
1580 session
->info
.used_inos
.clear();
1582 mds
->sessionmap
.replay_dirty_session(session
);
1584 if (!preallocated_inos
.empty()) {
1585 session
->info
.prealloc_inos
.insert(preallocated_inos
);
1586 mds
->sessionmap
.replay_dirty_session(session
);
// With no session for client_name, only the sessionmap version is advanced.
1590 dout(10) << "EMetaBlob.replay no session for " << client_name
<< dendl
;
1591 if (used_preallocated_ino
) {
1592 mds
->sessionmap
.replay_advance_version();
1594 if (!preallocated_inos
.empty())
1595 mds
->sessionmap
.replay_advance_version();
1597 assert(sessionmapv
== mds
->sessionmap
.get_version());
1599 mds
->clog
->error() << "journal replay sessionmap v " << sessionmapv
1600 << " -(1|2) > table " << mds
->sessionmap
.get_version();
1601 assert(g_conf
->mds_wipe_sessions
);
1602 mds
->sessionmap
.wipe();
1603 mds
->sessionmap
.set_version(sessionmapv
);
1607 // truncating inodes
1608 for (list
<inodeno_t
>::iterator p
= truncate_start
.begin();
1609 p
!= truncate_start
.end();
1611 CInode
*in
= mds
->mdcache
->get_inode(*p
);
1613 mds
->mdcache
->add_recovered_truncate(in
, logseg
);
// Finished truncates: remove them from the segment named by the mapped
// value.
1615 for (map
<inodeno_t
,uint64_t>::iterator p
= truncate_finish
.begin();
1616 p
!= truncate_finish
.end();
1618 LogSegment
*ls
= mds
->mdlog
->get_segment(p
->second
);
1620 CInode
*in
= mds
->mdcache
->get_inode(p
->first
);
1622 mds
->mdcache
->remove_recovered_truncate(in
, ls
);
// Destroyed inodes: drop each one still in the cache; its former parent
// dentry must be left null.
1627 for (vector
<inodeno_t
>::iterator p
= destroyed_inodes
.begin();
1628 p
!= destroyed_inodes
.end();
1630 CInode
*in
= mds
->mdcache
->get_inode(*p
);
1632 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", dropping " << *in
<< dendl
;
1633 CDentry
*parent
= in
->get_parent_dn();
1634 mds
->mdcache
->remove_inode(in
);
1636 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent
<< dendl
;
1637 assert(parent
->get_linkage()->is_null());
1640 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", not in cache" << dendl
;
// Client requests: record each completed request (with the ino it created,
// if any) on the client's session and trim older completed requests.
1645 for (list
<pair
<metareqid_t
, uint64_t> >::iterator p
= client_reqs
.begin();
1646 p
!= client_reqs
.end();
1648 if (p
->first
.name
.is_client()) {
1649 dout(10) << "EMetaBlob.replay request " << p
->first
<< " trim_to " << p
->second
<< dendl
;
1650 inodeno_t created
= allocated_ino
? allocated_ino
: used_preallocated_ino
;
1651 // if we allocated an inode, there should be exactly one client request id.
1652 assert(created
== inodeno_t() || client_reqs
.size() == 1);
1654 Session
*session
= mds
->sessionmap
.get_session(p
->first
.name
);
1656 session
->add_completed_request(p
->first
.tid
, created
);
1658 session
->trim_completed_requests(p
->second
);
// Client flushes: same pattern as requests, using the completed-flush
// bookkeeping on the session.
1664 for (list
<pair
<metareqid_t
, uint64_t> >::iterator p
= client_flushes
.begin();
1665 p
!= client_flushes
.end();
1667 if (p
->first
.name
.is_client()) {
1668 dout(10) << "EMetaBlob.replay flush " << p
->first
<< " trim_to " << p
->second
<< dendl
;
1669 Session
*session
= mds
->sessionmap
.get_session(p
->first
.name
);
1671 session
->add_completed_flush(p
->first
.tid
);
1673 session
->trim_completed_flushes(p
->second
);
// Finish by updating the log segment from this blob.
1679 update_segment(logseg
);
1681 assert(g_conf
->mds_kill_journal_replay_at
!= 4);
1684 // -----------------------
// Record this event's session map version (cmapv) on its log segment and,
// when the event carries inos together with a non-zero inotablev, the
// inotable version as well.
1687 void ESession::update_segment()
1689 _segment
->sessionmapv
= cmapv
;
1690 if (inos
.size() && inotablev
)
1691 _segment
->inotablev
= inotablev
;
// Replay a client session open/close event.
//
// Nothing is replayed when the sessionmap is already at or beyond cmapv.
// Otherwise an open event gets or adds the session, sets it to STATE_OPEN
// and stores the client metadata; a close event removes the session when it
// has no connection, or resets it when the client has reconnected. The
// sessionmap version must end up equal to cmapv. If the event carries inos
// and an inotablev newer than the inotable, the inos are released back to
// the inotable.
1694 void ESession::replay(MDSRank
*mds
)
1696 if (mds
->sessionmap
.get_version() >= cmapv
) {
1697 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1698 << " >= " << cmapv
<< ", noop" << dendl
;
1700 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1701 << " < " << cmapv
<< " " << (open
? "open":"close") << " " << client_inst
<< dendl
;
// Open: get or add the session, mark it open and store its metadata.
1704 session
= mds
->sessionmap
.get_or_add_session(client_inst
);
1705 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1706 session
->set_client_metadata(client_metadata
);
1707 dout(10) << " opened session " << session
->info
.inst
<< dendl
;
// Close: look the session up by client name.
1709 session
= mds
->sessionmap
.get_session(client_inst
.name
);
1710 if (session
) { // there always should be a session, but there's a bug
1711 if (session
->connection
== NULL
) {
1712 dout(10) << " removed session " << session
->info
.inst
<< dendl
;
1713 mds
->sessionmap
.remove_session(session
);
1716 session
->clear(); // the client has reconnected; keep the Session, but reset
1717 dout(10) << " reset session " << session
->info
.inst
<< " (they reconnected)" << dendl
;
1720 mds
->clog
->error() << "replayed stray Session close event for " << client_inst
1721 << " from time " << stamp
<< ", ignoring";
// Advance the sessionmap version, by dirtying the session or by an explicit
// version bump, so that it lands on cmapv.
1725 mds
->sessionmap
.replay_dirty_session(session
);
1727 mds
->sessionmap
.replay_advance_version();
1729 assert(mds
->sessionmap
.get_version() == cmapv
);
// Inotable part: only close events may carry inos here (asserted "for now").
1732 if (inos
.size() && inotablev
) {
1733 if (mds
->inotable
->get_version() >= inotablev
) {
1734 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1735 << " >= " << inotablev
<< ", noop" << dendl
;
1737 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1738 << " < " << inotablev
<< " " << (open
? "add":"remove") << dendl
;
1739 assert(!open
); // for now
1740 mds
->inotable
->replay_release_ids(inos
);
1741 assert(mds
->inotable
->get_version() == inotablev
);
// Serialize this event into `bl` with struct version 4 and compat version 3.
// `features` is passed through only to the client_inst encoder.
1748 void ESession::encode(bufferlist
&bl
, uint64_t features
) const
1750 ENCODE_START(4, 3, bl
);
1751 ::encode(stamp
, bl
);
1752 ::encode(client_inst
, bl
, features
);
1754 ::encode(cmapv
, bl
);
1756 ::encode(inotablev
, bl
);
// client_metadata is the field decode() reads only for struct_v >= 4.
1757 ::encode(client_metadata
, bl
);
// Deserialize this event from `bl`, using the legacy-compatible decode
// prologue. client_metadata is read only when the encoded struct version is
// at least 4.
1761 void ESession::decode(bufferlist::iterator
&bl
)
1763 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl
);
1765 ::decode(stamp
, bl
);
1766 ::decode(client_inst
, bl
);
1768 ::decode(cmapv
, bl
);
1770 ::decode(inotablev
, bl
);
1771 if (struct_v
>= 4) {
1772 ::decode(client_metadata
, bl
);
// Write this event's fields to the Formatter `f`: client instance, the open
// flag as the string "true"/"false", client map version, inos, inotable
// version, and the client metadata as an object of key/value strings.
1777 void ESession::dump(Formatter
*f
) const
1779 f
->dump_stream("client instance") << client_inst
;
1780 f
->dump_string("open", open
? "true" : "false");
1781 f
->dump_int("client map version", cmapv
);
1782 f
->dump_stream("inos") << inos
;
1783 f
->dump_int("inotable version", inotablev
);
// Each metadata entry becomes a string field named after its key.
1784 f
->open_object_section("client_metadata");
1785 for (map
<string
, string
>::const_iterator i
= client_metadata
.begin();
1786 i
!= client_metadata
.end(); ++i
) {
1787 f
->dump_string(i
->first
.c_str(), i
->second
);
1789 f
->close_section(); // client_metadata
// Append one ESession allocated with new to `ls`. Presumably the caller
// takes ownership -- confirm against callers.
1792 void ESession::generate_test_instances(list
<ESession
*>& ls
)
1794 ls
.push_back(new ESession
);
1797 // -----------------------
// Serialize this event into `bl` (struct version 1, compat 1): client_map
// (encoded with `features`), then cmapv, then stamp.
1800 void ESessions::encode(bufferlist
&bl
, uint64_t features
) const
1802 ENCODE_START(1, 1, bl
);
1803 ::encode(client_map
, bl
, features
);
1804 ::encode(cmapv
, bl
);
1805 ::encode(stamp
, bl
);
// Decode the older layout, in which the fields are read straight from `bl`
// with no DECODE_START prologue: client_map, cmapv, and stamp last.
1809 void ESessions::decode_old(bufferlist::iterator
&bl
)
1811 ::decode(client_map
, bl
);
1812 ::decode(cmapv
, bl
);
1814 ::decode(stamp
, bl
);
// Decode the versioned layout (DECODE_START with version 1): client_map,
// cmapv, and stamp last. Field order mirrors encode().
1817 void ESessions::decode_new(bufferlist::iterator
&bl
)
1819 DECODE_START(1, bl
);
1820 ::decode(client_map
, bl
);
1821 ::decode(cmapv
, bl
);
1823 ::decode(stamp
, bl
);
// Write this event to the Formatter `f`: the client map version followed by
// one "client" object (id and entity) per client_map entry.
1827 void ESessions::dump(Formatter
*f
) const
1829 f
->dump_int("client map version", cmapv
);
1831 f
->open_array_section("client map");
1832 for (map
<client_t
,entity_inst_t
>::const_iterator i
= client_map
.begin();
1833 i
!= client_map
.end(); ++i
) {
1834 f
->open_object_section("client");
1835 f
->dump_int("client id", i
->first
.v
);
1836 f
->dump_stream("client entity") << i
->second
;
1837 f
->close_section(); // client
1839 f
->close_section(); // client map
// Append one default-constructed ESessions (allocated with new) to `ls`.
// Presumably the caller takes ownership -- confirm against callers.
1842 void ESessions::generate_test_instances(list
<ESessions
*>& ls
)
1844 ls
.push_back(new ESessions());
// Record this event's session map version (cmapv) on its log segment.
1847 void ESessions::update_segment()
1849 _segment
->sessionmapv
= cmapv
;
// Replay a bulk session-open event. Nothing is replayed when the sessionmap
// is already at or beyond cmapv; otherwise the sessions in client_map are
// opened, the sessionmap version must then equal cmapv, and the projected
// version is set to the current version.
1852 void ESessions::replay(MDSRank
*mds
)
1854 if (mds
->sessionmap
.get_version() >= cmapv
) {
1855 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1856 << " >= " << cmapv
<< ", noop" << dendl
;
1858 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1859 << " < " << cmapv
<< dendl
;
1860 mds
->sessionmap
.open_sessions(client_map
);
1861 assert(mds
->sessionmap
.get_version() == cmapv
);
1862 mds
->sessionmap
.set_projected(mds
->sessionmap
.get_version());
1868 // -----------------------
// Serialize this table-server event into `bl` (struct version 3, compat 3).
// `features` is not used by any of the encode calls shown here.
1871 void ETableServer::encode(bufferlist
& bl
, uint64_t features
) const
1873 ENCODE_START(3, 3, bl
);
1874 ::encode(stamp
, bl
);
1875 ::encode(table
, bl
);
1877 ::encode(reqid
, bl
);
1878 ::encode(bymds
, bl
);
1879 ::encode(mutation
, bl
);
1881 ::encode(version
, bl
);
// Deserialize this table-server event from `bl`, using the legacy-compatible
// decode prologue. Field order mirrors encode().
1885 void ETableServer::decode(bufferlist::iterator
&bl
)
1887 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
1889 ::decode(stamp
, bl
);
1890 ::decode(table
, bl
);
1892 ::decode(reqid
, bl
);
1893 ::decode(bymds
, bl
);
1894 ::decode(mutation
, bl
);
1896 ::decode(version
, bl
);
// Write this event's scalar fields to the Formatter `f` as integers: table
// id, op, request id, originating mds, tid and version.
1900 void ETableServer::dump(Formatter
*f
) const
1902 f
->dump_int("table id", table
);
1903 f
->dump_int("op", op
);
1904 f
->dump_int("request id", reqid
);
1905 f
->dump_int("by mds", bymds
);
1906 f
->dump_int("tid", tid
);
1907 f
->dump_int("version", version
);
// Append one default-constructed ETableServer (allocated with new) to `ls`.
// Presumably the caller takes ownership -- confirm against callers.
1910 void ETableServer::generate_test_instances(list
<ETableServer
*>& ls
)
1912 ls
.push_back(new ETableServer());
// Record this event's table version on its log segment, keyed by table id.
1916 void ETableServer::update_segment()
1918 _segment
->tablev
[table
] = version
;
// Replay a table-server event against the server for `table`.
//
// Nothing is replayed when the server is already at or beyond `version`.
// Otherwise the server must be exactly one version behind; the op is then
// applied (prepare, commit, rollback or server update), after which the
// server version must equal `version`. An unrecognised op is reported to the
// cluster log as an error.
1921 void ETableServer::replay(MDSRank
*mds
)
1923 MDSTableServer
*server
= mds
->get_table_server(table
);
1927 if (server
->get_version() >= version
) {
1928 dout(10) << "ETableServer.replay " << get_mdstable_name(table
)
1929 << " " << get_mdstableserver_opname(op
)
1930 << " event " << version
1931 << " <= table " << server
->get_version() << dendl
;
1935 dout(10) << " ETableServer.replay " << get_mdstable_name(table
)
1936 << " " << get_mdstableserver_opname(op
)
1937 << " event " << version
<< " - 1 == table " << server
->get_version() << dendl
;
1938 assert(version
-1 == server
->get_version());
// The prepare, commit and rollback ops each apply the change and then call
// the matching _note_* hook; the server-update op has no _note_* call.
1941 case TABLESERVER_OP_PREPARE
:
1942 server
->_prepare(mutation
, reqid
, bymds
);
1943 server
->_note_prepare(bymds
, reqid
);
1945 case TABLESERVER_OP_COMMIT
:
1946 server
->_commit(tid
);
1947 server
->_note_commit(tid
);
1949 case TABLESERVER_OP_ROLLBACK
:
1950 server
->_rollback(tid
);
1951 server
->_note_rollback(tid
);
1953 case TABLESERVER_OP_SERVER_UPDATE
:
1954 server
->_server_update(mutation
);
1957 mds
->clog
->error() << "invalid tableserver op in ETableServer";
1959 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1962 assert(version
== server
->get_version());
1967 // ---------------------
// Serialize this table-client event into `bl` (struct version 3, compat 3).
// `features` is not used by the encode calls shown here.
1970 void ETableClient::encode(bufferlist
& bl
, uint64_t features
) const
1972 ENCODE_START(3, 3, bl
);
1973 ::encode(stamp
, bl
);
1974 ::encode(table
, bl
);
// Deserialize this table-client event from `bl`, using the legacy-compatible
// decode prologue. Field order mirrors encode().
1980 void ETableClient::decode(bufferlist::iterator
&bl
)
1982 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
1984 ::decode(stamp
, bl
);
1985 ::decode(table
, bl
);
// Write this event's table, op and tid to the Formatter `f` as integers.
1991 void ETableClient::dump(Formatter
*f
) const
1993 f
->dump_int("table", table
);
1994 f
->dump_int("op", op
);
1995 f
->dump_int("tid", tid
);
// Append one default-constructed ETableClient (allocated with new) to `ls`.
// Presumably the caller takes ownership -- confirm against callers.
1998 void ETableClient::generate_test_instances(list
<ETableClient
*>& ls
)
2000 ls
.push_back(new ETableClient());
// Replay a table-client event: log it, look up the client for `table`, and
// tell it that the ack for `tid` was journaled. Only TABLESERVER_OP_ACK is
// expected here (asserted).
2003 void ETableClient::replay(MDSRank
*mds
)
2005 dout(10) << " ETableClient.replay " << get_mdstable_name(table
)
2006 << " op " << get_mdstableserver_opname(op
)
2007 << " tid " << tid
<< dendl
;
2009 MDSTableClient
*client
= mds
->get_table_client(table
);
2013 assert(op
== TABLESERVER_OP_ACK
);
2014 client
->got_journaled_ack(tid
);
2018 // -----------------------
// Record this event's version as the segment's TABLE_SNAP table version.
2021 void ESnap::update_segment()
2023 _segment->tablev[TABLE_SNAP] = version;
// Replay a snapshot table event. Nothing is replayed when the snaptable is
// already at or beyond `version`; otherwise the table must be exactly one
// version behind. The snapshot is then created (the returned id must match
// snap.snapid) or removed, and the table version must equal `version`.
2026 void ESnap::replay(MDSRank *mds)
2028 if (mds->snaptable->get_version() >= version) {
2029 dout(10) << "ESnap.replay event " << version
2030 << " <= table " << mds->snaptable->get_version() << dendl;
2034 dout(10) << " ESnap.replay event " << version
2035 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2036 assert(version-1 == mds->snaptable->get_version());
2040 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2041 assert(s == snap.snapid);
2043 mds->snaptable->remove(snap.snapid);
2046 assert(version == mds->snaptable->get_version());
2052 // -----------------------
// Serialize this update event into `bl` (struct version 4, compat 4).
// `features` is passed through only to the metablob encoder.
2055 void EUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2057 ENCODE_START(4, 4, bl
);
2058 ::encode(stamp
, bl
);
2060 ::encode(metablob
, bl
, features
);
2061 ::encode(client_map
, bl
);
2062 ::encode(cmapv
, bl
);
2063 ::encode(reqid
, bl
);
2064 ::encode(had_slaves
, bl
);
// Deserialize this update event from `bl`, using the legacy-compatible
// decode prologue. Field order mirrors encode().
2068 void EUpdate::decode(bufferlist::iterator
&bl
)
2070 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl
);
2072 ::decode(stamp
, bl
);
2074 ::decode(metablob
, bl
);
2075 ::decode(client_map
, bl
);
2077 ::decode(cmapv
, bl
);
2078 ::decode(reqid
, bl
);
2079 ::decode(had_slaves
, bl
);
// Write this event to the Formatter `f`: a "metablob" section, the type
// string, the byte length of the encoded client map (not its contents), the
// client map version, the request id, and had_slaves as "true"/"false".
2083 void EUpdate::dump(Formatter
*f
) const
2085 f
->open_object_section("metablob");
2087 f
->close_section(); // metablob
2089 f
->dump_string("type", type
);
2090 f
->dump_int("client map length", client_map
.length());
2091 f
->dump_int("client map version", cmapv
);
2092 f
->dump_stream("reqid") << reqid
;
2093 f
->dump_string("had slaves", had_slaves
? "true" : "false");
// Append one default-constructed EUpdate (allocated with new) to `ls`.
// Presumably the caller takes ownership -- confirm against callers.
2096 void EUpdate::generate_test_instances(list
<EUpdate
*>& ls
)
2098 ls
.push_back(new EUpdate());
// Propagate this event's state to its log segment: let the metablob update
// the segment, record cmapv as the segment's sessionmap version when a
// client map is attached, and note reqid among the segment's uncommitted
// masters.
2102 void EUpdate::update_segment()
2104 metablob
.update_segment(_segment
);
2106 if (client_map
.length())
2107 _segment
->sessionmapv
= cmapv
;
2110 _segment
->uncommitted_masters
.insert(reqid
);
// Re-apply this update during journal replay.
// Replays the metablob against this segment, registers reqid as an
// uncommitted master with an empty slave set (the log message ties this to
// updates that had slaves and expect a later ECommitted), and, when a client
// map is attached, compares the session map version with cmapv. Per the log
// messages, an up-to-date session map is left alone; otherwise sessions are
// opened, the version is asserted to equal cmapv, and the projected version
// is reset to the current one.
2113 void EUpdate::replay(MDSRank
*mds
)
2115 metablob
.replay(mds
, _segment
);
2118 dout(10) << "EUpdate.replay " << reqid
<< " had slaves, expecting a matching ECommitted" << dendl
;
2119 _segment
->uncommitted_masters
.insert(reqid
);
2120 set
<mds_rank_t
> slaves
;
2121 mds
->mdcache
->add_uncommitted_master(reqid
, _segment
, slaves
, true);
2124 if (client_map
.length()) {
2125 if (mds
->sessionmap
.get_version() >= cmapv
) {
2126 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2127 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
2129 dout(10) << "EUpdate.replay sessionmap " << mds
->sessionmap
.get_version()
2130 << " < " << cmapv
<< dendl
;
2131 // open client sessions?
2132 map
<client_t
,entity_inst_t
> cm
;
2133 bufferlist::iterator blp
= client_map
.begin();
2135 mds
->sessionmap
.open_sessions(cm
);
2137 assert(mds
->sessionmap
.get_version() == cmapv
);
2138 mds
->sessionmap
.set_projected(mds
->sessionmap
.get_version());
2145 // ------------------------
// Serialize this EOpen into bl inside an ENCODE_START(4, 3, ...) section.
// Fields written here include stamp, metablob (with `features`) and snap_inos.
2148 void EOpen::encode(bufferlist
&bl
, uint64_t features
) const {
2149 ENCODE_START(4, 3, bl
);
2150 ::encode(stamp
, bl
);
2151 ::encode(metablob
, bl
, features
);
2153 ::encode(snap_inos
, bl
);
// Deserialize an EOpen from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); fields read here include
// stamp, metablob and snap_inos.
2157 void EOpen::decode(bufferlist::iterator
&bl
) {
2158 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2160 ::decode(stamp
, bl
);
2161 ::decode(metablob
, bl
);
2164 ::decode(snap_inos
, bl
);
// Describe this event through Formatter f: a "metablob" object section
// followed by an "inos involved" array holding one "ino" integer per entry
// of inos.
2168 void EOpen::dump(Formatter
*f
) const
2170 f
->open_object_section("metablob");
2172 f
->close_section(); // metablob
2173 f
->open_array_section("inos involved");
2174 for (vector
<inodeno_t
>::const_iterator i
= inos
.begin();
2175 i
!= inos
.end(); ++i
) {
2176 f
->dump_int("ino", *i
);
2178 f
->close_section(); // inos
// Append two heap-allocated EOpen instances to ls: a default one, and a
// second one that additionally has inode number 0 added via add_ino().
2181 void EOpen::generate_test_instances(list
<EOpen
*>& ls
)
2183 ls
.push_back(new EOpen());
2184 ls
.push_back(new EOpen());
2185 ls
.back()->add_ino(0);
// Segment-update hook for EOpen.
// NOTE(review): no body statements are visible for this definition here.
2188 void EOpen::update_segment()
// Re-apply this event during journal replay.
// Replays the metablob, then walks inos and snap_inos: each inode is looked
// up in the cache and its item_open_file is linked into this segment's
// open_files list. The level-0 "not in metablob" message belongs to the case
// where the cache lookup does not produce an inode.
2193 void EOpen::replay(MDSRank
*mds
)
2195 dout(10) << "EOpen.replay " << dendl
;
2196 metablob
.replay(mds
, _segment
);
2198 // note which segments inodes belong to, so we don't have to start rejournaling them
2199 for (const auto &ino
: inos
) {
2200 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2202 dout(0) << "EOpen.replay ino " << ino
<< " not in metablob" << dendl
;
2205 _segment
->open_files
.push_back(&in
->item_open_file
);
2207 for (const auto &vino
: snap_inos
) {
2208 CInode
*in
= mds
->mdcache
->get_inode(vino
);
2210 dout(0) << "EOpen.replay ino " << vino
<< " not in metablob" << dendl
;
2213 _segment
->open_files
.push_back(&in
->item_open_file
);
2218 // -----------------------
// Re-apply a commit record during journal replay.
// When reqid is present in the cache's uncommitted_masters map, it is erased
// both from the owning log segment's uncommitted_masters set (reached via
// the map entry's ls pointer) and from the cache map itself. The second log
// message covers the case where the original op was never seen.
2221 void ECommitted::replay(MDSRank
*mds
)
2223 if (mds
->mdcache
->uncommitted_masters
.count(reqid
)) {
2224 dout(10) << "ECommitted.replay " << reqid
<< dendl
;
2225 mds
->mdcache
->uncommitted_masters
[reqid
].ls
->uncommitted_masters
.erase(reqid
);
2226 mds
->mdcache
->uncommitted_masters
.erase(reqid
);
2228 dout(10) << "ECommitted.replay " << reqid
<< " -- didn't see original op" << dendl
;
// Serialize this ECommitted into bl inside an ENCODE_START(3, 3, ...)
// section: stamp followed by reqid. `features` is not referenced in the
// statements shown here.
2232 void ECommitted::encode(bufferlist
& bl
, uint64_t features
) const
2234 ENCODE_START(3, 3, bl
);
2235 ::encode(stamp
, bl
);
2236 ::encode(reqid
, bl
);
// Deserialize an ECommitted from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); stamp and reqid are read here.
2240 void ECommitted::decode(bufferlist::iterator
& bl
)
2242 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2244 ::decode(stamp
, bl
);
2245 ::decode(reqid
, bl
);
// Describe this event through Formatter f by streaming "stamp" and "reqid".
2249 void ECommitted::dump(Formatter
*f
) const {
2250 f
->dump_stream("stamp") << stamp
;
2251 f
->dump_stream("reqid") << reqid
;
// Append two heap-allocated ECommitted instances to ls: a default one, and
// a second one with stamp utime_t(1, 2) and a reqid built from client 123
// with the value 456.
2254 void ECommitted::generate_test_instances(list
<ECommitted
*>& ls
)
2256 ls
.push_back(new ECommitted
);
2257 ls
.push_back(new ECommitted
);
2258 ls
.back()->stamp
= utime_t(1, 2);
2259 ls
.back()->reqid
= metareqid_t(entity_name_t::CLIENT(123), 456);
2262 // -----------------------
// Serialize this link rollback record into bl inside an
// ENCODE_START(2, 2, ...) section. Fields written here include reqid,
// was_inc, old_ctime, old_dir_mtime and old_dir_rctime.
2265 void link_rollback::encode(bufferlist
&bl
) const
2267 ENCODE_START(2, 2, bl
);
2268 ::encode(reqid
, bl
);
2270 ::encode(was_inc
, bl
);
2271 ::encode(old_ctime
, bl
);
2272 ::encode(old_dir_mtime
, bl
);
2273 ::encode(old_dir_rctime
, bl
);
// Deserialize a link rollback record from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); fields read here include
// reqid, was_inc, old_ctime, old_dir_mtime and old_dir_rctime.
2277 void link_rollback::decode(bufferlist::iterator
&bl
)
2279 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2280 ::decode(reqid
, bl
);
2282 ::decode(was_inc
, bl
);
2283 ::decode(old_ctime
, bl
);
2284 ::decode(old_dir_mtime
, bl
);
2285 ::decode(old_dir_rctime
, bl
);
// Describe this record through Formatter f: the request id, the inode
// number, was_inc rendered as "true"/"false", and the three saved
// timestamps (old_ctime, old_dir_mtime, old_dir_rctime).
2289 void link_rollback::dump(Formatter
*f
) const
2291 f
->dump_stream("metareqid") << reqid
;
2292 f
->dump_int("ino", ino
);
2293 f
->dump_string("was incremented", was_inc
? "true" : "false");
2294 f
->dump_stream("old_ctime") << old_ctime
;
2295 f
->dump_stream("old_dir_mtime") << old_dir_mtime
;
2296 f
->dump_stream("old_dir_rctime") << old_dir_rctime
;
// Append one default-constructed, heap-allocated link_rollback to ls.
2299 void link_rollback::generate_test_instances(list
<link_rollback
*>& ls
)
2301 ls
.push_back(new link_rollback());
// Serialize this rmdir rollback record into bl inside an
// ENCODE_START(2, 2, ...) section: reqid, then the source directory and
// dentry name, then the destination directory and dentry name.
2304 void rmdir_rollback::encode(bufferlist
& bl
) const
2306 ENCODE_START(2, 2, bl
);
2307 ::encode(reqid
, bl
);
2308 ::encode(src_dir
, bl
);
2309 ::encode(src_dname
, bl
);
2310 ::encode(dest_dir
, bl
);
2311 ::encode(dest_dname
, bl
);
// Deserialize an rmdir rollback record from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); fields are read in the order
// rmdir_rollback::encode writes them.
2315 void rmdir_rollback::decode(bufferlist::iterator
& bl
)
2317 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2318 ::decode(reqid
, bl
);
2319 ::decode(src_dir
, bl
);
2320 ::decode(src_dname
, bl
);
2321 ::decode(dest_dir
, bl
);
2322 ::decode(dest_dname
, bl
);
// Describe this record through Formatter f: the request id plus the source
// and destination directory/dentry-name pairs.
2326 void rmdir_rollback::dump(Formatter
*f
) const
2328 f
->dump_stream("metareqid") << reqid
;
2329 f
->dump_stream("source directory") << src_dir
;
2330 f
->dump_string("source dname", src_dname
);
2331 f
->dump_stream("destination directory") << dest_dir
;
2332 f
->dump_string("destination dname", dest_dname
);
// Append one default-constructed, heap-allocated rmdir_rollback to ls.
2335 void rmdir_rollback::generate_test_instances(list
<rmdir_rollback
*>& ls
)
2337 ls
.push_back(new rmdir_rollback());
// Serialize one dentry record of a rename rollback into bl inside an
// ENCODE_START(2, 2, ...) section. Fields written here include dirfrag and
// its saved mtime/rctime, remote_ino, dname, remote_d_type and old_ctime.
2340 void rename_rollback::drec::encode(bufferlist
&bl
) const
2342 ENCODE_START(2, 2, bl
);
2343 ::encode(dirfrag
, bl
);
2344 ::encode(dirfrag_old_mtime
, bl
);
2345 ::encode(dirfrag_old_rctime
, bl
);
2347 ::encode(remote_ino
, bl
);
2348 ::encode(dname
, bl
);
2349 ::encode(remote_d_type
, bl
);
2350 ::encode(old_ctime
, bl
);
// Deserialize one dentry record of a rename rollback from bl. The section
// is opened with DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); the fields
// read here mirror the order used by drec::encode.
2354 void rename_rollback::drec::decode(bufferlist::iterator
&bl
)
2356 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2357 ::decode(dirfrag
, bl
);
2358 ::decode(dirfrag_old_mtime
, bl
);
2359 ::decode(dirfrag_old_rctime
, bl
);
2361 ::decode(remote_ino
, bl
);
2362 ::decode(dname
, bl
);
2363 ::decode(remote_d_type
, bl
);
2364 ::decode(old_ctime
, bl
);
// Describe this dentry record through Formatter f.
// Besides the raw fields, remote_d_type is converted with DTTOIF and masked
// with S_IFMT, then reported as "remote dtype" using one of the strings
// "file", "symlink", "directory", or "UNKNOWN-<n>" for any other value.
2368 void rename_rollback::drec::dump(Formatter
*f
) const
2370 f
->dump_stream("directory fragment") << dirfrag
;
2371 f
->dump_stream("directory old mtime") << dirfrag_old_mtime
;
2372 f
->dump_stream("directory old rctime") << dirfrag_old_rctime
;
2373 f
->dump_int("ino", ino
);
2374 f
->dump_int("remote ino", remote_ino
);
2375 f
->dump_string("dname", dname
);
2376 uint32_t type
= DTTOIF(remote_d_type
) & S_IFMT
; // convert to type entries
2380 type_string
= "file"; break;
2382 type_string
= "symlink"; break;
2384 type_string
= "directory"; break;
2386 type_string
= "UNKNOWN-" + stringify((int)type
); break;
2388 f
->dump_string("remote dtype", type_string
);
2389 f
->dump_stream("old ctime") << old_ctime
;
// Append one heap-allocated drec to ls, with remote_d_type set to the
// directory-entry type derived from S_IFREG via IFTODT.
2392 void rename_rollback::drec::generate_test_instances(list
<drec
*>& ls
)
2394 ls
.push_back(new drec());
2395 ls
.back()->remote_d_type
= IFTODT(S_IFREG
);
// Serialize this rename rollback record into bl inside an
// ENCODE_START(2, 2, ...) section. Fields written here include reqid, the
// orig_src and orig_dest dentry records, and ctime. The dentry records go
// through the unqualified encode(), the other fields through ::encode().
2398 void rename_rollback::encode(bufferlist
&bl
) const
2400 ENCODE_START(2, 2, bl
);
2401 ::encode(reqid
, bl
);
2402 encode(orig_src
, bl
);
2403 encode(orig_dest
, bl
);
2405 ::encode(ctime
, bl
);
// Deserialize a rename rollback record from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); fields read here include
// reqid, orig_src, orig_dest and ctime.
2409 void rename_rollback::decode(bufferlist::iterator
&bl
)
2411 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2412 ::decode(reqid
, bl
);
2413 decode(orig_src
, bl
);
2414 decode(orig_dest
, bl
);
2416 ::decode(ctime
, bl
);
// Describe this record through Formatter f: the request id, three object
// sections ("original src drec", "original dest drec", "stray drec"), and
// the ctime.
2420 void rename_rollback::dump(Formatter
*f
) const
2422 f
->dump_stream("request id") << reqid
;
2423 f
->open_object_section("original src drec");
2425 f
->close_section(); // original src drec
2426 f
->open_object_section("original dest drec");
2428 f
->close_section(); // original dest drec
2429 f
->open_object_section("stray drec");
2431 f
->close_section(); // stray drec
2432 f
->dump_stream("ctime") << ctime
;
// Append one heap-allocated rename_rollback to ls and set remote_d_type of
// its orig_src, orig_dest and stray records to IFTODT(S_IFREG).
2435 void rename_rollback::generate_test_instances(list
<rename_rollback
*>& ls
)
2437 ls
.push_back(new rename_rollback());
2438 ls
.back()->orig_src
.remote_d_type
= IFTODT(S_IFREG
);
2439 ls
.back()->orig_dest
.remote_d_type
= IFTODT(S_IFREG
);
2440 ls
.back()->stray
.remote_d_type
= IFTODT(S_IFREG
);
// Serialize this ESlaveUpdate into bl inside an ENCODE_START(3, 3, ...)
// section. Fields written here include stamp, reqid, master, origop, the
// commit blob (which also receives `features`) and the rollback buffer.
2443 void ESlaveUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2445 ENCODE_START(3, 3, bl
);
2446 ::encode(stamp
, bl
);
2448 ::encode(reqid
, bl
);
2449 ::encode(master
, bl
);
2451 ::encode(origop
, bl
);
2452 ::encode(commit
, bl
, features
);
2453 ::encode(rollback
, bl
);
// Deserialize an ESlaveUpdate from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); fields read here include
// stamp, reqid, master, origop, commit and rollback.
2457 void ESlaveUpdate::decode(bufferlist::iterator
&bl
)
2459 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2461 ::decode(stamp
, bl
);
2463 ::decode(reqid
, bl
);
2464 ::decode(master
, bl
);
2466 ::decode(origop
, bl
);
2467 ::decode(commit
, bl
);
2468 ::decode(rollback
, bl
);
// Describe this event through Formatter f: a "metablob" object section, the
// length of the rollback buffer, the type string, the request id, and the
// master, op and original-op values as integers.
2472 void ESlaveUpdate::dump(Formatter
*f
) const
2474 f
->open_object_section("metablob");
2476 f
->close_section(); // metablob
2478 f
->dump_int("rollback length", rollback
.length());
2479 f
->dump_string("type", type
);
2480 f
->dump_stream("metareqid") << reqid
;
2481 f
->dump_int("master", master
);
2482 f
->dump_int("op", op
);
2483 f
->dump_int("original op", origop
);
// Append one default-constructed, heap-allocated ESlaveUpdate to ls.
2486 void ESlaveUpdate::generate_test_instances(list
<ESlaveUpdate
*>& ls
)
2488 ls
.push_back(new ESlaveUpdate());
// Re-apply a slave-side update record during journal replay, by operation:
//  - OP_PREPARE:  builds an MDSlaveUpdate holding origop and the rollback
//                 buffer (linked into the segment's slave_updates), replays
//                 the commit blob, and registers the update as uncommitted
//                 for (reqid, master).
//  - OP_COMMIT:   looks up the uncommitted update for (reqid, master) and
//                 finishes it; the alternative log message covers a commit
//                 with no previously saved prepare.
//  - OP_ROLLBACK: replays the commit blob (which the log message calls the
//                 rollback commit blob), then looks up and finishes the
//                 uncommitted update.
// An unrecognised op is reported to the cluster log; per the existing
// comment, the ceph_abort() after it should be unreachable.
2492 void ESlaveUpdate::replay(MDSRank
*mds
)
2496 case ESlaveUpdate::OP_PREPARE
:
2497 dout(10) << "ESlaveUpdate.replay prepare " << reqid
<< " for mds." << master
2498 << ": applying commit, saving rollback info" << dendl
;
2499 su
= new MDSlaveUpdate(origop
, rollback
, _segment
->slave_updates
);
2500 commit
.replay(mds
, _segment
, su
);
2501 mds
->mdcache
->add_uncommitted_slave_update(reqid
, master
, su
);
2504 case ESlaveUpdate::OP_COMMIT
:
2505 su
= mds
->mdcache
->get_uncommitted_slave_update(reqid
, master
);
2507 dout(10) << "ESlaveUpdate.replay commit " << reqid
<< " for mds." << master
<< dendl
;
2508 mds
->mdcache
->finish_uncommitted_slave_update(reqid
, master
);
2510 dout(10) << "ESlaveUpdate.replay commit " << reqid
<< " for mds." << master
2511 << ": ignoring, no previously saved prepare" << dendl
;
2515 case ESlaveUpdate::OP_ROLLBACK
:
2516 dout(10) << "ESlaveUpdate.replay abort " << reqid
<< " for mds." << master
2517 << ": applying rollback commit blob" << dendl
;
2518 commit
.replay(mds
, _segment
);
2519 su
= mds
->mdcache
->get_uncommitted_slave_update(reqid
, master
);
2521 mds
->mdcache
->finish_uncommitted_slave_update(reqid
, master
);
2525 mds
->clog
->error() << "invalid op in ESlaveUpdate";
2527 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2532 // -----------------------
// Serialize this ESubtreeMap into bl inside an ENCODE_START(6, 5, ...)
// section: stamp, metablob (with `features`), subtrees, ambiguous_subtrees,
// expire_pos and event_seq, in that order.
2535 void ESubtreeMap::encode(bufferlist
& bl
, uint64_t features
) const
2537 ENCODE_START(6, 5, bl
);
2538 ::encode(stamp
, bl
);
2539 ::encode(metablob
, bl
, features
);
2540 ::encode(subtrees
, bl
);
2541 ::encode(ambiguous_subtrees
, bl
);
2542 ::encode(expire_pos
, bl
);
2543 ::encode(event_seq
, bl
);
// Deserialize an ESubtreeMap from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); the fields read here mirror
// the order used by ESubtreeMap::encode.
2547 void ESubtreeMap::decode(bufferlist::iterator
&bl
)
2549 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl
);
2551 ::decode(stamp
, bl
);
2552 ::decode(metablob
, bl
);
2553 ::decode(subtrees
, bl
);
2555 ::decode(ambiguous_subtrees
, bl
);
2557 ::decode(expire_pos
, bl
);
2559 ::decode(event_seq
, bl
);
// Describe this event through Formatter f:
//  - a "metablob" object section;
//  - a "subtrees" array with one "tree" object per entry of subtrees, giving
//    the root dirfrag and each of its bound dirfrags;
//  - an "ambiguous subtrees" array of dirfrags;
//  - the expire position as an integer.
2563 void ESubtreeMap::dump(Formatter
*f
) const
2565 f
->open_object_section("metablob");
2567 f
->close_section(); // metablob
2569 f
->open_array_section("subtrees");
2570 for(map
<dirfrag_t
,vector
<dirfrag_t
> >::const_iterator i
= subtrees
.begin();
2571 i
!= subtrees
.end(); ++i
) {
2572 f
->open_object_section("tree");
2573 f
->dump_stream("root dirfrag") << i
->first
;
2574 for (vector
<dirfrag_t
>::const_iterator j
= i
->second
.begin();
2575 j
!= i
->second
.end(); ++j
) {
2576 f
->dump_stream("bound dirfrag") << *j
;
2578 f
->close_section(); // tree
2580 f
->close_section(); // subtrees
2582 f
->open_array_section("ambiguous subtrees");
2583 for(set
<dirfrag_t
>::const_iterator i
= ambiguous_subtrees
.begin();
2584 i
!= ambiguous_subtrees
.end(); ++i
) {
2585 f
->dump_stream("dirfrag") << *i
;
2587 f
->close_section(); // ambiguous subtrees
2589 f
->dump_int("expire position", expire_pos
);
// Append one default-constructed, heap-allocated ESubtreeMap to ls.
2592 void ESubtreeMap::generate_test_instances(list
<ESubtreeMap
*>& ls
)
2594 ls
.push_back(new ESubtreeMap());
// Re-apply a subtree map record during journal replay.
//
// First, the journaler's expire position is moved forward when this event
// carries a non-zero expire_pos larger than the journaler's current one.
//
// If the cache already has subtrees, the journaled map is only verified
// against the cache. For each journaled subtree this covers: the root being
// in cache, being a subtree there, and having this rank as first auth; each
// journaled bound being in cache and being a bound of that subtree; no extra
// bounds in cache; and the ambiguous-import state matching
// ambiguous_subtrees. Cache subtrees owned by this rank that the journal
// does not list are reported as well. Every mismatch goes to the cluster log
// as an error; when mds_debug_subtrees is set, the error count is asserted
// to be zero.
//
// Otherwise the (auth) subtree spanning tree is rebuilt: the metablob is
// replayed, each journaled subtree gets its bounded auth set (ambiguous ones
// are also recorded as ambiguous imports and given a (nodeid, nodeid)
// authority pair), auth bits are recalculated, and the result is shown.
2597 void ESubtreeMap::replay(MDSRank
*mds
)
2599 if (expire_pos
&& expire_pos
> mds
->mdlog
->journaler
->get_expire_pos())
2600 mds
->mdlog
->journaler
->set_expire_pos(expire_pos
);
2602 // suck up the subtree map?
2603 if (mds
->mdcache
->is_subtrees()) {
2604 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl
;
2607 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2608 p
!= subtrees
.end();
2610 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2612 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2613 << " subtree root " << p
->first
<< " not in cache";
2618 if (!mds
->mdcache
->is_subtree(dir
)) {
2619 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2620 << " subtree root " << p
->first
<< " not a subtree in cache";
2624 if (dir
->get_dir_auth().first
!= mds
->get_nodeid()) {
2625 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2626 << " subtree root " << p
->first
2627 << " is not mine in cache (it's " << dir
->get_dir_auth() << ")";
2632 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
2633 mds
->mdcache
->get_force_dirfrag(*q
, true);
2636 mds
->mdcache
->get_subtree_bounds(dir
, bounds
);
2637 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
2638 CDir
*b
= mds
->mdcache
->get_dirfrag(*q
);
2640 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2641 << " subtree " << p
->first
<< " bound " << *q
<< " not in cache";
2645 if (bounds
.count(b
) == 0) {
2646 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2647 << " subtree " << p
->first
<< " bound " << *q
<< " not a bound in cache";
2653 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
) {
2654 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2655 << " subtree " << p
->first
<< " has extra bound in cache " << (*q
)->dirfrag();
2659 if (ambiguous_subtrees
.count(p
->first
)) {
2660 if (!mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2661 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2662 << " subtree " << p
->first
<< " is ambiguous but is not in our cache";
2666 if (mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2667 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2668 << " subtree " << p
->first
<< " is not ambiguous but is in our cache";
2675 mds
->mdcache
->list_subtrees(subs
);
2676 for (list
<CDir
*>::iterator p
= subs
.begin(); p
!= subs
.end(); ++p
) {
2678 if (dir
->get_dir_auth().first
!= mds
->get_nodeid())
2680 if (subtrees
.count(dir
->dirfrag()) == 0) {
2681 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2682 << " does not include cache subtree " << dir
->dirfrag();
2688 dout(0) << "journal subtrees: " << subtrees
<< dendl
;
2689 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees
<< dendl
;
2690 mds
->mdcache
->show_subtrees();
2691 assert(!g_conf
->mds_debug_subtrees
|| errors
== 0);
2696 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl
;
2698 // first, stick the spanning tree in my cache
2699 //metablob.print(*_dout);
2700 metablob
.replay(mds
, _segment
);
2702 // restore import/export maps
2703 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2704 p
!= subtrees
.end();
2706 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2708 if (ambiguous_subtrees
.count(p
->first
)) {
2710 mds
->mdcache
->add_ambiguous_import(p
->first
, p
->second
);
2711 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
,
2712 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2715 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
, mds
->get_nodeid());
2719 mds
->mdcache
->recalc_auth_bits(true);
2721 mds
->mdcache
->show_subtrees();
2726 // -----------------------
// Re-apply a directory fragmentation record during journal replay.
// The inode is looked up first and may be absent from the cache (see the
// existing comment). The calls below cover three kinds of work:
//  - recording an uncommitted fragment for (ino, basefrag) and applying the
//    split/merge by `bits`;
//  - undoing it: leaves under basefrag are collected, then either bits is
//    negated (old-format events with empty orig_frags) or each original frag
//    is forced back, and the uncommitted fragment is rolled back;
//  - finishing an uncommitted fragment.
// NOTE(review): confirm which op value selects which of these paths.
// Finally the metablob is replayed and, when mds_debug_frag is set and the
// inode is known, its dirfrags are verified.
2729 void EFragment::replay(MDSRank
*mds
)
2731 dout(10) << "EFragment.replay " << op_name(op
) << " " << ino
<< " " << basefrag
<< " by " << bits
<< dendl
;
2733 list
<CDir
*> resultfrags
;
2734 list
<MDSInternalContextBase
*> waiters
;
2735 list
<frag_t
> old_frags
;
2737 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2738 // it will be once we replay the metablob , but first we need to
2739 // refragment anything we already have in the cache.
2740 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2744 mds
->mdcache
->add_uncommitted_fragment(dirfrag_t(ino
, basefrag
), bits
, orig_frags
, _segment
, &rollback
);
2747 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, bits
, resultfrags
, waiters
, true);
2752 in
->dirfragtree
.get_leaves_under(basefrag
, old_frags
);
2753 if (orig_frags
.empty()) {
2754 // old format EFragment
2755 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, -bits
, resultfrags
, waiters
, true);
2757 for (list
<frag_t
>::iterator p
= orig_frags
.begin(); p
!= orig_frags
.end(); ++p
)
2758 mds
->mdcache
->force_dir_fragment(in
, *p
);
2761 mds
->mdcache
->rollback_uncommitted_fragment(dirfrag_t(ino
, basefrag
), old_frags
);
2766 mds
->mdcache
->finish_uncommitted_fragment(dirfrag_t(ino
, basefrag
), op
);
2773 metablob
.replay(mds
, _segment
);
2774 if (in
&& g_conf
->mds_debug_frag
)
2775 in
->verify_dirfrags();
// Serialize this EFragment into bl inside an ENCODE_START(5, 4, ...)
// section. Fields written here include stamp, basefrag, metablob (with
// `features`), orig_frags and rollback.
2778 void EFragment::encode(bufferlist
&bl
, uint64_t features
) const {
2779 ENCODE_START(5, 4, bl
);
2780 ::encode(stamp
, bl
);
2783 ::encode(basefrag
, bl
);
2785 ::encode(metablob
, bl
, features
);
2786 ::encode(orig_frags
, bl
);
2787 ::encode(rollback
, bl
);
// Deserialize an EFragment from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl). orig_frags and rollback are
// read only when the encoded struct_v is at least 5, so older encodings
// leave them untouched.
2791 void EFragment::decode(bufferlist::iterator
&bl
) {
2792 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl
);
2794 ::decode(stamp
, bl
);
2798 ::decode(basefrag
, bl
);
2800 ::decode(metablob
, bl
);
2801 if (struct_v
>= 5) {
2802 ::decode(orig_frags
, bl
);
2803 ::decode(rollback
, bl
);
// Describe this event through Formatter f: the operation name, the inode,
// the base frag and the bit count. The metablob is not dumped; the
// corresponding code is commented out below.
2808 void EFragment::dump(Formatter
*f
) const
2810 /*f->open_object_section("Metablob");
2811 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2812 f->close_section();*/
2813 f
->dump_string("op", op_name(op
));
2814 f
->dump_stream("ino") << ino
;
2815 f
->dump_stream("base frag") << basefrag
;
2816 f
->dump_int("bits", bits
);
// Append two heap-allocated EFragment instances to ls: a default one, and a
// second one whose op is OP_PREPARE and whose bits is 5.
2819 void EFragment::generate_test_instances(list
<EFragment
*>& ls
)
2821 ls
.push_back(new EFragment
);
2822 ls
.push_back(new EFragment
);
2823 ls
.back()->op
= OP_PREPARE
;
2825 ls
.back()->bits
= 5;
// Serialize this dirfrag rollback record into bl inside an
// ENCODE_START(1, 1, ...) section; the fnode is written here.
2828 void dirfrag_rollback::encode(bufferlist
&bl
) const
2830 ENCODE_START(1, 1, bl
);
2831 ::encode(fnode
, bl
);
// Deserialize a dirfrag rollback record from bl inside a DECODE_START(1, ...)
// section; the fnode is read here.
2835 void dirfrag_rollback::decode(bufferlist::iterator
&bl
)
2837 DECODE_START(1, bl
);
2838 ::decode(fnode
, bl
);
2844 // =========================================================================
2846 // -----------------------
// Re-apply an export record during journal replay.
// Replays the metablob, looks up the exported base dirfrag, resolves each
// journaled bound to its cached CDir (collected in realbounds), sets the
// bounded subtree's authority to CDIR_AUTH_UNDEF, and then asks the cache to
// try trimming the now non-auth subtree.
2849 void EExport::replay(MDSRank
*mds
)
2851 dout(10) << "EExport.replay " << base
<< dendl
;
2852 metablob
.replay(mds
, _segment
);
2854 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2857 set
<CDir
*> realbounds
;
2858 for (set
<dirfrag_t
>::iterator p
= bounds
.begin();
2861 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2863 realbounds
.insert(bd
);
2867 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
, CDIR_AUTH_UNDEF
);
2869 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
// Serialize this EExport into bl inside an ENCODE_START(4, 3, ...) section.
// Fields written here include stamp, metablob (with `features`), bounds and
// target.
2872 void EExport::encode(bufferlist
& bl
, uint64_t features
) const
2874 ENCODE_START(4, 3, bl
);
2875 ::encode(stamp
, bl
);
2876 ::encode(metablob
, bl
, features
);
2878 ::encode(bounds
, bl
);
2879 ::encode(target
, bl
);
// Deserialize an EExport from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); fields read here include
// stamp, metablob, bounds and target.
// NOTE(review): encode() starts its section with version 4 while the decode
// macro is given 3 - confirm this is intended.
2883 void EExport::decode(bufferlist::iterator
&bl
)
2885 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2887 ::decode(stamp
, bl
);
2888 ::decode(metablob
, bl
);
2890 ::decode(bounds
, bl
);
2892 ::decode(target
, bl
);
// Describe this event through Formatter f: the stamp as a floating-point
// value, the base dirfrag, and a "bounds dirfrags" array with one "dirfrag"
// entry per bound. The metablob is not dumped; that code is commented out.
2896 void EExport::dump(Formatter
*f
) const
2898 f
->dump_float("stamp", (double)stamp
);
2899 /*f->open_object_section("Metablob");
2900 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2901 f->close_section();*/
2902 f
->dump_stream("base dirfrag") << base
;
2903 f
->open_array_section("bounds dirfrags");
2904 for (set
<dirfrag_t
>::const_iterator i
= bounds
.begin();
2905 i
!= bounds
.end(); ++i
) {
2906 f
->dump_stream("dirfrag") << *i
;
2908 f
->close_section(); // bounds dirfrags
// Append one default-constructed, heap-allocated EExport to ls.
2911 void EExport::generate_test_instances(list
<EExport
*>& ls
)
2913 EExport
*sample
= new EExport();
2914 ls
.push_back(sample
);
2918 // -----------------------
// Record this event's client map version (cmapv) as the owning log
// segment's sessionmapv.
2921 void EImportStart::update_segment()
2923 _segment
->sessionmapv
= cmapv
;
// Re-apply the start of an import during journal replay.
// Replays the metablob, records (base, bounds) as an ambiguous import, and
// gives the base subtree a (nodeid, nodeid) authority pair so it is not
// trimmed. While resolving the bounds, STATE_AUTH is cleared on any bound
// that is not a subtree root.
// Then the session map is compared with cmapv: per the log messages, an
// up-to-date map is a no-op; otherwise client sessions are opened. A version
// that still differs from cmapv afterwards is logged as a journal replay
// failure ahead of a ceph_abort() that the existing comment says should be
// unreachable. Finally the projected version is reset to the current one.
2926 void EImportStart::replay(MDSRank
*mds
)
2928 dout(10) << "EImportStart.replay " << base
<< " bounds " << bounds
<< dendl
;
2929 //metablob.print(*_dout);
2930 metablob
.replay(mds
, _segment
);
2932 // put in ambiguous import list
2933 mds
->mdcache
->add_ambiguous_import(base
, bounds
);
2935 // set auth partially to us so we don't trim it
2936 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2939 set
<CDir
*> realbounds
;
2940 for (vector
<dirfrag_t
>::iterator p
= bounds
.begin();
2943 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2945 if (!bd
->is_subtree_root())
2946 bd
->state_clear(CDir::STATE_AUTH
);
2947 realbounds
.insert(bd
);
2950 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
,
2951 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2953 // open client sessions?
2954 if (mds
->sessionmap
.get_version() >= cmapv
) {
2955 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2956 << " >= " << cmapv
<< ", noop" << dendl
;
2958 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2959 << " < " << cmapv
<< dendl
;
2960 map
<client_t
,entity_inst_t
> cm
;
2961 bufferlist::iterator blp
= client_map
.begin();
2963 mds
->sessionmap
.open_sessions(cm
);
2964 if (mds
->sessionmap
.get_version() != cmapv
)
2966 derr
<< "sessionmap version " << mds
->sessionmap
.get_version()
2967 << " != cmapv " << cmapv
<< dendl
;
2968 mds
->clog
->error() << "failure replaying journal (EImportStart)";
2970 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2972 mds
->sessionmap
.set_projected(mds
->sessionmap
.get_version());
// Serialize this EImportStart into bl inside an ENCODE_START(4, 3, ...)
// section. Fields written here include stamp, metablob (with `features`),
// bounds, cmapv and client_map.
2977 void EImportStart::encode(bufferlist
&bl
, uint64_t features
) const {
2978 ENCODE_START(4, 3, bl
);
2979 ::encode(stamp
, bl
);
2981 ::encode(metablob
, bl
, features
);
2982 ::encode(bounds
, bl
);
2983 ::encode(cmapv
, bl
);
2984 ::encode(client_map
, bl
);
// Deserialize an EImportStart from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); fields read here include
// stamp, metablob, bounds, cmapv and client_map.
// NOTE(review): encode() starts its section with version 4 while the decode
// macro is given 3 - confirm this is intended.
2989 void EImportStart::decode(bufferlist::iterator
&bl
) {
2990 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2992 ::decode(stamp
, bl
);
2994 ::decode(metablob
, bl
);
2995 ::decode(bounds
, bl
);
2996 ::decode(cmapv
, bl
);
2997 ::decode(client_map
, bl
);
// Describe this event through Formatter f: the base dirfrag and a
// "boundary dirfrags" array with one "frag" entry per bound.
3003 void EImportStart::dump(Formatter
*f
) const
3005 f
->dump_stream("base dirfrag") << base
;
3006 f
->open_array_section("boundary dirfrags");
3007 for (vector
<dirfrag_t
>::const_iterator iter
= bounds
.begin();
3008 iter
!= bounds
.end(); ++iter
) {
3009 f
->dump_stream("frag") << *iter
;
// Append one default-constructed, heap-allocated EImportStart to ls.
3014 void EImportStart::generate_test_instances(list
<EImportStart
*>& ls
)
3016 ls
.push_back(new EImportStart
);
3019 // -----------------------
// Re-apply the end of an import during journal replay.
// When base is a known ambiguous import, the import is either finished, or
// undone: its bounds are fetched, the bounded subtree's authority is set to
// CDIR_AUTH_UNDEF, the ambiguous import is cancelled and the subtree is
// offered for trimming.
// NOTE(review): presumably `success` selects between the two - confirm.
// A finish for a subtree not marked ambiguous is logged as a journal replay
// failure ahead of a ceph_abort() that the existing comment says should be
// unreachable.
3022 void EImportFinish::replay(MDSRank
*mds
)
3024 if (mds
->mdcache
->have_ambiguous_import(base
)) {
3025 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
<< dendl
;
3027 mds
->mdcache
->finish_ambiguous_import(base
);
3029 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
3031 vector
<dirfrag_t
> bounds
;
3032 mds
->mdcache
->get_ambiguous_import_bounds(base
, bounds
);
3033 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, bounds
, CDIR_AUTH_UNDEF
);
3034 mds
->mdcache
->cancel_ambiguous_import(dir
);
3035 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
3038 // this shouldn't happen unless this is an old journal
3039 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
3040 << " on subtree not marked as ambiguous"
3042 mds
->clog
->error() << "failure replaying journal (EImportFinish)";
3044 ceph_abort(); // Should be unreachable because damaged() calls respawn()
// Serialize this EImportFinish into bl inside an ENCODE_START(3, 3, ...)
// section. Fields written here include stamp and success.
3048 void EImportFinish::encode(bufferlist
& bl
, uint64_t features
) const
3050 ENCODE_START(3, 3, bl
);
3051 ::encode(stamp
, bl
);
3053 ::encode(success
, bl
);
// Deserialize an EImportFinish from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl); fields read here include
// stamp and success.
3057 void EImportFinish::decode(bufferlist::iterator
&bl
)
3059 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
3061 ::decode(stamp
, bl
);
3063 ::decode(success
, bl
);
// Describe this event through Formatter f: the base dirfrag and the success
// flag rendered as the string "true" or "false".
3067 void EImportFinish::dump(Formatter
*f
) const
3069 f
->dump_stream("base dirfrag") << base
;
3070 f
->dump_string("success", success
? "true" : "false");
// Append two heap-allocated EImportFinish instances to ls: a default one,
// and a second one with success set to true.
3072 void EImportFinish::generate_test_instances(list
<EImportFinish
*>& ls
)
3074 ls
.push_back(new EImportFinish
);
3075 ls
.push_back(new EImportFinish
);
3076 ls
.back()->success
= true;
3080 // ------------------------
// Serialize this EResetJournal into bl inside an ENCODE_START(2, 2, ...)
// section; the stamp is written here.
3083 void EResetJournal::encode(bufferlist
& bl
, uint64_t features
) const
3085 ENCODE_START(2, 2, bl
);
3086 ::encode(stamp
, bl
);
// Deserialize an EResetJournal from bl. The section is opened with
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); the stamp is read here.
3090 void EResetJournal::decode(bufferlist::iterator
&bl
)
3092 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
3093 ::decode(stamp
, bl
);
// Describe this event through Formatter f by streaming the stamp under the
// key "timestamp".
3097 void EResetJournal::dump(Formatter
*f
) const
3099 f
->dump_stream("timestamp") << stamp
;
// Append one default-constructed, heap-allocated EResetJournal to ls.
3102 void EResetJournal::generate_test_instances(list
<EResetJournal
*>& ls
)
3104 ls
.push_back(new EResetJournal());
// Re-apply a journal reset during replay.
// Wipes the session map and resets the inode table's replay state. If this
// rank is the one the MDS map names as root, the root inode's dirfrag is
// opened and its subtree authority set to this rank; the same is done for
// this rank's own ("my") directory. Auth bits are then recalculated and the
// resulting subtrees shown.
3107 void EResetJournal::replay(MDSRank
*mds
)
3109 dout(1) << "EResetJournal" << dendl
;
3111 mds
->sessionmap
.wipe();
3112 mds
->inotable
->replay_reset();
3114 if (mds
->mdsmap
->get_root() == mds
->get_nodeid()) {
3115 CDir
*rootdir
= mds
->mdcache
->get_root()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3116 mds
->mdcache
->adjust_subtree_auth(rootdir
, mds
->get_nodeid());
3119 CDir
*mydir
= mds
->mdcache
->get_myin()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3120 mds
->mdcache
->adjust_subtree_auth(mydir
, mds
->get_nodeid());
3122 mds
->mdcache
->recalc_auth_bits(true);
3124 mds
->mdcache
->show_subtrees();
// Serialize this ENoOp into bl inside an ENCODE_START(2, 2, ...) section:
// pad_size is written first, followed by a loop that runs pad_size times
// with the pad byte value 0xff in scope.
// NOTE(review): presumably each iteration appends one pad byte - confirm.
3128 void ENoOp::encode(bufferlist
&bl
, uint64_t features
) const
3130 ENCODE_START(2, 2, bl
);
3131 ::encode(pad_size
, bl
);
3132 uint8_t const pad
= 0xff;
3133 for (unsigned int i
= 0; i
< pad_size
; ++i
) {
// Deserialize an ENoOp from bl inside a DECODE_START(2, ...) section.
// After reading pad_size, the bytes remaining in the iterator must equal it
// exactly; otherwise buffer::end_of_buffer is thrown so that, as the
// existing comment explains, journal debug tools can recognise a malformed
// entry. The padding is then skipped by advancing the iterator.
3140 void ENoOp::decode(bufferlist::iterator
&bl
)
3142 DECODE_START(2, bl
);
3143 ::decode(pad_size
, bl
);
3144 if (bl
.get_remaining() != pad_size
) {
3145 // This is spiritually an assertion, but expressing in a way that will let
3146 // journal debug tools catch it and recognise a malformed entry.
3147 throw buffer::end_of_buffer();
3149 bl
.advance(pad_size
);
// Replay of a no-op event: only logs (at debug level 4) how many padding
// bytes were skipped in the journal.
3155 void ENoOp::replay(MDSRank
*mds
)
3157 dout(4) << "ENoOp::replay, " << pad_size
<< " bytes skipped in journal" << dendl
;
3161 * If re-formatting an old journal that used absolute log position
3162 * references as segment sequence numbers, use this function to update
3166 * MDSRank instance, just used for logging
3168 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3171 * True if the event was modified.
3173 bool EMetaBlob::rewrite_truncate_finish(MDSRank
const *mds
,
3174 std::map
<log_segment_seq_t
, log_segment_seq_t
> const &old_to_new
)
3176 bool modified
= false;
3177 map
<inodeno_t
, log_segment_seq_t
> new_trunc_finish
;
3178 for (std::map
<inodeno_t
, log_segment_seq_t
>::iterator i
= truncate_finish
.begin();
3179 i
!= truncate_finish
.end(); ++i
) {
3180 if (old_to_new
.count(i
->second
)) {
3181 dout(20) << __func__
<< " applying segment seq mapping "
3182 << i
->second
<< " -> " << old_to_new
.find(i
->second
)->second
<< dendl
;
3183 new_trunc_finish
[i
->first
] = old_to_new
.find(i
->second
)->second
;
3186 dout(20) << __func__
<< " no segment seq mapping found for "
3187 << i
->second
<< dendl
;
3188 new_trunc_finish
[i
->first
] = i
->second
;
3191 truncate_finish
= new_trunc_finish
;