1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
25 #include "events/EUpdate.h"
26 #include "events/ESlaveUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
30 #include "events/EExport.h"
31 #include "events/EImportStart.h"
32 #include "events/EImportFinish.h"
33 #include "events/EFragment.h"
35 #include "events/ETableClient.h"
36 #include "events/ETableServer.h"
38 #include "include/stringify.h"
40 #include "LogSegment.h"
50 #include "MDSTableClient.h"
51 #include "MDSTableServer.h"
55 #define dout_context g_ceph_context
56 #define dout_subsys ceph_subsys_mds
58 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
61 // -----------------------
64 void LogSegment::try_to_expire(MDSRank
*mds
, MDSGatherBuilder
&gather_bld
, int op_prio
)
68 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire" << dendl
;
70 assert(g_conf
->mds_kill_journal_expire_at
!= 1);
73 for (elist
<CDir
*>::iterator p
= new_dirfrags
.begin(); !p
.end(); ++p
) {
74 dout(20) << " new_dirfrag " << **p
<< dendl
;
75 assert((*p
)->is_auth());
78 for (elist
<CDir
*>::iterator p
= dirty_dirfrags
.begin(); !p
.end(); ++p
) {
79 dout(20) << " dirty_dirfrag " << **p
<< dendl
;
80 assert((*p
)->is_auth());
83 for (elist
<CDentry
*>::iterator p
= dirty_dentries
.begin(); !p
.end(); ++p
) {
84 dout(20) << " dirty_dentry " << **p
<< dendl
;
85 assert((*p
)->is_auth());
86 commit
.insert((*p
)->get_dir());
88 for (elist
<CInode
*>::iterator p
= dirty_inodes
.begin(); !p
.end(); ++p
) {
89 dout(20) << " dirty_inode " << **p
<< dendl
;
90 assert((*p
)->is_auth());
91 if ((*p
)->is_base()) {
92 (*p
)->store(gather_bld
.new_sub());
94 commit
.insert((*p
)->get_parent_dn()->get_dir());
97 if (!commit
.empty()) {
98 for (set
<CDir
*>::iterator p
= commit
.begin();
102 assert(dir
->is_auth());
103 if (dir
->can_auth_pin()) {
104 dout(15) << "try_to_expire committing " << *dir
<< dendl
;
105 dir
->commit(0, gather_bld
.new_sub(), false, op_prio
);
107 dout(15) << "try_to_expire waiting for unfreeze on " << *dir
<< dendl
;
108 dir
->add_waiter(CDir::WAIT_UNFREEZE
, gather_bld
.new_sub());
113 // master ops with possibly uncommitted slaves
114 for (set
<metareqid_t
>::iterator p
= uncommitted_masters
.begin();
115 p
!= uncommitted_masters
.end();
117 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p
<< dendl
;
118 mds
->mdcache
->wait_for_uncommitted_master(*p
, gather_bld
.new_sub());
121 // uncommitted fragments
122 for (set
<dirfrag_t
>::iterator p
= uncommitted_fragments
.begin();
123 p
!= uncommitted_fragments
.end();
125 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p
<< dendl
;
126 mds
->mdcache
->wait_for_uncommitted_fragment(*p
, gather_bld
.new_sub());
129 // nudge scatterlocks
130 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dir
.begin(); !p
.end(); ++p
) {
132 dout(10) << "try_to_expire waiting for dirlock flush on " << *in
<< dendl
;
133 mds
->locker
->scatter_nudge(&in
->filelock
, gather_bld
.new_sub());
135 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dirfragtree
.begin(); !p
.end(); ++p
) {
137 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in
<< dendl
;
138 mds
->locker
->scatter_nudge(&in
->dirfragtreelock
, gather_bld
.new_sub());
140 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_nest
.begin(); !p
.end(); ++p
) {
142 dout(10) << "try_to_expire waiting for nest flush on " << *in
<< dendl
;
143 mds
->locker
->scatter_nudge(&in
->nestlock
, gather_bld
.new_sub());
146 assert(g_conf
->mds_kill_journal_expire_at
!= 2);
148 // open files and snap inodes
149 if (!open_files
.empty()) {
150 assert(!mds
->mdlog
->is_capped()); // hmm FIXME
152 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
154 elist
<CInode
*>::iterator p
= open_files
.begin(member_offset(CInode
, item_open_file
));
158 if (in
->last
== CEPH_NOSNAP
&& in
->is_auth() &&
159 !in
->is_ambiguous_auth() && in
->is_any_caps()) {
160 if (in
->is_any_caps_wanted()) {
161 dout(20) << "try_to_expire requeueing open file " << *in
<< dendl
;
163 le
= new EOpen(mds
->mdlog
);
164 mds
->mdlog
->start_entry(le
);
166 le
->add_clean_inode(in
);
167 ls
->open_files
.push_back(&in
->item_open_file
);
169 // drop inodes that aren't wanted
170 dout(20) << "try_to_expire not requeueing and delisting unwanted file " << *in
<< dendl
;
171 in
->item_open_file
.remove_myself();
173 } else if (in
->last
!= CEPH_NOSNAP
&& !in
->client_snap_caps
.empty()) {
174 // journal snap inodes that need flush. This simplifies the mds failover handling
175 dout(20) << "try_to_expire requeueing snap needflush inode " << *in
<< dendl
;
177 le
= new EOpen(mds
->mdlog
);
178 mds
->mdlog
->start_entry(le
);
180 le
->add_clean_inode(in
);
181 ls
->open_files
.push_back(&in
->item_open_file
);
184 * we can get a capless inode here if we replay an open file, the client fails to
185 * reconnect it, but does REPLAY an open request (that adds it to the logseg). AFAICS
186 * it's ok for the client to replay an open on a file it doesn't have in its cache
189 * this makes the mds less sensitive to strict open_file consistency, although it does
190 * make it easier to miss subtle problems.
192 dout(20) << "try_to_expire not requeueing and delisting capless file " << *in
<< dendl
;
193 in
->item_open_file
.remove_myself();
197 mds
->mdlog
->submit_entry(le
);
198 mds
->mdlog
->wait_for_safe(gather_bld
.new_sub());
199 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl
;
203 assert(g_conf
->mds_kill_journal_expire_at
!= 3);
205 // backtraces to be stored/updated
206 for (elist
<CInode
*>::iterator p
= dirty_parent_inodes
.begin(); !p
.end(); ++p
) {
208 assert(in
->is_auth());
209 if (in
->can_auth_pin()) {
210 dout(15) << "try_to_expire waiting for storing backtrace on " << *in
<< dendl
;
211 in
->store_backtrace(gather_bld
.new_sub(), op_prio
);
213 dout(15) << "try_to_expire waiting for unfreeze on " << *in
<< dendl
;
214 in
->add_waiter(CInode::WAIT_UNFREEZE
, gather_bld
.new_sub());
218 assert(g_conf
->mds_kill_journal_expire_at
!= 4);
221 for (elist
<MDSlaveUpdate
*>::iterator p
= slave_updates
.begin(member_offset(MDSlaveUpdate
,
224 MDSlaveUpdate
*su
= *p
;
225 dout(10) << "try_to_expire waiting on slave update " << su
<< dendl
;
226 assert(su
->waiter
== 0);
227 su
->waiter
= gather_bld
.new_sub();
231 if (inotablev
> mds
->inotable
->get_committed_version()) {
232 dout(10) << "try_to_expire saving inotable table, need " << inotablev
233 << ", committed is " << mds
->inotable
->get_committed_version()
234 << " (" << mds
->inotable
->get_committing_version() << ")"
236 mds
->inotable
->save(gather_bld
.new_sub(), inotablev
);
240 if (sessionmapv
> mds
->sessionmap
.get_committed()) {
241 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
242 << ", committed is " << mds
->sessionmap
.get_committed()
243 << " (" << mds
->sessionmap
.get_committing() << ")"
245 mds
->sessionmap
.save(gather_bld
.new_sub(), sessionmapv
);
248 // updates to sessions for completed_requests
249 mds
->sessionmap
.save_if_dirty(touched_sessions
, &gather_bld
);
250 touched_sessions
.clear();
252 // pending commit atids
253 for (map
<int, ceph::unordered_set
<version_t
> >::iterator p
= pending_commit_tids
.begin();
254 p
!= pending_commit_tids
.end();
256 MDSTableClient
*client
= mds
->get_table_client(p
->first
);
258 for (ceph::unordered_set
<version_t
>::iterator q
= p
->second
.begin();
259 q
!= p
->second
.end();
261 dout(10) << "try_to_expire " << get_mdstable_name(p
->first
) << " transaction " << *q
262 << " pending commit (not yet acked), waiting" << dendl
;
263 assert(!client
->has_committed(*q
));
264 client
->wait_for_ack(*q
, gather_bld
.new_sub());
269 for (map
<int, version_t
>::iterator p
= tablev
.begin();
272 MDSTableServer
*server
= mds
->get_table_server(p
->first
);
274 if (p
->second
> server
->get_committed_version()) {
275 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p
->first
)
276 << " to save, need " << p
->second
<< dendl
;
277 server
->save(gather_bld
.new_sub());
282 for (set
<CInode
*>::iterator p
= truncating_inodes
.begin();
283 p
!= truncating_inodes
.end();
285 dout(10) << "try_to_expire waiting for truncate of " << **p
<< dendl
;
286 (*p
)->add_waiter(CInode::WAIT_TRUNC
, gather_bld
.new_sub());
289 if (gather_bld
.has_subs()) {
290 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire waiting" << dendl
;
293 assert(g_conf
->mds_kill_journal_expire_at
!= 5);
294 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire success" << dendl
;
299 // -----------------------
302 EMetaBlob::EMetaBlob(MDLog
*mdlog
) : opened_ino(0), renamed_dirino(0),
303 inotablev(0), sessionmapv(0), allocated_ino(0),
304 last_subtree_map(0), event_seq(0)
307 void EMetaBlob::add_dir_context(CDir
*dir
, int mode
)
309 MDSRank
*mds
= dir
->cache
->mds
;
311 list
<CDentry
*> parents
;
313 // it may be okay not to include the maybe items, if
314 // - we journaled the maybe child inode in this segment
315 // - that subtree turns out to be unambiguously auth
316 list
<CDentry
*> maybe
;
317 bool maybenot
= false;
320 // already have this dir? (we must always add in order)
321 if (lump_map
.count(dir
->dirfrag())) {
322 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") have lump " << dir
->dirfrag() << dendl
;
326 // stop at root/stray
327 CInode
*diri
= dir
->get_inode();
328 CDentry
*parent
= diri
->get_projected_parent_dn();
330 if (mode
== TO_AUTH_SUBTREE_ROOT
) {
332 if (dir
->is_subtree_root()) {
333 // match logic in MDCache::create_subtree_map()
334 if (dir
->get_dir_auth().first
== mds
->get_nodeid()) {
335 mds_authority_t parent_auth
= parent
? parent
->authority() : CDIR_AUTH_UNDEF
;
336 if (parent_auth
.first
== dir
->get_dir_auth().first
) {
337 if (parent_auth
.second
== CDIR_AUTH_UNKNOWN
&&
338 !dir
->is_ambiguous_dir_auth() &&
339 !dir
->state_test(CDir::STATE_EXPORTBOUND
) &&
340 !dir
->state_test(CDir::STATE_AUXSUBTREE
) &&
341 !diri
->state_test(CInode::STATE_AMBIGUOUSAUTH
)) {
342 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir
<< dendl
;
345 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") ambiguous or transient subtree " << dendl
;
347 // it's an auth subtree, we don't need maybe (if any), and we're done.
348 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached unambig auth subtree, don't need " << maybe
349 << " at " << *dir
<< dendl
;
354 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached ambig or !auth subtree, need " << maybe
355 << " at " << *dir
<< dendl
;
356 // we need the maybe list after all!
357 parents
.splice(parents
.begin(), maybe
);
362 // was the inode journaled in this blob?
363 if (event_seq
&& diri
->last_journaled
== event_seq
) {
364 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri this blob " << *diri
<< dendl
;
368 // have we journaled this inode since the last subtree map?
369 if (!maybenot
&& last_subtree_map
&& diri
->last_journaled
>= last_subtree_map
) {
370 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri in this segment ("
371 << diri
->last_journaled
<< " >= " << last_subtree_map
<< "), setting maybenot flag "
381 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") maybe " << *parent
<< dendl
;
382 maybe
.push_front(parent
);
384 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") definitely " << *parent
<< dendl
;
385 parents
.push_front(parent
);
388 dir
= parent
->get_dir();
391 parents
.splice(parents
.begin(), maybe
);
393 dout(20) << "EMetaBlob::add_dir_context final: " << parents
<< dendl
;
394 for (list
<CDentry
*>::iterator p
= parents
.begin(); p
!= parents
.end(); ++p
) {
395 assert((*p
)->get_projected_linkage()->is_primary());
396 add_dentry(*p
, false);
400 void EMetaBlob::update_segment(LogSegment
*ls
)
402 // dirty inode mtimes
403 // -> handled directly by Server.cc, replay()
405 // alloc table update?
407 ls
->inotablev
= inotablev
;
409 ls
->sessionmapv
= sessionmapv
;
412 // -> handled directly by Server.cc
415 // note the newest request per client
416 //if (!client_reqs.empty())
417 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
420 // EMetaBlob::fullbit
422 void EMetaBlob::fullbit::encode(bufferlist
& bl
, uint64_t features
) const {
423 ENCODE_START(8, 5, bl
);
425 ::encode(dnfirst
, bl
);
426 ::encode(dnlast
, bl
);
428 ::encode(inode
, bl
, features
);
429 ::encode(xattrs
, bl
);
430 if (inode
.is_symlink())
431 ::encode(symlink
, bl
);
432 if (inode
.is_dir()) {
433 ::encode(dirfragtree
, bl
);
434 ::encode(snapbl
, bl
);
437 if (old_inodes
.empty()) {
441 ::encode(old_inodes
, bl
, features
);
444 ::encode(snapbl
, bl
);
445 ::encode(oldest_snap
, bl
);
449 void EMetaBlob::fullbit::decode(bufferlist::iterator
&bl
) {
450 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
452 ::decode(dnfirst
, bl
);
453 ::decode(dnlast
, bl
);
456 ::decode(xattrs
, bl
);
457 if (inode
.is_symlink())
458 ::decode(symlink
, bl
);
459 if (inode
.is_dir()) {
460 ::decode(dirfragtree
, bl
);
461 ::decode(snapbl
, bl
);
462 if ((struct_v
== 2) || (struct_v
== 3)) {
463 bool dir_layout_exists
;
464 ::decode(dir_layout_exists
, bl
);
465 if (dir_layout_exists
) {
467 ::decode(dir_struct_v
, bl
); // default_file_layout version
468 ::decode(inode
.layout
, bl
); // and actual layout, that we care about
477 state
= dirty
? EMetaBlob::fullbit::STATE_DIRTY
: 0;
481 bool old_inodes_present
;
482 ::decode(old_inodes_present
, bl
);
483 if (old_inodes_present
) {
484 ::decode(old_inodes
, bl
);
487 if (!inode
.is_dir()) {
489 ::decode(snapbl
, bl
);
492 ::decode(oldest_snap
, bl
);
494 oldest_snap
= CEPH_NOSNAP
;
499 void EMetaBlob::fullbit::dump(Formatter
*f
) const
501 f
->dump_string("dentry", dn
);
502 f
->dump_stream("snapid.first") << dnfirst
;
503 f
->dump_stream("snapid.last") << dnlast
;
504 f
->dump_int("dentry version", dnv
);
505 f
->open_object_section("inode");
507 f
->close_section(); // inode
508 f
->open_object_section("xattrs");
509 for (const auto &p
: xattrs
) {
510 std::string
s(p
.second
.c_str(), p
.second
.length());
511 f
->dump_string(p
.first
.c_str(), s
);
513 f
->close_section(); // xattrs
514 if (inode
.is_symlink()) {
515 f
->dump_string("symlink", symlink
);
517 if (inode
.is_dir()) {
518 f
->dump_stream("frag tree") << dirfragtree
;
519 f
->dump_string("has_snapbl", snapbl
.length() ? "true" : "false");
520 if (inode
.has_layout()) {
521 f
->open_object_section("file layout policy");
523 f
->dump_string("layout", "the layout exists");
524 f
->close_section(); // file layout policy
527 f
->dump_string("state", state_string());
528 if (!old_inodes
.empty()) {
529 f
->open_array_section("old inodes");
530 for (const auto &p
: old_inodes
) {
531 f
->open_object_section("inode");
532 f
->dump_int("snapid", p
.first
);
534 f
->close_section(); // inode
536 f
->close_section(); // old inodes
540 void EMetaBlob::fullbit::generate_test_instances(list
<EMetaBlob::fullbit
*>& ls
)
542 CInode::mempool_inode inode
;
544 CInode::mempool_xattr_map empty_xattrs
;
545 bufferlist empty_snapbl
;
546 fullbit
*sample
= new fullbit("/testdn", 0, 0, 0,
547 inode
, fragtree
, empty_xattrs
, "", 0, empty_snapbl
,
549 ls
.push_back(sample
);
552 void EMetaBlob::fullbit::update_inode(MDSRank
*mds
, CInode
*in
)
556 in
->maybe_export_pin();
557 if (in
->inode
.is_dir()) {
558 if (!(in
->dirfragtree
== dirfragtree
)) {
559 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in
->dirfragtree
<< " -> "
560 << dirfragtree
<< " on " << *in
<< dendl
;
561 in
->dirfragtree
= dirfragtree
;
562 in
->force_dirfrags();
563 if (in
->has_dirfrags() && in
->authority() == CDIR_AUTH_UNDEF
) {
565 in
->get_nested_dirfrags(ls
);
566 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
568 if (dir
->get_num_any() == 0 &&
569 mds
->mdcache
->can_trim_non_auth_dirfrag(dir
)) {
570 dout(10) << " closing empty non-auth dirfrag " << *dir
<< dendl
;
571 in
->close_dirfrag(dir
->get_frag());
576 } else if (in
->inode
.is_symlink()) {
577 in
->symlink
= mempool::mds_co::string(boost::string_view(symlink
));
579 in
->old_inodes
= old_inodes
;
580 if (!in
->old_inodes
.empty()) {
581 snapid_t min_first
= in
->old_inodes
.rbegin()->first
+ 1;
582 if (min_first
> in
->first
)
583 in
->first
= min_first
;
587 * we can do this before linking the inode because the split_at would
588 * be a no-op.. we have no children (namely open snaprealms) to
591 in
->oldest_snap
= oldest_snap
;
592 in
->decode_snap_blob(snapbl
);
595 * In case there was anything malformed in the journal that we are
596 * replaying, do sanity checks on the inodes we're replaying and
597 * go damaged instead of letting any trash into a live cache
600 // Files must have valid layouts with a pool set
601 if (in
->inode
.layout
.pool_id
== -1 || !in
->inode
.layout
.is_valid()) {
602 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
603 << ": " << in
->inode
.layout
<< dendl
;
604 std::ostringstream oss
;
605 oss
<< "Invalid layout for inode 0x" << std::hex
<< in
->inode
.ino
606 << std::dec
<< " in journal";
607 mds
->clog
->error() << oss
.str();
609 ceph_abort(); // Should be unreachable because damaged() calls respawn()
614 // EMetaBlob::remotebit
616 void EMetaBlob::remotebit::encode(bufferlist
& bl
) const
618 ENCODE_START(2, 2, bl
);
620 ::encode(dnfirst
, bl
);
621 ::encode(dnlast
, bl
);
624 ::encode(d_type
, bl
);
629 void EMetaBlob::remotebit::decode(bufferlist::iterator
&bl
)
631 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
633 ::decode(dnfirst
, bl
);
634 ::decode(dnlast
, bl
);
637 ::decode(d_type
, bl
);
642 void EMetaBlob::remotebit::dump(Formatter
*f
) const
644 f
->dump_string("dentry", dn
);
645 f
->dump_int("snapid.first", dnfirst
);
646 f
->dump_int("snapid.last", dnlast
);
647 f
->dump_int("dentry version", dnv
);
648 f
->dump_int("inodeno", ino
);
649 uint32_t type
= DTTOIF(d_type
) & S_IFMT
; // convert to type entries
653 type_string
= "file"; break;
655 type_string
= "symlink"; break;
657 type_string
= "directory"; break;
659 type_string
= "fifo"; break;
661 type_string
= "chr"; break;
663 type_string
= "blk"; break;
665 type_string
= "sock"; break;
667 assert (0 == "unknown d_type!");
669 f
->dump_string("d_type", type_string
);
670 f
->dump_string("dirty", dirty
? "true" : "false");
673 void EMetaBlob::remotebit::
674 generate_test_instances(list
<EMetaBlob::remotebit
*>& ls
)
676 remotebit
*remote
= new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG
), false);
677 ls
.push_back(remote
);
680 // EMetaBlob::nullbit
682 void EMetaBlob::nullbit::encode(bufferlist
& bl
) const
684 ENCODE_START(2, 2, bl
);
686 ::encode(dnfirst
, bl
);
687 ::encode(dnlast
, bl
);
693 void EMetaBlob::nullbit::decode(bufferlist::iterator
&bl
)
695 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
697 ::decode(dnfirst
, bl
);
698 ::decode(dnlast
, bl
);
704 void EMetaBlob::nullbit::dump(Formatter
*f
) const
706 f
->dump_string("dentry", dn
);
707 f
->dump_int("snapid.first", dnfirst
);
708 f
->dump_int("snapid.last", dnlast
);
709 f
->dump_int("dentry version", dnv
);
710 f
->dump_string("dirty", dirty
? "true" : "false");
713 void EMetaBlob::nullbit::generate_test_instances(list
<nullbit
*>& ls
)
715 nullbit
*sample
= new nullbit("/test/dentry", 0, 10, 15, false);
716 nullbit
*sample2
= new nullbit("/test/dirty", 10, 20, 25, true);
717 ls
.push_back(sample
);
718 ls
.push_back(sample2
);
721 // EMetaBlob::dirlump
723 void EMetaBlob::dirlump::encode(bufferlist
& bl
, uint64_t features
) const
725 ENCODE_START(2, 2, bl
);
729 ::encode(nremote
, bl
);
731 _encode_bits(features
);
736 void EMetaBlob::dirlump::decode(bufferlist::iterator
&bl
)
738 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
)
742 ::decode(nremote
, bl
);
745 dn_decoded
= false; // don't decode bits unless we need them.
749 void EMetaBlob::dirlump::dump(Formatter
*f
) const
752 dirlump
*me
= const_cast<dirlump
*>(this);
755 f
->open_object_section("fnode");
757 f
->close_section(); // fnode
758 f
->dump_string("state", state_string());
759 f
->dump_int("nfull", nfull
);
760 f
->dump_int("nremote", nremote
);
761 f
->dump_int("nnull", nnull
);
763 f
->open_array_section("full bits");
764 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
765 iter
= dfull
.begin(); iter
!= dfull
.end(); ++iter
) {
766 f
->open_object_section("fullbit");
768 f
->close_section(); // fullbit
770 f
->close_section(); // full bits
771 f
->open_array_section("remote bits");
772 for (list
<remotebit
>::const_iterator
773 iter
= dremote
.begin(); iter
!= dremote
.end(); ++iter
) {
774 f
->open_object_section("remotebit");
776 f
->close_section(); // remotebit
778 f
->close_section(); // remote bits
779 f
->open_array_section("null bits");
780 for (list
<nullbit
>::const_iterator
781 iter
= dnull
.begin(); iter
!= dnull
.end(); ++iter
) {
782 f
->open_object_section("null bit");
784 f
->close_section(); // null bit
786 f
->close_section(); // null bits
789 void EMetaBlob::dirlump::generate_test_instances(list
<dirlump
*>& ls
)
791 ls
.push_back(new dirlump());
797 void EMetaBlob::encode(bufferlist
& bl
, uint64_t features
) const
799 ENCODE_START(8, 5, bl
);
800 ::encode(lump_order
, bl
);
801 ::encode(lump_map
, bl
, features
);
802 ::encode(roots
, bl
, features
);
803 ::encode(table_tids
, bl
);
804 ::encode(opened_ino
, bl
);
805 ::encode(allocated_ino
, bl
);
806 ::encode(used_preallocated_ino
, bl
);
807 ::encode(preallocated_inos
, bl
);
808 ::encode(client_name
, bl
);
809 ::encode(inotablev
, bl
);
810 ::encode(sessionmapv
, bl
);
811 ::encode(truncate_start
, bl
);
812 ::encode(truncate_finish
, bl
);
813 ::encode(destroyed_inodes
, bl
);
814 ::encode(client_reqs
, bl
);
815 ::encode(renamed_dirino
, bl
);
816 ::encode(renamed_dir_frags
, bl
);
818 // make MDSRank use v6 format happy
824 ::encode(client_flushes
, bl
);
827 void EMetaBlob::decode(bufferlist::iterator
&bl
)
829 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
830 ::decode(lump_order
, bl
);
831 ::decode(lump_map
, bl
);
836 ::decode(rootbl
, bl
);
837 if (rootbl
.length()) {
838 bufferlist::iterator p
= rootbl
.begin();
839 roots
.push_back(ceph::shared_ptr
<fullbit
>(new fullbit(p
)));
842 ::decode(table_tids
, bl
);
843 ::decode(opened_ino
, bl
);
844 ::decode(allocated_ino
, bl
);
845 ::decode(used_preallocated_ino
, bl
);
846 ::decode(preallocated_inos
, bl
);
847 ::decode(client_name
, bl
);
848 ::decode(inotablev
, bl
);
849 ::decode(sessionmapv
, bl
);
850 ::decode(truncate_start
, bl
);
851 ::decode(truncate_finish
, bl
);
852 ::decode(destroyed_inodes
, bl
);
854 ::decode(client_reqs
, bl
);
859 client_reqs
.push_back(pair
<metareqid_t
,uint64_t>(r
.front(), 0));
864 ::decode(renamed_dirino
, bl
);
865 ::decode(renamed_dir_frags
, bl
);
875 ::decode(client_flushes
, bl
);
882 * Get all inodes touched by this metablob. Includes the 'bits' within
883 * dirlumps, and the inodes of the dirs themselves.
885 void EMetaBlob::get_inodes(
886 std::set
<inodeno_t
> &inodes
) const
888 // For all dirlumps in this metablob
889 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
890 // Record inode of dirlump
891 inodeno_t
const dir_ino
= i
->first
.ino
;
892 inodes
.insert(dir_ino
);
894 // Decode dirlump bits
895 dirlump
const &dl
= i
->second
;
898 // Record inodes of fullbits
899 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
900 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
901 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
902 inodes
.insert((*iter
)->inode
.ino
);
905 // Record inodes of remotebits
906 list
<remotebit
> const &rb_list
= dl
.get_dremote();
907 for (list
<remotebit
>::const_iterator
908 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
909 inodes
.insert(iter
->ino
);
916 * Get a map of dirfrag to set of dentries in that dirfrag which are
917 * touched in this operation.
919 void EMetaBlob::get_dentries(std::map
<dirfrag_t
, std::set
<std::string
> > &dentries
) const
921 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
922 dirlump
const &dl
= i
->second
;
923 dirfrag_t
const &df
= i
->first
;
927 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
928 list
<nullbit
> const &nb_list
= dl
.get_dnull();
929 list
<remotebit
> const &rb_list
= dl
.get_dremote();
931 // For all bits, store dentry
932 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
933 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
934 dentries
[df
].insert((*iter
)->dn
);
937 for (list
<nullbit
>::const_iterator
938 iter
= nb_list
.begin(); iter
!= nb_list
.end(); ++iter
) {
939 dentries
[df
].insert(iter
->dn
);
941 for (list
<remotebit
>::const_iterator
942 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
943 dentries
[df
].insert(iter
->dn
);
951 * Calculate all paths that we can infer are touched by this metablob. Only uses
952 * information local to this metablob so it may only be the path within the
955 void EMetaBlob::get_paths(
956 std::vector
<std::string
> &paths
) const
958 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
959 typedef std::pair
<inodeno_t
, std::string
> Location
;
961 // Whenever we see a dentry within a dirlump, we remember it as a child of
962 // the dirlump's inode
963 std::map
<inodeno_t
, std::list
<std::string
> > children
;
965 // Whenever we see a location for an inode, remember it: this allows us to
966 // build a path given an inode
967 std::map
<inodeno_t
, Location
> ino_locations
;
969 // Special case: operations on root inode populate roots but not dirlumps
970 if (lump_map
.empty() && !roots
.empty()) {
971 paths
.push_back("/");
977 // Build a tiny local metadata cache for the path structure in this metablob
978 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
979 inodeno_t
const dir_ino
= i
->first
.ino
;
980 dirlump
const &dl
= i
->second
;
983 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
984 list
<nullbit
> const &nb_list
= dl
.get_dnull();
985 list
<remotebit
> const &rb_list
= dl
.get_dremote();
987 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
988 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
989 boost::string_view dentry
= (*iter
)->dn
;
990 children
[dir_ino
].emplace_back(dentry
);
991 ino_locations
[(*iter
)->inode
.ino
] = Location(dir_ino
, std::string(dentry
));
994 for (list
<nullbit
>::const_iterator
995 iter
= nb_list
.begin(); iter
!= nb_list
.end(); ++iter
) {
996 boost::string_view dentry
= iter
->dn
;
997 children
[dir_ino
].emplace_back(dentry
);
1000 for (list
<remotebit
>::const_iterator
1001 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
1002 boost::string_view dentry
= iter
->dn
;
1003 children
[dir_ino
].emplace_back(dentry
);
1007 std::vector
<Location
> leaf_locations
;
1011 // Output paths for all childless nodes in the metablob
1012 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
1013 inodeno_t
const dir_ino
= i
->first
.ino
;
1014 dirlump
const &dl
= i
->second
;
1017 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
1018 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
1019 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
1020 std::string
dentry((*iter
)->dn
);
1021 children
[dir_ino
].push_back(dentry
);
1022 ino_locations
[(*iter
)->inode
.ino
] = Location(dir_ino
, std::string(dentry
));
1023 if (children
.find((*iter
)->inode
.ino
) == children
.end()) {
1024 leaf_locations
.push_back(Location(dir_ino
, std::string(dentry
)));
1029 list
<nullbit
> const &nb_list
= dl
.get_dnull();
1030 for (list
<nullbit
>::const_iterator
1031 iter
= nb_list
.begin(); iter
!= nb_list
.end(); ++iter
) {
1032 boost::string_view dentry
= iter
->dn
;
1033 leaf_locations
.push_back(Location(dir_ino
, std::string(dentry
)));
1036 list
<remotebit
> const &rb_list
= dl
.get_dremote();
1037 for (list
<remotebit
>::const_iterator
1038 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
1039 boost::string_view dentry
= iter
->dn
;
1040 leaf_locations
.push_back(Location(dir_ino
, std::string(dentry
)));
1044 // For all the leaf locations identified, generate paths
1045 for (std::vector
<Location
>::iterator i
= leaf_locations
.begin(); i
!= leaf_locations
.end(); ++i
) {
1046 Location
const &loc
= *i
;
1047 std::string path
= loc
.second
;
1048 inodeno_t ino
= loc
.first
;
1049 while(ino_locations
.find(ino
) != ino_locations
.end()) {
1050 Location
const &loc
= ino_locations
[ino
];
1051 if (!path
.empty()) {
1052 path
= loc
.second
+ "/" + path
;
1054 path
= loc
.second
+ path
;
1059 paths
.push_back(path
);
1064 void EMetaBlob::dump(Formatter
*f
) const
1066 f
->open_array_section("lumps");
1067 for (list
<dirfrag_t
>::const_iterator i
= lump_order
.begin();
1068 i
!= lump_order
.end(); ++i
) {
1069 f
->open_object_section("lump");
1070 f
->open_object_section("dirfrag");
1071 f
->dump_stream("dirfrag") << *i
;
1072 f
->close_section(); // dirfrag
1073 f
->open_object_section("dirlump");
1074 lump_map
.at(*i
).dump(f
);
1075 f
->close_section(); // dirlump
1076 f
->close_section(); // lump
1078 f
->close_section(); // lumps
1080 f
->open_array_section("roots");
1081 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator i
= roots
.begin();
1082 i
!= roots
.end(); ++i
) {
1083 f
->open_object_section("root");
1085 f
->close_section(); // root
1087 f
->close_section(); // roots
1089 f
->open_array_section("tableclient tranactions");
1090 for (list
<pair
<__u8
,version_t
> >::const_iterator i
= table_tids
.begin();
1091 i
!= table_tids
.end(); ++i
) {
1092 f
->open_object_section("transaction");
1093 f
->dump_int("tid", i
->first
);
1094 f
->dump_int("version", i
->second
);
1095 f
->close_section(); // transaction
1097 f
->close_section(); // tableclient transactions
1099 f
->dump_int("renamed directory inodeno", renamed_dirino
);
1101 f
->open_array_section("renamed directory fragments");
1102 for (list
<frag_t
>::const_iterator i
= renamed_dir_frags
.begin();
1103 i
!= renamed_dir_frags
.end(); ++i
) {
1104 f
->dump_int("frag", *i
);
1106 f
->close_section(); // renamed directory fragments
1108 f
->dump_int("inotable version", inotablev
);
1109 f
->dump_int("SessionMap version", sessionmapv
);
1110 f
->dump_int("allocated ino", allocated_ino
);
1112 f
->dump_stream("preallocated inos") << preallocated_inos
;
1113 f
->dump_int("used preallocated ino", used_preallocated_ino
);
1115 f
->open_object_section("client name");
1116 client_name
.dump(f
);
1117 f
->close_section(); // client name
1119 f
->open_array_section("inodes starting a truncate");
1120 for(list
<inodeno_t
>::const_iterator i
= truncate_start
.begin();
1121 i
!= truncate_start
.end(); ++i
) {
1122 f
->dump_int("inodeno", *i
);
1124 f
->close_section(); // truncate inodes
1125 f
->open_array_section("inodes finishing a truncated");
1126 for(map
<inodeno_t
,uint64_t>::const_iterator i
= truncate_finish
.begin();
1127 i
!= truncate_finish
.end(); ++i
) {
1128 f
->open_object_section("inode+segment");
1129 f
->dump_int("inodeno", i
->first
);
1130 f
->dump_int("truncate starting segment", i
->second
);
1131 f
->close_section(); // truncated inode
1133 f
->close_section(); // truncate finish inodes
1135 f
->open_array_section("destroyed inodes");
1136 for(vector
<inodeno_t
>::const_iterator i
= destroyed_inodes
.begin();
1137 i
!= destroyed_inodes
.end(); ++i
) {
1138 f
->dump_int("inodeno", *i
);
1140 f
->close_section(); // destroyed inodes
1142 f
->open_array_section("client requests");
1143 for(list
<pair
<metareqid_t
,uint64_t> >::const_iterator i
= client_reqs
.begin();
1144 i
!= client_reqs
.end(); ++i
) {
1145 f
->open_object_section("Client request");
1146 f
->dump_stream("request ID") << i
->first
;
1147 f
->dump_int("oldest request on client", i
->second
);
1148 f
->close_section(); // request
1150 f
->close_section(); // client requests
1153 void EMetaBlob::generate_test_instances(list
<EMetaBlob
*>& ls
)
1155 ls
.push_back(new EMetaBlob());
1158 void EMetaBlob::replay(MDSRank
*mds
, LogSegment
*logseg
, MDSlaveUpdate
*slaveup
)
1160 dout(10) << "EMetaBlob.replay " << lump_map
.size() << " dirlumps by " << client_name
<< dendl
;
1164 assert(g_conf
->mds_kill_journal_replay_at
!= 1);
1166 for (list
<ceph::shared_ptr
<fullbit
> >::iterator p
= roots
.begin(); p
!= roots
.end(); ++p
) {
1167 CInode
*in
= mds
->mdcache
->get_inode((*p
)->inode
.ino
);
1168 bool isnew
= in
? false:true;
1170 in
= new CInode(mds
->mdcache
, false);
1171 (*p
)->update_inode(mds
, in
);
1174 mds
->mdcache
->add_inode(in
);
1175 if ((*p
)->is_dirty()) in
->_mark_dirty(logseg
);
1176 dout(10) << "EMetaBlob.replay " << (isnew
? " added root ":" updated root ") << *in
<< dendl
;
1179 CInode
*renamed_diri
= 0;
1181 if (renamed_dirino
) {
1182 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1184 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri
<< dendl
;
1186 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino
<< dendl
;
1189 for (list
<dirfrag_t
>::iterator lp
= lump_order
.begin(); lp
!= lump_order
.end(); ++lp
) {
1190 dirlump
&lump
= lump_map
[*lp
];
1192 dout(10) << "EMetaBlob.replay found null dentry in dir " << *lp
<< dendl
;
1193 nnull
+= lump
.nnull
;
1199 // keep track of any inodes we unlink and don't relink elsewhere
1200 map
<CInode
*, CDir
*> unlinked
;
1201 set
<CInode
*> linked
;
1203 // walk through my dirs (in order!)
1204 for (list
<dirfrag_t
>::iterator lp
= lump_order
.begin();
1205 lp
!= lump_order
.end();
1207 dout(10) << "EMetaBlob.replay dir " << *lp
<< dendl
;
1208 dirlump
&lump
= lump_map
[*lp
];
1211 CDir
*dir
= mds
->mdcache
->get_force_dirfrag(*lp
, true);
1213 // hmm. do i have the inode?
1214 CInode
*diri
= mds
->mdcache
->get_inode((*lp
).ino
);
1216 if (MDS_INO_IS_MDSDIR(lp
->ino
)) {
1217 assert(MDS_INO_MDSDIR(mds
->get_nodeid()) != lp
->ino
);
1218 diri
= mds
->mdcache
->create_system_inode(lp
->ino
, S_IFDIR
|0755);
1219 diri
->state_clear(CInode::STATE_AUTH
);
1220 dout(10) << "EMetaBlob.replay created base " << *diri
<< dendl
;
1222 dout(0) << "EMetaBlob.replay missing dir ino " << (*lp
).ino
<< dendl
;
1223 mds
->clog
->error() << "failure replaying journal (EMetaBlob)";
1225 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1229 // create the dirfrag
1230 dir
= diri
->get_or_open_dirfrag(mds
->mdcache
, (*lp
).frag
);
1232 if (MDS_INO_IS_BASE(lp
->ino
))
1233 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1235 dout(10) << "EMetaBlob.replay added dir " << *dir
<< dendl
;
1237 dir
->set_version( lump
.fnode
.version
);
1238 dir
->fnode
= lump
.fnode
;
1240 if (lump
.is_importing()) {
1241 dir
->state_set(CDir::STATE_AUTH
);
1242 dir
->state_clear(CDir::STATE_COMPLETE
);
1244 if (lump
.is_dirty()) {
1245 dir
->_mark_dirty(logseg
);
1247 if (!(dir
->fnode
.rstat
== dir
->fnode
.accounted_rstat
)) {
1248 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir
<< dendl
;
1249 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->nestlock
);
1250 logseg
->dirty_dirfrag_nest
.push_back(&dir
->inode
->item_dirty_dirfrag_nest
);
1252 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir
<< dendl
;
1254 if (!(dir
->fnode
.fragstat
== dir
->fnode
.accounted_fragstat
)) {
1255 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir
<< dendl
;
1256 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->filelock
);
1257 logseg
->dirty_dirfrag_dir
.push_back(&dir
->inode
->item_dirty_dirfrag_dir
);
1259 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir
<< dendl
;
1262 if (lump
.is_dirty_dft()) {
1263 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir
<< dendl
;
1264 dir
->state_set(CDir::STATE_DIRTYDFT
);
1265 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->dirfragtreelock
);
1266 logseg
->dirty_dirfrag_dirfragtree
.push_back(&dir
->inode
->item_dirty_dirfrag_dirfragtree
);
1269 dir
->mark_new(logseg
);
1270 if (lump
.is_complete())
1271 dir
->mark_complete();
1273 dout(10) << "EMetaBlob.replay updated dir " << *dir
<< dendl
;
1276 lump
._decode_bits();
1278 // full dentry+inode pairs
1279 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator pp
= lump
.get_dfull().begin();
1280 pp
!= lump
.get_dfull().end();
1282 ceph::shared_ptr
<fullbit
> p
= *pp
;
1283 CDentry
*dn
= dir
->lookup_exact_snap(p
->dn
, p
->dnlast
);
1285 dn
= dir
->add_null_dentry(p
->dn
, p
->dnfirst
, p
->dnlast
);
1286 dn
->set_version(p
->dnv
);
1287 if (p
->is_dirty()) dn
->_mark_dirty(logseg
);
1288 dout(10) << "EMetaBlob.replay added (full) " << *dn
<< dendl
;
1290 dn
->set_version(p
->dnv
);
1291 if (p
->is_dirty()) dn
->_mark_dirty(logseg
);
1292 dout(10) << "EMetaBlob.replay for [" << p
->dnfirst
<< "," << p
->dnlast
<< "] had " << *dn
<< dendl
;
1293 dn
->first
= p
->dnfirst
;
1294 assert(dn
->last
== p
->dnlast
);
1296 if (lump
.is_importing())
1297 dn
->state_set(CDentry::STATE_AUTH
);
1299 CInode
*in
= mds
->mdcache
->get_inode(p
->inode
.ino
, p
->dnlast
);
1301 in
= new CInode(mds
->mdcache
, dn
->is_auth(), p
->dnfirst
, p
->dnlast
);
1302 p
->update_inode(mds
, in
);
1303 mds
->mdcache
->add_inode(in
);
1304 if (!dn
->get_linkage()->is_null()) {
1305 if (dn
->get_linkage()->is_primary()) {
1306 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1308 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1309 << " " << *dn
->get_linkage()->get_inode() << " should be " << p
->inode
.ino
;
1310 dout(0) << ss
.str() << dendl
;
1311 mds
->clog
->warn(ss
);
1313 dir
->unlink_inode(dn
, false);
1315 if (unlinked
.count(in
))
1317 dir
->link_primary_inode(dn
, in
);
1318 dout(10) << "EMetaBlob.replay added " << *in
<< dendl
;
1320 in
->first
= p
->dnfirst
;
1321 p
->update_inode(mds
, in
);
1322 if (dn
->get_linkage()->get_inode() != in
&& in
->get_parent_dn()) {
1323 dout(10) << "EMetaBlob.replay unlinking " << *in
<< dendl
;
1324 unlinked
[in
] = in
->get_parent_dir();
1325 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
1327 if (dn
->get_linkage()->get_inode() != in
) {
1328 if (!dn
->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1329 if (dn
->get_linkage()->is_primary()) {
1330 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1332 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1333 << " " << *dn
->get_linkage()->get_inode() << " should be " << p
->inode
.ino
;
1334 dout(0) << ss
.str() << dendl
;
1335 mds
->clog
->warn(ss
);
1337 dir
->unlink_inode(dn
, false);
1339 if (unlinked
.count(in
))
1341 dir
->link_primary_inode(dn
, in
);
1342 dout(10) << "EMetaBlob.replay linked " << *in
<< dendl
;
1344 dout(10) << "EMetaBlob.replay for [" << p
->dnfirst
<< "," << p
->dnlast
<< "] had " << *in
<< dendl
;
1346 assert(in
->first
== p
->dnfirst
||
1347 (in
->is_multiversion() && in
->first
> p
->dnfirst
));
1350 in
->_mark_dirty(logseg
);
1351 if (p
->is_dirty_parent())
1352 in
->mark_dirty_parent(logseg
, p
->is_dirty_pool());
1353 if (p
->need_snapflush())
1354 logseg
->open_files
.push_back(&in
->item_open_file
);
1356 in
->state_set(CInode::STATE_AUTH
);
1358 in
->state_clear(CInode::STATE_AUTH
);
1359 assert(g_conf
->mds_kill_journal_replay_at
!= 2);
1363 for (list
<remotebit
>::const_iterator p
= lump
.get_dremote().begin();
1364 p
!= lump
.get_dremote().end();
1366 CDentry
*dn
= dir
->lookup_exact_snap(p
->dn
, p
->dnlast
);
1368 dn
= dir
->add_remote_dentry(p
->dn
, p
->ino
, p
->d_type
, p
->dnfirst
, p
->dnlast
);
1369 dn
->set_version(p
->dnv
);
1370 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1371 dout(10) << "EMetaBlob.replay added " << *dn
<< dendl
;
1373 if (!dn
->get_linkage()->is_null()) {
1374 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1375 if (dn
->get_linkage()->is_primary()) {
1376 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1378 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1379 << " " << *dn
->get_linkage()->get_inode() << " should be remote " << p
->ino
;
1380 dout(0) << ss
.str() << dendl
;
1382 dir
->unlink_inode(dn
, false);
1384 dir
->link_remote_inode(dn
, p
->ino
, p
->d_type
);
1385 dn
->set_version(p
->dnv
);
1386 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1387 dout(10) << "EMetaBlob.replay for [" << p
->dnfirst
<< "," << p
->dnlast
<< "] had " << *dn
<< dendl
;
1388 dn
->first
= p
->dnfirst
;
1389 assert(dn
->last
== p
->dnlast
);
1391 if (lump
.is_importing())
1392 dn
->state_set(CDentry::STATE_AUTH
);
1396 for (list
<nullbit
>::const_iterator p
= lump
.get_dnull().begin();
1397 p
!= lump
.get_dnull().end();
1399 CDentry
*dn
= dir
->lookup_exact_snap(p
->dn
, p
->dnlast
);
1401 dn
= dir
->add_null_dentry(p
->dn
, p
->dnfirst
, p
->dnlast
);
1402 dn
->set_version(p
->dnv
);
1403 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1404 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn
<< dendl
;
1406 dn
->first
= p
->dnfirst
;
1407 if (!dn
->get_linkage()->is_null()) {
1408 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1409 CInode
*in
= dn
->get_linkage()->get_inode();
1410 // For renamed inode, We may call CInode::force_dirfrag() later.
1411 // CInode::force_dirfrag() doesn't work well when inode is detached
1412 // from the hierarchy.
1413 if (!renamed_diri
|| renamed_diri
!= in
) {
1414 if (dn
->get_linkage()->is_primary())
1416 dir
->unlink_inode(dn
);
1419 dn
->set_version(p
->dnv
);
1420 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1421 dout(10) << "EMetaBlob.replay had " << *dn
<< dendl
;
1422 assert(dn
->last
== p
->dnlast
);
1425 if (lump
.is_importing())
1426 dn
->state_set(CDentry::STATE_AUTH
);
1428 // Make null dentries the first things we trim
1429 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn
<< dendl
;
1433 assert(g_conf
->mds_kill_journal_replay_at
!= 3);
1435 if (renamed_dirino
) {
1437 assert(unlinked
.count(renamed_diri
));
1438 assert(linked
.count(renamed_diri
));
1439 olddir
= unlinked
[renamed_diri
];
1441 // we imported a diri we haven't seen before
1442 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1443 assert(renamed_diri
); // it was in the metablob
1447 if (olddir
->authority() != CDIR_AUTH_UNDEF
&&
1448 renamed_diri
->authority() == CDIR_AUTH_UNDEF
) {
1449 assert(slaveup
); // auth to non-auth, must be slave prepare
1450 list
<frag_t
> leaves
;
1451 renamed_diri
->dirfragtree
.get_leaves(leaves
);
1452 for (list
<frag_t
>::iterator p
= leaves
.begin(); p
!= leaves
.end(); ++p
) {
1453 CDir
*dir
= renamed_diri
->get_dirfrag(*p
);
1455 if (dir
->get_dir_auth() == CDIR_AUTH_UNDEF
)
1456 // preserve subtree bound until slave commit
1457 slaveup
->olddirs
.insert(dir
->inode
);
1459 dir
->state_set(CDir::STATE_AUTH
);
1463 mds
->mdcache
->adjust_subtree_after_rename(renamed_diri
, olddir
, false);
1465 // see if we can discard the subtree we renamed out of
1466 CDir
*root
= mds
->mdcache
->get_subtree_root(olddir
);
1467 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
1468 if (slaveup
) // preserve the old dir until slave commit
1469 slaveup
->olddirs
.insert(olddir
->inode
);
1471 mds
->mdcache
->try_trim_non_auth_subtree(root
);
1475 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1476 if (renamed_diri
->authority() != CDIR_AUTH_UNDEF
) {
1477 for (list
<frag_t
>::iterator p
= renamed_dir_frags
.begin(); p
!= renamed_dir_frags
.end(); ++p
) {
1478 CDir
*dir
= renamed_diri
->get_dirfrag(*p
);
1480 // we already had the inode before, and we already adjusted this subtree accordingly.
1481 dout(10) << " already had+adjusted rename import bound " << *dir
<< dendl
;
1485 dir
= renamed_diri
->get_or_open_dirfrag(mds
->mdcache
, *p
);
1486 dout(10) << " creating new rename import bound " << *dir
<< dendl
;
1487 dir
->state_clear(CDir::STATE_AUTH
);
1488 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1492 // rename may overwrite an empty directory and move it into stray dir.
1493 unlinked
.erase(renamed_diri
);
1494 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1495 if (!linked
.count(p
->first
))
1497 assert(p
->first
->is_dir());
1498 mds
->mdcache
->adjust_subtree_after_rename(p
->first
, p
->second
, false);
1502 if (!unlinked
.empty()) {
1503 for (set
<CInode
*>::iterator p
= linked
.begin(); p
!= linked
.end(); ++p
)
1505 dout(10) << " unlinked set contains " << unlinked
<< dendl
;
1506 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1507 if (slaveup
) // preserve unlinked inodes until slave commit
1508 slaveup
->unlinked
.insert(p
->first
);
1510 mds
->mdcache
->remove_inode_recursive(p
->first
);
1514 // table client transactions
1515 for (list
<pair
<__u8
,version_t
> >::iterator p
= table_tids
.begin();
1516 p
!= table_tids
.end();
1518 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p
->first
)
1519 << " transaction " << p
->second
<< dendl
;
1520 MDSTableClient
*client
= mds
->get_table_client(p
->first
);
1522 client
->got_journaled_agree(p
->second
, logseg
);
1527 CInode
*in
= mds
->mdcache
->get_inode(opened_ino
);
1529 dout(10) << "EMetaBlob.replay noting opened inode " << *in
<< dendl
;
1530 logseg
->open_files
.push_back(&in
->item_open_file
);
1535 if (mds
->inotable
->get_version() >= inotablev
) {
1536 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1537 << " <= table " << mds
->inotable
->get_version() << dendl
;
1539 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1540 << " - 1 == table " << mds
->inotable
->get_version()
1541 << " allocated+used " << allocated_ino
1542 << " prealloc " << preallocated_inos
1545 mds
->inotable
->replay_alloc_id(allocated_ino
);
1546 if (preallocated_inos
.size())
1547 mds
->inotable
->replay_alloc_ids(preallocated_inos
);
1549 // [repair bad inotable updates]
1550 if (inotablev
> mds
->inotable
->get_version()) {
1551 mds
->clog
->error() << "journal replay inotablev mismatch "
1552 << mds
->inotable
->get_version() << " -> " << inotablev
;
1553 mds
->inotable
->force_replay_version(inotablev
);
1556 assert(inotablev
== mds
->inotable
->get_version());
1560 if (mds
->sessionmap
.get_version() >= sessionmapv
) {
1561 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1562 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
1563 } else if (mds
->sessionmap
.get_version() + 2 >= sessionmapv
) {
1564 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1565 << " -(1|2) == table " << mds
->sessionmap
.get_version()
1566 << " prealloc " << preallocated_inos
1567 << " used " << used_preallocated_ino
1569 Session
*session
= mds
->sessionmap
.get_session(client_name
);
1571 dout(20) << " (session prealloc " << session
->info
.prealloc_inos
<< ")" << dendl
;
1572 if (used_preallocated_ino
) {
1573 if (!session
->info
.prealloc_inos
.empty()) {
1574 inodeno_t next
= session
->next_ino();
1575 inodeno_t i
= session
->take_ino(used_preallocated_ino
);
1577 mds
->clog
->warn() << " replayed op " << client_reqs
<< " used ino " << i
1578 << " but session next is " << next
;
1579 assert(i
== used_preallocated_ino
);
1580 session
->info
.used_inos
.clear();
1582 mds
->sessionmap
.replay_dirty_session(session
);
1584 if (!preallocated_inos
.empty()) {
1585 session
->info
.prealloc_inos
.insert(preallocated_inos
);
1586 mds
->sessionmap
.replay_dirty_session(session
);
1590 dout(10) << "EMetaBlob.replay no session for " << client_name
<< dendl
;
1591 if (used_preallocated_ino
) {
1592 mds
->sessionmap
.replay_advance_version();
1594 if (!preallocated_inos
.empty())
1595 mds
->sessionmap
.replay_advance_version();
1597 assert(sessionmapv
== mds
->sessionmap
.get_version());
1599 mds
->clog
->error() << "journal replay sessionmap v " << sessionmapv
1600 << " -(1|2) > table " << mds
->sessionmap
.get_version();
1601 assert(g_conf
->mds_wipe_sessions
);
1602 mds
->sessionmap
.wipe();
1603 mds
->sessionmap
.set_version(sessionmapv
);
1607 // truncating inodes
1608 for (list
<inodeno_t
>::iterator p
= truncate_start
.begin();
1609 p
!= truncate_start
.end();
1611 CInode
*in
= mds
->mdcache
->get_inode(*p
);
1613 mds
->mdcache
->add_recovered_truncate(in
, logseg
);
1615 for (map
<inodeno_t
,uint64_t>::iterator p
= truncate_finish
.begin();
1616 p
!= truncate_finish
.end();
1618 LogSegment
*ls
= mds
->mdlog
->get_segment(p
->second
);
1620 CInode
*in
= mds
->mdcache
->get_inode(p
->first
);
1622 mds
->mdcache
->remove_recovered_truncate(in
, ls
);
1627 for (vector
<inodeno_t
>::iterator p
= destroyed_inodes
.begin();
1628 p
!= destroyed_inodes
.end();
1630 CInode
*in
= mds
->mdcache
->get_inode(*p
);
1632 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", dropping " << *in
<< dendl
;
1633 CDentry
*parent
= in
->get_parent_dn();
1634 mds
->mdcache
->remove_inode(in
);
1636 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent
<< dendl
;
1637 assert(parent
->get_linkage()->is_null());
1640 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", not in cache" << dendl
;
1645 for (list
<pair
<metareqid_t
, uint64_t> >::iterator p
= client_reqs
.begin();
1646 p
!= client_reqs
.end();
1648 if (p
->first
.name
.is_client()) {
1649 dout(10) << "EMetaBlob.replay request " << p
->first
<< " trim_to " << p
->second
<< dendl
;
1650 inodeno_t created
= allocated_ino
? allocated_ino
: used_preallocated_ino
;
1651 // if we allocated an inode, there should be exactly one client request id.
1652 assert(created
== inodeno_t() || client_reqs
.size() == 1);
1654 Session
*session
= mds
->sessionmap
.get_session(p
->first
.name
);
1656 session
->add_completed_request(p
->first
.tid
, created
);
1658 session
->trim_completed_requests(p
->second
);
1664 for (list
<pair
<metareqid_t
, uint64_t> >::iterator p
= client_flushes
.begin();
1665 p
!= client_flushes
.end();
1667 if (p
->first
.name
.is_client()) {
1668 dout(10) << "EMetaBlob.replay flush " << p
->first
<< " trim_to " << p
->second
<< dendl
;
1669 Session
*session
= mds
->sessionmap
.get_session(p
->first
.name
);
1671 session
->add_completed_flush(p
->first
.tid
);
1673 session
->trim_completed_flushes(p
->second
);
1679 update_segment(logseg
);
1681 assert(g_conf
->mds_kill_journal_replay_at
!= 4);
1684 // -----------------------
1687 void ESession::update_segment()
1689 _segment
->sessionmapv
= cmapv
;
1690 if (inos
.size() && inotablev
)
1691 _segment
->inotablev
= inotablev
;
// Replays a session open/close event against the MDS session map and,
// when the event carries inos and a non-zero inotablev, against the
// inotable.
//
// Visible behaviour: the session map version is compared with cmapv and
// the inotable version with inotablev; each side logs a "noop" message
// when the table is already at or past the event's version, and asserts
// that the table version equals the event's version after applying it.
//
// NOTE(review): the embedded listing numbers skip several lines in this
// function (e.g. 1699, 1702-1703, 1708, 1714-1715, 1718-1719, 1722-1724,
// 1726, 1728), so branch headers such as the open/close split are not
// visible here — confirm the control flow against the upstream file.
1694 void ESession::replay(MDSRank
*mds
)
1696 if (mds
->sessionmap
.get_version() >= cmapv
) {
1697 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1698 << " >= " << cmapv
<< ", noop" << dendl
;
1700 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1701 << " < " << cmapv
<< " " << (open
? "open":"close") << " " << client_inst
<< dendl
;
1704 session
= mds
->sessionmap
.get_or_add_session(client_inst
);
1705 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1706 session
->set_client_metadata(client_metadata
);
1707 dout(10) << " opened session " << session
->info
.inst
<< dendl
;
1709 session
= mds
->sessionmap
.get_session(client_inst
.name
);
1710 if (session
) { // there always should be a session, but there's a bug
1711 if (session
->connection
== NULL
) {
1712 dout(10) << " removed session " << session
->info
.inst
<< dendl
;
1713 mds
->sessionmap
.remove_session(session
);
1716 session
->clear(); // the client has reconnected; keep the Session, but reset
1717 dout(10) << " reset session " << session
->info
.inst
<< " (they reconnected)" << dendl
;
1720 mds
->clog
->error() << "replayed stray Session close event for " << client_inst
1721 << " from time " << stamp
<< ", ignoring";
1725 mds
->sessionmap
.replay_dirty_session(session
);
1727 mds
->sessionmap
.replay_advance_version();
1729 assert(mds
->sessionmap
.get_version() == cmapv
);
1732 if (inos
.size() && inotablev
) {
1733 if (mds
->inotable
->get_version() >= inotablev
) {
1734 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1735 << " >= " << inotablev
<< ", noop" << dendl
;
1737 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1738 << " < " << inotablev
<< " " << (open
? "add":"remove") << dendl
;
1739 assert(!open
); // for now
1740 mds
->inotable
->replay_release_ids(inos
);
1741 assert(mds
->inotable
->get_version() == inotablev
);
// Serializes this session event into bl, starting with
// ENCODE_START(4, 3, bl). Fields visible here, in order: stamp,
// client_inst (encoded with the caller's feature bits), cmapv,
// inotablev, client_metadata.
//
// NOTE(review): the embedded listing numbers skip 1753, 1755 and 1758, so
// further encoded fields and the closing of the encode envelope may be
// missing from this view — confirm against the upstream file.
1748 void ESession::encode(bufferlist
&bl
, uint64_t features
) const
1750 ENCODE_START(4, 3, bl
);
1751 ::encode(stamp
, bl
);
1752 ::encode(client_inst
, bl
, features
);
1754 ::encode(cmapv
, bl
);
1756 ::encode(inotablev
, bl
);
1757 ::encode(client_metadata
, bl
);
// Deserializes a session event from bl, starting with
// DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl). Fields visible here, in
// order: stamp, client_inst, cmapv, inotablev; client_metadata is only
// read when struct_v >= 4.
//
// NOTE(review): the embedded listing numbers skip 1764, 1767, 1769 and
// 1773-1775, so a guard before stamp, further decoded fields and the
// closing of the decode envelope may be missing from this view — confirm
// against the upstream file.
1761 void ESession::decode(bufferlist::iterator
&bl
)
1763 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl
);
1765 ::decode(stamp
, bl
);
1766 ::decode(client_inst
, bl
);
1768 ::decode(cmapv
, bl
);
1770 ::decode(inotablev
, bl
);
1771 if (struct_v
>= 4) {
1772 ::decode(client_metadata
, bl
);
1777 void ESession::dump(Formatter
*f
) const
1779 f
->dump_stream("client instance") << client_inst
;
1780 f
->dump_string("open", open
? "true" : "false");
1781 f
->dump_int("client map version", cmapv
);
1782 f
->dump_stream("inos") << inos
;
1783 f
->dump_int("inotable version", inotablev
);
1784 f
->open_object_section("client_metadata");
1785 for (map
<string
, string
>::const_iterator i
= client_metadata
.begin();
1786 i
!= client_metadata
.end(); ++i
) {
1787 f
->dump_string(i
->first
.c_str(), i
->second
);
1789 f
->close_section(); // client_metadata
1792 void ESession::generate_test_instances(list
<ESession
*>& ls
)
1794 ls
.push_back(new ESession
);
1797 // -----------------------
// Serializes this event into bl, starting with ENCODE_START(1, 1, bl).
// Fields visible here, in order: client_map (encoded with the caller's
// feature bits), cmapv, stamp.
//
// NOTE(review): the embedded listing numbers skip 1806, so the closing of
// the encode envelope is probably missing from this view — confirm against
// the upstream file.
1800 void ESessions::encode(bufferlist
&bl
, uint64_t features
) const
1802 ENCODE_START(1, 1, bl
);
1803 ::encode(client_map
, bl
, features
);
1804 ::encode(cmapv
, bl
);
1805 ::encode(stamp
, bl
);
// Decodes client_map, cmapv and stamp from bl without any
// DECODE_START/version envelope (contrast with decode_new).
//
// NOTE(review): the embedded listing numbers skip 1813, right before the
// stamp decode, so a guard on that read may be missing from this view —
// confirm against the upstream file.
1809 void ESessions::decode_old(bufferlist::iterator
&bl
)
1811 ::decode(client_map
, bl
);
1812 ::decode(cmapv
, bl
);
1814 ::decode(stamp
, bl
);
// Decodes client_map, cmapv and stamp from bl inside a DECODE_START(1, bl)
// envelope.
//
// NOTE(review): the embedded listing numbers skip 1822 and 1824, so a
// guard before the stamp decode and the closing of the decode envelope may
// be missing from this view — confirm against the upstream file.
1817 void ESessions::decode_new(bufferlist::iterator
&bl
)
1819 DECODE_START(1, bl
);
1820 ::decode(client_map
, bl
);
1821 ::decode(cmapv
, bl
);
1823 ::decode(stamp
, bl
);
1827 void ESessions::dump(Formatter
*f
) const
1829 f
->dump_int("client map version", cmapv
);
1831 f
->open_array_section("client map");
1832 for (map
<client_t
,entity_inst_t
>::const_iterator i
= client_map
.begin();
1833 i
!= client_map
.end(); ++i
) {
1834 f
->open_object_section("client");
1835 f
->dump_int("client id", i
->first
.v
);
1836 f
->dump_stream("client entity") << i
->second
;
1837 f
->close_section(); // client
1839 f
->close_section(); // client map
1842 void ESessions::generate_test_instances(list
<ESessions
*>& ls
)
1844 ls
.push_back(new ESessions());
1847 void ESessions::update_segment()
1849 _segment
->sessionmapv
= cmapv
;
// Replays the recorded client map into the MDS session map.
//
// Visible behaviour: when the session map version is already >= cmapv a
// "noop" message is logged; the other visible path logs, calls
// replay_open_sessions(client_map) and asserts that the session map
// version now equals cmapv.
//
// NOTE(review): the embedded listing numbers skip 1857 and 1862-1865, so
// the branch separator and any trailing statements are not visible here —
// confirm against the upstream file.
1852 void ESessions::replay(MDSRank
*mds
)
1854 if (mds
->sessionmap
.get_version() >= cmapv
) {
1855 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1856 << " >= " << cmapv
<< ", noop" << dendl
;
1858 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1859 << " < " << cmapv
<< dendl
;
1860 mds
->sessionmap
.replay_open_sessions(client_map
);
1861 assert(mds
->sessionmap
.get_version() == cmapv
);
1867 // -----------------------
// Serializes this table-server event into bl, starting with
// ENCODE_START(3, 3, bl). Fields visible here, in order: stamp, table,
// reqid, bymds, mutation, version. The features argument is not used by
// any visible line.
//
// NOTE(review): the embedded listing numbers skip 1875, 1879 and 1881, so
// further encoded fields and the closing of the encode envelope may be
// missing from this view — confirm against the upstream file.
1870 void ETableServer::encode(bufferlist
& bl
, uint64_t features
) const
1872 ENCODE_START(3, 3, bl
);
1873 ::encode(stamp
, bl
);
1874 ::encode(table
, bl
);
1876 ::encode(reqid
, bl
);
1877 ::encode(bymds
, bl
);
1878 ::encode(mutation
, bl
);
1880 ::encode(version
, bl
);
// Deserializes a table-server event from bl, starting with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl). Fields visible here, in
// order: stamp, table, reqid, bymds, mutation, version.
//
// NOTE(review): the embedded listing numbers skip 1887, 1890, 1894 and
// 1896, so a guard before stamp, further decoded fields and the closing of
// the decode envelope may be missing from this view — confirm against the
// upstream file.
1884 void ETableServer::decode(bufferlist::iterator
&bl
)
1886 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
1888 ::decode(stamp
, bl
);
1889 ::decode(table
, bl
);
1891 ::decode(reqid
, bl
);
1892 ::decode(bymds
, bl
);
1893 ::decode(mutation
, bl
);
1895 ::decode(version
, bl
);
1899 void ETableServer::dump(Formatter
*f
) const
1901 f
->dump_int("table id", table
);
1902 f
->dump_int("op", op
);
1903 f
->dump_int("request id", reqid
);
1904 f
->dump_int("by mds", bymds
);
1905 f
->dump_int("tid", tid
);
1906 f
->dump_int("version", version
);
1909 void ETableServer::generate_test_instances(list
<ETableServer
*>& ls
)
1911 ls
.push_back(new ETableServer());
1915 void ETableServer::update_segment()
1917 _segment
->tablev
[table
] = version
;
// Replays a journaled table-server operation against the table server
// obtained from mds->get_table_server(table).
//
// Visible behaviour: when the server's version is already >= this event's
// version a message is logged; otherwise the code asserts that the server
// is exactly one version behind, dispatches on the op (PREPARE, COMMIT,
// ROLLBACK, SERVER_UPDATE) to the matching server->_xxx() call, and
// finally asserts that the server version equals this event's version.
// The remaining visible path logs "invalid tableserver op" to the cluster
// log and aborts.
//
// NOTE(review): the embedded listing numbers skip several lines here
// (e.g. 1923-1925, 1931-1933, 1938-1939, 1943, 1947, 1951, 1954-1955,
// 1957), so the null check on server, the early return, the switch header
// and the break statements are not visible — confirm against the upstream
// file.
1920 void ETableServer::replay(MDSRank
*mds
)
1922 MDSTableServer
*server
= mds
->get_table_server(table
);
1926 if (server
->get_version() >= version
) {
1927 dout(10) << "ETableServer.replay " << get_mdstable_name(table
)
1928 << " " << get_mdstableserver_opname(op
)
1929 << " event " << version
1930 << " <= table " << server
->get_version() << dendl
;
1934 dout(10) << " ETableServer.replay " << get_mdstable_name(table
)
1935 << " " << get_mdstableserver_opname(op
)
1936 << " event " << version
<< " - 1 == table " << server
->get_version() << dendl
;
1937 assert(version
-1 == server
->get_version());
1940 case TABLESERVER_OP_PREPARE
:
1941 server
->_prepare(mutation
, reqid
, bymds
);
1942 server
->_note_prepare(bymds
, reqid
);
1944 case TABLESERVER_OP_COMMIT
:
1945 server
->_commit(tid
);
1946 server
->_note_commit(tid
);
1948 case TABLESERVER_OP_ROLLBACK
:
1949 server
->_rollback(tid
);
1950 server
->_note_rollback(tid
);
1952 case TABLESERVER_OP_SERVER_UPDATE
:
1953 server
->_server_update(mutation
);
1956 mds
->clog
->error() << "invalid tableserver op in ETableServer";
1958 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1961 assert(version
== server
->get_version());
1966 // ---------------------
// Serializes this table-client event into bl, starting with
// ENCODE_START(3, 3, bl). Fields visible here, in order: stamp, table.
// The features argument is not used by any visible line.
//
// NOTE(review): the embedded listing numbers skip 1974-1977, so further
// encoded fields and the closing of the encode envelope are missing from
// this view — confirm against the upstream file.
1969 void ETableClient::encode(bufferlist
& bl
, uint64_t features
) const
1971 ENCODE_START(3, 3, bl
);
1972 ::encode(stamp
, bl
);
1973 ::encode(table
, bl
);
// Deserializes a table-client event from bl, starting with
// DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl). Fields visible here, in
// order: stamp, table.
//
// NOTE(review): the embedded listing numbers skip 1982 and 1985-1988, so a
// guard before stamp, further decoded fields and the closing of the decode
// envelope are missing from this view — confirm against the upstream file.
1979 void ETableClient::decode(bufferlist::iterator
&bl
)
1981 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
1983 ::decode(stamp
, bl
);
1984 ::decode(table
, bl
);
1990 void ETableClient::dump(Formatter
*f
) const
1992 f
->dump_int("table", table
);
1993 f
->dump_int("op", op
);
1994 f
->dump_int("tid", tid
);
1997 void ETableClient::generate_test_instances(list
<ETableClient
*>& ls
)
1999 ls
.push_back(new ETableClient());
// Replays a journaled table-client event: logs the table, op and tid,
// looks up the table client via mds->get_table_client(table), asserts that
// the op is TABLESERVER_OP_ACK and passes tid to
// client->got_journaled_ack().
//
// NOTE(review): the embedded listing numbers skip 2009-2011, directly
// after the client lookup, so a null check / early return may be missing
// from this view — confirm against the upstream file.
2002 void ETableClient::replay(MDSRank
*mds
)
2004 dout(10) << " ETableClient.replay " << get_mdstable_name(table
)
2005 << " op " << get_mdstableserver_opname(op
)
2006 << " tid " << tid
<< dendl
;
2008 MDSTableClient
*client
= mds
->get_table_client(table
);
2012 assert(op
== TABLESERVER_OP_ACK
);
2013 client
->got_journaled_ack(tid
);
2017 // -----------------------
2020 void ESnap::update_segment()
2022 _segment->tablev[TABLE_SNAP] = version;
2025 void ESnap::replay(MDSRank *mds)
2027 if (mds->snaptable->get_version() >= version) {
2028 dout(10) << "ESnap.replay event " << version
2029 << " <= table " << mds->snaptable->get_version() << dendl;
2033 dout(10) << " ESnap.replay event " << version
2034 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2035 assert(version-1 == mds->snaptable->get_version());
2039 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2040 assert(s == snap.snapid);
2042 mds->snaptable->remove(snap.snapid);
2045 assert(version == mds->snaptable->get_version());
2051 // -----------------------
// Serializes this update event into bl, starting with
// ENCODE_START(4, 4, bl). Fields visible here, in order: stamp, metablob
// (encoded with the caller's feature bits), client_map, cmapv, reqid,
// had_slaves.
//
// NOTE(review): the embedded listing numbers skip 2058 and 2064, so a
// further encoded field and the closing of the encode envelope may be
// missing from this view — confirm against the upstream file.
2054 void EUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2056 ENCODE_START(4, 4, bl
);
2057 ::encode(stamp
, bl
);
2059 ::encode(metablob
, bl
, features
);
2060 ::encode(client_map
, bl
);
2061 ::encode(cmapv
, bl
);
2062 ::encode(reqid
, bl
);
2063 ::encode(had_slaves
, bl
);
// Deserializes an update event from bl, starting with
// DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl). Fields visible here, in
// order: stamp, metablob, client_map, cmapv, reqid, had_slaves.
//
// NOTE(review): the embedded listing numbers skip 2070, 2072, 2075 and
// 2079, so version guards, a further decoded field and the closing of the
// decode envelope may be missing from this view — confirm against the
// upstream file.
2067 void EUpdate::decode(bufferlist::iterator
&bl
)
2069 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl
);
2071 ::decode(stamp
, bl
);
2073 ::decode(metablob
, bl
);
2074 ::decode(client_map
, bl
);
2076 ::decode(cmapv
, bl
);
2077 ::decode(reqid
, bl
);
2078 ::decode(had_slaves
, bl
);
// Writes this update event to the formatter: a "metablob" object section,
// then type, the byte length of client_map, cmapv, the streamed reqid, and
// had_slaves rendered as the string "true"/"false".
//
// NOTE(review): the embedded listing numbers skip 2085, between opening
// and closing the "metablob" section, so the statement that fills that
// section is missing from this view — confirm against the upstream file.
2082 void EUpdate::dump(Formatter
*f
) const
2084 f
->open_object_section("metablob");
2086 f
->close_section(); // metablob
2088 f
->dump_string("type", type
);
2089 f
->dump_int("client map length", client_map
.length());
2090 f
->dump_int("client map version", cmapv
);
2091 f
->dump_stream("reqid") << reqid
;
2092 f
->dump_string("had slaves", had_slaves
? "true" : "false");
2095 void EUpdate::generate_test_instances(list
<EUpdate
*>& ls
)
2097 ls
.push_back(new EUpdate());
// Propagates this event's state onto its log segment: forwards to
// metablob.update_segment(_segment), sets the segment's sessionmapv to
// cmapv when client_map is non-empty, and inserts reqid into the segment's
// uncommitted_masters set.
//
// NOTE(review): the embedded listing numbers skip 2108, directly before
// the uncommitted_masters insert, so that insert is probably guarded by a
// condition not visible here — confirm against the upstream file.
2101 void EUpdate::update_segment()
2103 metablob
.update_segment(_segment
);
2105 if (client_map
.length())
2106 _segment
->sessionmapv
= cmapv
;
2109 _segment
->uncommitted_masters
.insert(reqid
);
// Replay an EUpdate during journal replay.
//  1. Apply the metablob to the cache.
//  2. For an update that had slaves (per the log message below), re-register
//     reqid as an uncommitted master, in both the segment and the MDCache,
//     with an empty slave set; a matching ECommitted is expected later.
//  3. If a client map is attached: when the session map is already at or past
//     cmapv only log; otherwise replay the session opens and assert the
//     session map version now equals cmapv.
// NOTE(review): the if/else headers and the decode of client_map into `cm`
// are not visible in this excerpt.
2112 void EUpdate::replay(MDSRank
*mds
)
2114 metablob
.replay(mds
, _segment
);
2117 dout(10) << "EUpdate.replay " << reqid
<< " had slaves, expecting a matching ECommitted" << dendl
;
2118 _segment
->uncommitted_masters
.insert(reqid
);
// Slave set is left empty here; only the master record is restored.
2119 set
<mds_rank_t
> slaves
;
2120 mds
->mdcache
->add_uncommitted_master(reqid
, _segment
, slaves
, true);
2123 if (client_map
.length()) {
2124 if (mds
->sessionmap
.get_version() >= cmapv
) {
2125 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2126 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
2128 dout(10) << "EUpdate.replay sessionmap " << mds
->sessionmap
.get_version()
2129 << " < " << cmapv
<< dendl
;
2130 // open client sessions?
2131 map
<client_t
,entity_inst_t
> cm
;
2132 bufferlist::iterator blp
= client_map
.begin();
2134 mds
->sessionmap
.replay_open_sessions(cm
);
// After replaying the opens the session map must be exactly at cmapv.
2135 assert(mds
->sessionmap
.get_version() == cmapv
);
2142 // ------------------------
// Serialize this EOpen into `bl` as struct version 4 (compat 3).
// Fields written here, in order: stamp, metablob (encoded with `features`),
// snap_inos.
2145 void EOpen::encode(bufferlist
&bl
, uint64_t features
) const {
2146 ENCODE_START(4, 3, bl
);
2147 ::encode(stamp
, bl
);
2148 ::encode(metablob
, bl
, features
);
2150 ::encode(snap_inos
, bl
);
// Deserialize an EOpen from `bl` using legacy-compatible framing.
// Fields read here, in order: stamp, metablob, snap_inos.
// NOTE(review): DECODE_START is given version 3 while encode() writes
// version 4; any struct_v guard around snap_inos is not visible here --
// confirm older encodings are handled.
2154 void EOpen::decode(bufferlist::iterator
&bl
) {
2155 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2157 ::decode(stamp
, bl
);
2158 ::decode(metablob
, bl
);
2161 ::decode(snap_inos
, bl
);
// Write a structured description of this event to formatter `f`:
// a "metablob" object section, then an "inos involved" array holding one
// integer "ino" entry for every element of `inos`.
2165 void EOpen::dump(Formatter
*f
) const
2167 f
->open_object_section("metablob");
2169 f
->close_section(); // metablob
2170 f
->open_array_section("inos involved");
2171 for (vector
<inodeno_t
>::const_iterator i
= inos
.begin();
2172 i
!= inos
.end(); ++i
) {
2173 f
->dump_int("ino", *i
);
2175 f
->close_section(); // inos
// Append test instances to `ls`: one default-constructed EOpen, and a
// second EOpen that has inode number 0 added via add_ino().
2178 void EOpen::generate_test_instances(list
<EOpen
*>& ls
)
2180 ls
.push_back(new EOpen());
2181 ls
.push_back(new EOpen());
2182 ls
.back()->add_ino(0);
// Segment-update hook for EOpen.
// NOTE(review): no body statements are visible in this excerpt -- confirm
// whether this is intentionally a no-op.
2185 void EOpen::update_segment()
// Replay an EOpen: apply the metablob, then for every inode listed in
// `inos` and every entry in `snap_inos`, look the inode up in the cache and
// link its item_open_file into the segment's open_files list.
// An inode that cannot be resolved is reported at log level 0 as
// "not in metablob".
// NOTE(review): the null checks guarding those log lines (and whatever
// follows them) are not visible in this excerpt.
2190 void EOpen::replay(MDSRank
*mds
)
2192 dout(10) << "EOpen.replay " << dendl
;
2193 metablob
.replay(mds
, _segment
);
2195 // note which segments inodes belong to, so we don't have to start rejournaling them
2196 for (const auto &ino
: inos
) {
2197 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2199 dout(0) << "EOpen.replay ino " << ino
<< " not in metablob" << dendl
;
2202 _segment
->open_files
.push_back(&in
->item_open_file
);
// Same bookkeeping for the snapshotted inodes.
2204 for (const auto &vino
: snap_inos
) {
2205 CInode
*in
= mds
->mdcache
->get_inode(vino
);
2207 dout(0) << "EOpen.replay ino " << vino
<< " not in metablob" << dendl
;
2210 _segment
->open_files
.push_back(&in
->item_open_file
);
2215 // -----------------------
// Replay an ECommitted: if reqid is tracked in the MDCache's
// uncommitted_masters map, remove it from the owning log segment's
// uncommitted_masters set (reached through the map entry's `ls`) and then
// from the cache map itself. Otherwise only log that the original op was
// not seen.
2218 void ECommitted::replay(MDSRank
*mds
)
2220 if (mds
->mdcache
->uncommitted_masters
.count(reqid
)) {
2221 dout(10) << "ECommitted.replay " << reqid
<< dendl
;
// Erase from the segment first; the map entry is needed to find it.
2222 mds
->mdcache
->uncommitted_masters
[reqid
].ls
->uncommitted_masters
.erase(reqid
);
2223 mds
->mdcache
->uncommitted_masters
.erase(reqid
);
2225 dout(10) << "ECommitted.replay " << reqid
<< " -- didn't see original op" << dendl
;
// Serialize this ECommitted into `bl` as struct version 3 (compat 3):
// stamp, then reqid. `features` is not used by the visible statements.
2229 void ECommitted::encode(bufferlist
& bl
, uint64_t features
) const
2231 ENCODE_START(3, 3, bl
);
2232 ::encode(stamp
, bl
);
2233 ::encode(reqid
, bl
);
// Deserialize an ECommitted from `bl` (legacy-compatible framing,
// version 3): stamp, then reqid.
2237 void ECommitted::decode(bufferlist::iterator
& bl
)
2239 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2241 ::decode(stamp
, bl
);
2242 ::decode(reqid
, bl
);
// Dump the event's timestamp and request id to formatter `f` as streams
// named "stamp" and "reqid".
2246 void ECommitted::dump(Formatter
*f
) const {
2247 f
->dump_stream("stamp") << stamp
;
2248 f
->dump_stream("reqid") << reqid
;
// Append test instances to `ls`: one default-constructed ECommitted, and a
// second whose stamp is utime_t(1, 2) and whose reqid is built from
// entity_name_t::CLIENT(123) and 456.
2251 void ECommitted::generate_test_instances(list
<ECommitted
*>& ls
)
2253 ls
.push_back(new ECommitted
);
2254 ls
.push_back(new ECommitted
);
2255 ls
.back()->stamp
= utime_t(1, 2);
2256 ls
.back()->reqid
= metareqid_t(entity_name_t::CLIENT(123), 456);
2259 // -----------------------
// Serialize this link_rollback record into `bl` as struct version 2
// (compat 2). Fields written here, in order: reqid, was_inc, old_ctime,
// old_dir_mtime, old_dir_rctime.
2262 void link_rollback::encode(bufferlist
&bl
) const
2264 ENCODE_START(2, 2, bl
);
2265 ::encode(reqid
, bl
);
2267 ::encode(was_inc
, bl
);
2268 ::encode(old_ctime
, bl
);
2269 ::encode(old_dir_mtime
, bl
);
2270 ::encode(old_dir_rctime
, bl
);
// Deserialize a link_rollback record from `bl` (legacy-compatible framing,
// version 2). Fields read here, in order: reqid, was_inc, old_ctime,
// old_dir_mtime, old_dir_rctime -- mirroring encode().
2274 void link_rollback::decode(bufferlist::iterator
&bl
)
2276 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2277 ::decode(reqid
, bl
);
2279 ::decode(was_inc
, bl
);
2280 ::decode(old_ctime
, bl
);
2281 ::decode(old_dir_mtime
, bl
);
2282 ::decode(old_dir_rctime
, bl
);
// Dump this rollback record to formatter `f`: request id, inode number,
// whether the link count was incremented ("true"/"false"), and the saved
// ctime, directory mtime and directory rctime.
2286 void link_rollback::dump(Formatter
*f
) const
2288 f
->dump_stream("metareqid") << reqid
;
2289 f
->dump_int("ino", ino
);
2290 f
->dump_string("was incremented", was_inc
? "true" : "false");
2291 f
->dump_stream("old_ctime") << old_ctime
;
2292 f
->dump_stream("old_dir_mtime") << old_dir_mtime
;
2293 f
->dump_stream("old_dir_rctime") << old_dir_rctime
;
// Append test instances to `ls`: a single default-constructed
// link_rollback.
2296 void link_rollback::generate_test_instances(list
<link_rollback
*>& ls
)
2298 ls
.push_back(new link_rollback());
// Serialize this rmdir_rollback record into `bl` as struct version 2
// (compat 2): reqid, src_dir, src_dname, dest_dir, dest_dname.
2301 void rmdir_rollback::encode(bufferlist
& bl
) const
2303 ENCODE_START(2, 2, bl
);
2304 ::encode(reqid
, bl
);
2305 ::encode(src_dir
, bl
);
2306 ::encode(src_dname
, bl
);
2307 ::encode(dest_dir
, bl
);
2308 ::encode(dest_dname
, bl
);
// Deserialize an rmdir_rollback record from `bl` (legacy-compatible
// framing, version 2): reqid, src_dir, src_dname, dest_dir, dest_dname --
// the same order encode() writes them.
2312 void rmdir_rollback::decode(bufferlist::iterator
& bl
)
2314 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2315 ::decode(reqid
, bl
);
2316 ::decode(src_dir
, bl
);
2317 ::decode(src_dname
, bl
);
2318 ::decode(dest_dir
, bl
);
2319 ::decode(dest_dname
, bl
);
// Dump this rollback record to formatter `f`: request id, source directory
// and dentry name, destination directory and dentry name.
2323 void rmdir_rollback::dump(Formatter
*f
) const
2325 f
->dump_stream("metareqid") << reqid
;
2326 f
->dump_stream("source directory") << src_dir
;
2327 f
->dump_string("source dname", src_dname
);
2328 f
->dump_stream("destination directory") << dest_dir
;
2329 f
->dump_string("destination dname", dest_dname
);
// Append test instances to `ls`: a single default-constructed
// rmdir_rollback.
2332 void rmdir_rollback::generate_test_instances(list
<rmdir_rollback
*>& ls
)
2334 ls
.push_back(new rmdir_rollback());
// Serialize one rename dentry record into `bl` as struct version 2
// (compat 2). Fields written here, in order: dirfrag, dirfrag_old_mtime,
// dirfrag_old_rctime, remote_ino, dname, remote_d_type, old_ctime.
2337 void rename_rollback::drec::encode(bufferlist
&bl
) const
2339 ENCODE_START(2, 2, bl
);
2340 ::encode(dirfrag
, bl
);
2341 ::encode(dirfrag_old_mtime
, bl
);
2342 ::encode(dirfrag_old_rctime
, bl
);
2344 ::encode(remote_ino
, bl
);
2345 ::encode(dname
, bl
);
2346 ::encode(remote_d_type
, bl
);
2347 ::encode(old_ctime
, bl
);
// Deserialize one rename dentry record from `bl` (legacy-compatible
// framing, version 2). Fields read here, in order: dirfrag,
// dirfrag_old_mtime, dirfrag_old_rctime, remote_ino, dname, remote_d_type,
// old_ctime -- mirroring encode().
2351 void rename_rollback::drec::decode(bufferlist::iterator
&bl
)
2353 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2354 ::decode(dirfrag
, bl
);
2355 ::decode(dirfrag_old_mtime
, bl
);
2356 ::decode(dirfrag_old_rctime
, bl
);
2358 ::decode(remote_ino
, bl
);
2359 ::decode(dname
, bl
);
2360 ::decode(remote_d_type
, bl
);
2361 ::decode(old_ctime
, bl
);
// Dump one rename dentry record to formatter `f`: directory fragment and
// its saved mtime/rctime, ino, remote ino, dentry name, a human-readable
// remote dentry type, and the saved ctime.
// The dentry type is converted to an S_IFMT file-type value and mapped to
// "file", "symlink", "directory" or "UNKNOWN-<n>".
// NOTE(review): the switch/case labels selecting among those strings are
// not visible in this excerpt.
2365 void rename_rollback::drec::dump(Formatter
*f
) const
2367 f
->dump_stream("directory fragment") << dirfrag
;
2368 f
->dump_stream("directory old mtime") << dirfrag_old_mtime
;
2369 f
->dump_stream("directory old rctime") << dirfrag_old_rctime
;
2370 f
->dump_int("ino", ino
);
2371 f
->dump_int("remote ino", remote_ino
);
2372 f
->dump_string("dname", dname
);
2373 uint32_t type
= DTTOIF(remote_d_type
) & S_IFMT
; // convert to type entries
2377 type_string
= "file"; break;
2379 type_string
= "symlink"; break;
2381 type_string
= "directory"; break;
2383 type_string
= "UNKNOWN-" + stringify((int)type
); break;
2385 f
->dump_string("remote dtype", type_string
);
2386 f
->dump_stream("old ctime") << old_ctime
;
// Append test instances to `ls`: a single drec whose remote_d_type is set
// to the dentry type of a regular file (IFTODT(S_IFREG)).
2389 void rename_rollback::drec::generate_test_instances(list
<drec
*>& ls
)
2391 ls
.push_back(new drec());
2392 ls
.back()->remote_d_type
= IFTODT(S_IFREG
);
// Serialize this rename_rollback record into `bl` as struct version 2
// (compat 2). Fields written here, in order: reqid, orig_src, orig_dest,
// ctime.
2395 void rename_rollback::encode(bufferlist
&bl
) const
2397 ENCODE_START(2, 2, bl
);
2398 ::encode(reqid
, bl
);
2399 encode(orig_src
, bl
);
2400 encode(orig_dest
, bl
);
2402 ::encode(ctime
, bl
);
// Deserialize a rename_rollback record from `bl` (legacy-compatible
// framing, version 2). Fields read here, in order: reqid, orig_src,
// orig_dest, ctime -- mirroring encode().
2406 void rename_rollback::decode(bufferlist::iterator
&bl
)
2408 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
2409 ::decode(reqid
, bl
);
2410 decode(orig_src
, bl
);
2411 decode(orig_dest
, bl
);
2413 ::decode(ctime
, bl
);
// Dump this rollback record to formatter `f`: the request id, three object
// sections ("original src drec", "original dest drec", "stray drec"), and
// the ctime.
// NOTE(review): the calls that fill each object section are not visible in
// this excerpt.
2417 void rename_rollback::dump(Formatter
*f
) const
2419 f
->dump_stream("request id") << reqid
;
2420 f
->open_object_section("original src drec");
2422 f
->close_section(); // original src drec
2423 f
->open_object_section("original dest drec");
2425 f
->close_section(); // original dest drec
2426 f
->open_object_section("stray drec");
2428 f
->close_section(); // stray drec
2429 f
->dump_stream("ctime") << ctime
;
// Append test instances to `ls`: a single rename_rollback whose three
// dentry records (orig_src, orig_dest, stray) all have remote_d_type set to
// the dentry type of a regular file (IFTODT(S_IFREG)).
2432 void rename_rollback::generate_test_instances(list
<rename_rollback
*>& ls
)
2434 ls
.push_back(new rename_rollback());
2435 ls
.back()->orig_src
.remote_d_type
= IFTODT(S_IFREG
);
2436 ls
.back()->orig_dest
.remote_d_type
= IFTODT(S_IFREG
);
2437 ls
.back()->stray
.remote_d_type
= IFTODT(S_IFREG
);
// Serialize this ESlaveUpdate into `bl` as struct version 3 (compat 3).
// Fields written here, in order: stamp, reqid, master, origop,
// commit (encoded with `features`), rollback.
2440 void ESlaveUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2442 ENCODE_START(3, 3, bl
);
2443 ::encode(stamp
, bl
);
2445 ::encode(reqid
, bl
);
2446 ::encode(master
, bl
);
2448 ::encode(origop
, bl
);
2449 ::encode(commit
, bl
, features
);
2450 ::encode(rollback
, bl
);
// Deserialize an ESlaveUpdate from `bl` (legacy-compatible framing,
// version 3). Fields read here, in order: stamp, reqid, master, origop,
// commit, rollback -- mirroring encode().
2454 void ESlaveUpdate::decode(bufferlist::iterator
&bl
)
2456 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2458 ::decode(stamp
, bl
);
2460 ::decode(reqid
, bl
);
2461 ::decode(master
, bl
);
2463 ::decode(origop
, bl
);
2464 ::decode(commit
, bl
);
2465 ::decode(rollback
, bl
);
// Dump this event to formatter `f`: a "metablob" object section, the size
// of the rollback blob, the type string, the request id, the master rank,
// the op and the original op.
2469 void ESlaveUpdate::dump(Formatter
*f
) const
2471 f
->open_object_section("metablob");
2473 f
->close_section(); // metablob
2475 f
->dump_int("rollback length", rollback
.length());
2476 f
->dump_string("type", type
);
2477 f
->dump_stream("metareqid") << reqid
;
2478 f
->dump_int("master", master
);
2479 f
->dump_int("op", op
);
2480 f
->dump_int("original op", origop
);
// Append test instances to `ls`: a single default-constructed
// ESlaveUpdate.
2483 void ESlaveUpdate::generate_test_instances(list
<ESlaveUpdate
*>& ls
)
2485 ls
.push_back(new ESlaveUpdate());
// Replay a slave-side update record. Behaviour per operation:
//  - OP_PREPARE:  build an MDSlaveUpdate holding the rollback blob, apply
//                 the commit metablob, and register the update as
//                 uncommitted for (reqid, master).
//  - OP_COMMIT:   look up the saved prepare; finish it, or log that there
//                 was no previously saved prepare.
//  - OP_ROLLBACK: apply the commit metablob (which the log message calls the
//                 rollback commit blob) and finish any saved prepare.
//  - otherwise:   report "invalid op" to the cluster log; ceph_abort() is
//                 expected to be unreachable.
// NOTE(review): the switch header, the null checks on `su`, and the
// damaged() call referenced by the abort comment are not visible in this
// excerpt.
2489 void ESlaveUpdate::replay(MDSRank
*mds
)
2493 case ESlaveUpdate::OP_PREPARE
:
2494 dout(10) << "ESlaveUpdate.replay prepare " << reqid
<< " for mds." << master
2495 << ": applying commit, saving rollback info" << dendl
;
2496 su
= new MDSlaveUpdate(origop
, rollback
, _segment
->slave_updates
);
2497 commit
.replay(mds
, _segment
, su
);
2498 mds
->mdcache
->add_uncommitted_slave_update(reqid
, master
, su
);
2501 case ESlaveUpdate::OP_COMMIT
:
2502 su
= mds
->mdcache
->get_uncommitted_slave_update(reqid
, master
);
2504 dout(10) << "ESlaveUpdate.replay commit " << reqid
<< " for mds." << master
<< dendl
;
2505 mds
->mdcache
->finish_uncommitted_slave_update(reqid
, master
);
2507 dout(10) << "ESlaveUpdate.replay commit " << reqid
<< " for mds." << master
2508 << ": ignoring, no previously saved prepare" << dendl
;
2512 case ESlaveUpdate::OP_ROLLBACK
:
2513 dout(10) << "ESlaveUpdate.replay abort " << reqid
<< " for mds." << master
2514 << ": applying rollback commit blob" << dendl
;
2515 commit
.replay(mds
, _segment
);
2516 su
= mds
->mdcache
->get_uncommitted_slave_update(reqid
, master
);
2518 mds
->mdcache
->finish_uncommitted_slave_update(reqid
, master
);
2522 mds
->clog
->error() << "invalid op in ESlaveUpdate";
2524 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2529 // -----------------------
// Serialize this ESubtreeMap into `bl` as struct version 6 (compat 5):
// stamp, metablob (encoded with `features`), subtrees, ambiguous_subtrees,
// expire_pos, event_seq.
2532 void ESubtreeMap::encode(bufferlist
& bl
, uint64_t features
) const
2534 ENCODE_START(6, 5, bl
);
2535 ::encode(stamp
, bl
);
2536 ::encode(metablob
, bl
, features
);
2537 ::encode(subtrees
, bl
);
2538 ::encode(ambiguous_subtrees
, bl
);
2539 ::encode(expire_pos
, bl
);
2540 ::encode(event_seq
, bl
);
// Deserialize an ESubtreeMap from `bl` (legacy-compatible framing,
// version 6): stamp, metablob, subtrees, ambiguous_subtrees, expire_pos,
// event_seq -- the same order encode() writes them.
// NOTE(review): any struct_v guards around the later fields are not
// visible in this excerpt.
2544 void ESubtreeMap::decode(bufferlist::iterator
&bl
)
2546 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl
);
2548 ::decode(stamp
, bl
);
2549 ::decode(metablob
, bl
);
2550 ::decode(subtrees
, bl
);
2552 ::decode(ambiguous_subtrees
, bl
);
2554 ::decode(expire_pos
, bl
);
2556 ::decode(event_seq
, bl
);
// Dump this subtree map to formatter `f`:
//  - a "metablob" object section,
//  - a "subtrees" array with one "tree" object per entry, giving the root
//    dirfrag followed by each of its bound dirfrags,
//  - an "ambiguous subtrees" array of dirfrags,
//  - the expire position as an integer.
2560 void ESubtreeMap::dump(Formatter
*f
) const
2562 f
->open_object_section("metablob");
2564 f
->close_section(); // metablob
2566 f
->open_array_section("subtrees");
2567 for(map
<dirfrag_t
,vector
<dirfrag_t
> >::const_iterator i
= subtrees
.begin();
2568 i
!= subtrees
.end(); ++i
) {
2569 f
->open_object_section("tree");
2570 f
->dump_stream("root dirfrag") << i
->first
;
2571 for (vector
<dirfrag_t
>::const_iterator j
= i
->second
.begin();
2572 j
!= i
->second
.end(); ++j
) {
2573 f
->dump_stream("bound dirfrag") << *j
;
2575 f
->close_section(); // tree
2577 f
->close_section(); // subtrees
2579 f
->open_array_section("ambiguous subtrees");
2580 for(set
<dirfrag_t
>::const_iterator i
= ambiguous_subtrees
.begin();
2581 i
!= ambiguous_subtrees
.end(); ++i
) {
2582 f
->dump_stream("dirfrag") << *i
;
2584 f
->close_section(); // ambiguous subtrees
2586 f
->dump_int("expire position", expire_pos
);
// Append test instances to `ls`: a single default-constructed ESubtreeMap.
2589 void ESubtreeMap::generate_test_instances(list
<ESubtreeMap
*>& ls
)
2591 ls
.push_back(new ESubtreeMap());
// Replay a journaled subtree map.
//
// First, if this event carries an expire_pos later than the journaler's,
// advance the journaler's expire position.
//
// If the cache already has subtrees, the journaled map is only verified
// against the cache. Each mismatch is reported to the cluster log:
//   - a subtree root missing from the cache, not a subtree there, or not
//     owned by this MDS,
//   - a journaled bound that is missing from the cache or is not a bound of
//     the cached subtree, and any extra bound left over in the cache,
//   - disagreement about whether a subtree is an ambiguous import,
//   - a cached subtree owned by this MDS that the journal does not list.
// The journaled maps and the cache's view are then logged, and with
// mds_debug_subtrees set the error count is asserted to be zero.
//
// Otherwise the subtree spanning tree is rebuilt: the metablob is replayed,
// each journaled subtree gets its auth restored (ambiguous ones are
// re-registered as ambiguous imports with this MDS on both sides of the
// authority pair), and auth bits are recalculated.
//
// NOTE(review): several control-flow lines (null checks, error counting,
// loop increments, the return/else separating the two paths) are not
// visible in this excerpt.
2594 void ESubtreeMap::replay(MDSRank
*mds
)
2596 if (expire_pos
&& expire_pos
> mds
->mdlog
->journaler
->get_expire_pos())
2597 mds
->mdlog
->journaler
->set_expire_pos(expire_pos
);
2599 // suck up the subtree map?
2600 if (mds
->mdcache
->is_subtrees()) {
2601 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl
;
// Pass 1: every journaled subtree must match what the cache holds.
2604 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2605 p
!= subtrees
.end();
2607 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2609 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2610 << " subtree root " << p
->first
<< " not in cache";
2615 if (!mds
->mdcache
->is_subtree(dir
)) {
2616 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2617 << " subtree root " << p
->first
<< " not a subtree in cache";
2621 if (dir
->get_dir_auth().first
!= mds
->get_nodeid()) {
2622 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2623 << " subtree root " << p
->first
2624 << " is not mine in cache (it's " << dir
->get_dir_auth() << ")";
2629 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
)
2630 mds
->mdcache
->get_force_dirfrag(*q
, true);
// Compare journaled bounds with the bounds the cache has for this subtree.
2633 mds
->mdcache
->get_subtree_bounds(dir
, bounds
);
2634 for (vector
<dirfrag_t
>::iterator q
= p
->second
.begin(); q
!= p
->second
.end(); ++q
) {
2635 CDir
*b
= mds
->mdcache
->get_dirfrag(*q
);
2637 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2638 << " subtree " << p
->first
<< " bound " << *q
<< " not in cache";
2642 if (bounds
.count(b
) == 0) {
2643 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2644 << " subtree " << p
->first
<< " bound " << *q
<< " not a bound in cache";
// Whatever is still in `bounds` here is reported as an extra cache bound.
2650 for (set
<CDir
*>::iterator q
= bounds
.begin(); q
!= bounds
.end(); ++q
) {
2651 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2652 << " subtree " << p
->first
<< " has extra bound in cache " << (*q
)->dirfrag();
2656 if (ambiguous_subtrees
.count(p
->first
)) {
2657 if (!mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2658 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2659 << " subtree " << p
->first
<< " is ambiguous but is not in our cache";
2663 if (mds
->mdcache
->have_ambiguous_import(p
->first
)) {
2664 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2665 << " subtree " << p
->first
<< " is not ambiguous but is in our cache";
// Pass 2: every cached subtree owned by this MDS must appear in the journal.
2672 mds
->mdcache
->list_subtrees(subs
);
2673 for (list
<CDir
*>::iterator p
= subs
.begin(); p
!= subs
.end(); ++p
) {
2675 if (dir
->get_dir_auth().first
!= mds
->get_nodeid())
2677 if (subtrees
.count(dir
->dirfrag()) == 0) {
2678 mds
->clog
->error() << " replayed ESubtreeMap at " << get_start_off()
2679 << " does not include cache subtree " << dir
->dirfrag();
2685 dout(0) << "journal subtrees: " << subtrees
<< dendl
;
2686 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees
<< dendl
;
2687 mds
->mdcache
->show_subtrees();
// Mismatches are fatal only when mds_debug_subtrees is enabled.
2688 assert(!g_conf
->mds_debug_subtrees
|| errors
== 0);
2693 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl
;
2695 // first, stick the spanning tree in my cache
2696 //metablob.print(*_dout);
2697 metablob
.replay(mds
, _segment
);
2699 // restore import/export maps
2700 for (map
<dirfrag_t
, vector
<dirfrag_t
> >::iterator p
= subtrees
.begin();
2701 p
!= subtrees
.end();
2703 CDir
*dir
= mds
->mdcache
->get_dirfrag(p
->first
);
2705 if (ambiguous_subtrees
.count(p
->first
)) {
2707 mds
->mdcache
->add_ambiguous_import(p
->first
, p
->second
);
2708 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
,
2709 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2712 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, p
->second
, mds
->get_nodeid());
2716 mds
->mdcache
->recalc_auth_bits(true);
2718 mds
->mdcache
->show_subtrees();
2723 // -----------------------
// Replay a directory-fragmentation event for (ino, basefrag, bits).
// Visible steps:
//  - look up the inode (it may be absent from the cache, see the comment
//    below),
//  - record the operation as an uncommitted fragment and apply the split or
//    merge via adjust_dir_fragments(),
//  - for a rollback: collect the current leaves under basefrag, then undo
//    the change, either by applying -bits (old-format events with no
//    orig_frags) or by forcing each original fragment back, and roll back
//    the uncommitted-fragment record,
//  - finish the uncommitted-fragment record,
//  - replay the metablob and, with mds_debug_frag set, verify the inode's
//    dirfrags.
// NOTE(review): the switch on `op` and the `if (in)` guards that select
// among these steps are not visible in this excerpt.
2726 void EFragment::replay(MDSRank
*mds
)
2728 dout(10) << "EFragment.replay " << op_name(op
) << " " << ino
<< " " << basefrag
<< " by " << bits
<< dendl
;
2730 list
<CDir
*> resultfrags
;
2731 list
<MDSInternalContextBase
*> waiters
;
2732 list
<frag_t
> old_frags
;
2734 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2735 // it will be once we replay the metablob , but first we need to
2736 // refragment anything we already have in the cache.
2737 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2741 mds
->mdcache
->add_uncommitted_fragment(dirfrag_t(ino
, basefrag
), bits
, orig_frags
, _segment
, &rollback
);
2744 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, bits
, resultfrags
, waiters
, true);
2749 in
->dirfragtree
.get_leaves_under(basefrag
, old_frags
);
2750 if (orig_frags
.empty()) {
2751 // old format EFragment
2752 mds
->mdcache
->adjust_dir_fragments(in
, basefrag
, -bits
, resultfrags
, waiters
, true);
2754 for (list
<frag_t
>::iterator p
= orig_frags
.begin(); p
!= orig_frags
.end(); ++p
)
2755 mds
->mdcache
->force_dir_fragment(in
, *p
);
2758 mds
->mdcache
->rollback_uncommitted_fragment(dirfrag_t(ino
, basefrag
), old_frags
);
2763 mds
->mdcache
->finish_uncommitted_fragment(dirfrag_t(ino
, basefrag
), op
);
2770 metablob
.replay(mds
, _segment
);
2771 if (in
&& g_conf
->mds_debug_frag
)
2772 in
->verify_dirfrags();
// Serialize this EFragment into `bl` as struct version 5 (compat 4).
// Fields written here, in order: stamp, basefrag, metablob (encoded with
// `features`), orig_frags, rollback.
2775 void EFragment::encode(bufferlist
&bl
, uint64_t features
) const {
2776 ENCODE_START(5, 4, bl
);
2777 ::encode(stamp
, bl
);
2780 ::encode(basefrag
, bl
);
2782 ::encode(metablob
, bl
, features
);
2783 ::encode(orig_frags
, bl
);
2784 ::encode(rollback
, bl
);
// Deserialize an EFragment from `bl` (legacy-compatible framing,
// version 5). Fields read here, in order: stamp, basefrag, metablob; then
// orig_frags and rollback, which are read only when the encoded struct
// version is at least 5.
2788 void EFragment::decode(bufferlist::iterator
&bl
) {
2789 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl
);
2791 ::decode(stamp
, bl
);
2795 ::decode(basefrag
, bl
);
2797 ::decode(metablob
, bl
);
// Older encodings stop here; these two fields exist from version 5 on.
2798 if (struct_v
>= 5) {
2799 ::decode(orig_frags
, bl
);
2800 ::decode(rollback
, bl
);
// Dump this event to formatter `f`: the operation name, inode number, base
// fragment and the number of bits. The metablob is not dumped (the code for
// it is commented out below).
2805 void EFragment::dump(Formatter
*f
) const
2807 /*f->open_object_section("Metablob");
2808 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2809 f->close_section();*/
2810 f
->dump_string("op", op_name(op
));
2811 f
->dump_stream("ino") << ino
;
2812 f
->dump_stream("base frag") << basefrag
;
2813 f
->dump_int("bits", bits
);
// Append test instances to `ls`: one default-constructed EFragment, and a
// second with op set to OP_PREPARE and bits set to 5.
2816 void EFragment::generate_test_instances(list
<EFragment
*>& ls
)
2818 ls
.push_back(new EFragment
);
2819 ls
.push_back(new EFragment
);
2820 ls
.back()->op
= OP_PREPARE
;
2822 ls
.back()->bits
= 5;
// Serialize this dirfrag_rollback into `bl` as struct version 1
// (compat 1); the only field written is fnode.
2825 void dirfrag_rollback::encode(bufferlist
&bl
) const
2827 ENCODE_START(1, 1, bl
);
2828 ::encode(fnode
, bl
);
// Deserialize a dirfrag_rollback from `bl` (version 1); the only field
// read is fnode.
2832 void dirfrag_rollback::decode(bufferlist::iterator
&bl
)
2834 DECODE_START(1, bl
);
2835 ::decode(fnode
, bl
);
2841 // =========================================================================
2843 // -----------------------
// Replay an export of the subtree rooted at `base`:
//  - apply the metablob,
//  - resolve the base dirfrag and each journaled bound to cached CDir
//    objects,
//  - set the bounded subtree's authority to CDIR_AUTH_UNDEF,
//  - then try to trim the now non-auth subtree from the cache.
// NOTE(review): the loop condition/increment and any asserts on the looked
// up dirfrags are not visible in this excerpt.
2846 void EExport::replay(MDSRank
*mds
)
2848 dout(10) << "EExport.replay " << base
<< dendl
;
2849 metablob
.replay(mds
, _segment
);
2851 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2854 set
<CDir
*> realbounds
;
2855 for (set
<dirfrag_t
>::iterator p
= bounds
.begin();
2858 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2860 realbounds
.insert(bd
);
2864 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
, CDIR_AUTH_UNDEF
);
2866 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
// Serialize this EExport into `bl` as struct version 4 (compat 3).
// Fields written here, in order: stamp, metablob (encoded with `features`),
// bounds, target.
2869 void EExport::encode(bufferlist
& bl
, uint64_t features
) const
2871 ENCODE_START(4, 3, bl
);
2872 ::encode(stamp
, bl
);
2873 ::encode(metablob
, bl
, features
);
2875 ::encode(bounds
, bl
);
2876 ::encode(target
, bl
);
// Deserialize an EExport from `bl` using legacy-compatible framing.
// Fields read here, in order: stamp, metablob, bounds, target.
// NOTE(review): DECODE_START is given version 3 while encode() writes
// version 4; any struct_v guard around `target` is not visible here --
// confirm older encodings are handled.
2880 void EExport::decode(bufferlist::iterator
&bl
)
2882 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2884 ::decode(stamp
, bl
);
2885 ::decode(metablob
, bl
);
2887 ::decode(bounds
, bl
);
2889 ::decode(target
, bl
);
// Dump this event to formatter `f`: the stamp as a floating-point value,
// the base dirfrag, and a "bounds dirfrags" array with one entry per bound.
// The metablob is not dumped (the code for it is commented out below).
2893 void EExport::dump(Formatter
*f
) const
2895 f
->dump_float("stamp", (double)stamp
);
2896 /*f->open_object_section("Metablob");
2897 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2898 f->close_section();*/
2899 f
->dump_stream("base dirfrag") << base
;
2900 f
->open_array_section("bounds dirfrags");
2901 for (set
<dirfrag_t
>::const_iterator i
= bounds
.begin();
2902 i
!= bounds
.end(); ++i
) {
2903 f
->dump_stream("dirfrag") << *i
;
2905 f
->close_section(); // bounds dirfrags
// Append test instances to `ls`: a single default-constructed EExport.
2908 void EExport::generate_test_instances(list
<EExport
*>& ls
)
2910 EExport
*sample
= new EExport();
2911 ls
.push_back(sample
);
2915 // -----------------------
// Record this event's client map version (cmapv) as the owning log
// segment's sessionmapv.
2918 void EImportStart::update_segment()
2920 _segment
->sessionmapv
= cmapv
;
// Replay the start of a subtree import rooted at `base`:
//  - apply the metablob,
//  - register (base, bounds) as an ambiguous import,
//  - resolve the bounds to cached dirfrags, clearing STATE_AUTH on any bound
//    that is not a subtree root, and mark the bounded subtree with this MDS
//    on both sides of the authority pair so it is not trimmed,
//  - bring the session map up to cmapv: a no-op if it is already there,
//    otherwise replay the client session opens. If the version still does
//    not match cmapv afterwards, the failure is logged and reported to the
//    cluster log; the ceph_abort() is expected to be unreachable.
// NOTE(review): the loop condition/increment, the decode of client_map into
// `cm`, and the damaged() call referenced by the abort comment are not
// visible in this excerpt.
2923 void EImportStart::replay(MDSRank
*mds
)
2925 dout(10) << "EImportStart.replay " << base
<< " bounds " << bounds
<< dendl
;
2926 //metablob.print(*_dout);
2927 metablob
.replay(mds
, _segment
);
2929 // put in ambiguous import list
2930 mds
->mdcache
->add_ambiguous_import(base
, bounds
);
2932 // set auth partially to us so we don't trim it
2933 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2936 set
<CDir
*> realbounds
;
2937 for (vector
<dirfrag_t
>::iterator p
= bounds
.begin();
2940 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2942 if (!bd
->is_subtree_root())
2943 bd
->state_clear(CDir::STATE_AUTH
);
2944 realbounds
.insert(bd
);
2947 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
,
2948 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2950 // open client sessions?
2951 if (mds
->sessionmap
.get_version() >= cmapv
) {
2952 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2953 << " >= " << cmapv
<< ", noop" << dendl
;
2955 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2956 << " < " << cmapv
<< dendl
;
2957 map
<client_t
,entity_inst_t
> cm
;
2958 bufferlist::iterator blp
= client_map
.begin();
2960 mds
->sessionmap
.replay_open_sessions(cm
);
// A version mismatch after replaying the opens is treated as journal damage.
2961 if (mds
->sessionmap
.get_version() != cmapv
)
2963 derr
<< "sessionmap version " << mds
->sessionmap
.get_version()
2964 << " != cmapv " << cmapv
<< dendl
;
2965 mds
->clog
->error() << "failure replaying journal (EImportStart)";
2967 ceph_abort(); // Should be unreachable because damaged() calls respawn()
// Serialize this EImportStart into `bl` as struct version 4 (compat 3).
// Fields written here, in order: stamp, metablob (encoded with `features`),
// bounds, cmapv, client_map.
2973 void EImportStart::encode(bufferlist
&bl
, uint64_t features
) const {
2974 ENCODE_START(4, 3, bl
);
2975 ::encode(stamp
, bl
);
2977 ::encode(metablob
, bl
, features
);
2978 ::encode(bounds
, bl
);
2979 ::encode(cmapv
, bl
);
2980 ::encode(client_map
, bl
);
// Deserialize an EImportStart from `bl` using legacy-compatible framing.
// Fields read here, in order: stamp, metablob, bounds, cmapv, client_map.
// NOTE(review): DECODE_START is given version 3 while encode() writes
// version 4; any struct_v-guarded fields are not visible in this excerpt.
2985 void EImportStart::decode(bufferlist::iterator
&bl
) {
2986 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
2988 ::decode(stamp
, bl
);
2990 ::decode(metablob
, bl
);
2991 ::decode(bounds
, bl
);
2992 ::decode(cmapv
, bl
);
2993 ::decode(client_map
, bl
);
// Dump this event to formatter `f`: the base dirfrag and a
// "boundary dirfrags" array with one "frag" entry per bound.
2999 void EImportStart::dump(Formatter
*f
) const
3001 f
->dump_stream("base dirfrag") << base
;
3002 f
->open_array_section("boundary dirfrags");
3003 for (vector
<dirfrag_t
>::const_iterator iter
= bounds
.begin();
3004 iter
!= bounds
.end(); ++iter
) {
3005 f
->dump_stream("frag") << *iter
;
// Append test instances to `ls`: a single default-constructed
// EImportStart.
3010 void EImportStart::generate_test_instances(list
<EImportStart
*>& ls
)
3012 ls
.push_back(new EImportStart
);
3015 // -----------------------
// Replay the end of a subtree import rooted at `base`.
// When the cache holds a matching ambiguous import it is resolved:
// finish_ambiguous_import() on one path; on the other the subtree's
// authority is reset to CDIR_AUTH_UNDEF over the recorded bounds, the
// ambiguous import is cancelled, and the non-auth subtree is trimmed.
// When there is no ambiguous import for `base`, the event is logged and
// reported to the cluster log as a replay failure; the ceph_abort() is
// expected to be unreachable.
// NOTE(review): the `if (success)`/else headers and the damaged() call
// referenced by the abort comment are not visible in this excerpt.
3018 void EImportFinish::replay(MDSRank
*mds
)
3020 if (mds
->mdcache
->have_ambiguous_import(base
)) {
3021 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
<< dendl
;
3023 mds
->mdcache
->finish_ambiguous_import(base
);
3025 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
3027 vector
<dirfrag_t
> bounds
;
3028 mds
->mdcache
->get_ambiguous_import_bounds(base
, bounds
);
3029 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, bounds
, CDIR_AUTH_UNDEF
);
3030 mds
->mdcache
->cancel_ambiguous_import(dir
);
3031 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
3034 // this shouldn't happen unless this is an old journal
3035 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
3036 << " on subtree not marked as ambiguous"
3038 mds
->clog
->error() << "failure replaying journal (EImportFinish)";
3040 ceph_abort(); // Should be unreachable because damaged() calls respawn()
// Serialize this EImportFinish into `bl` as struct version 3 (compat 3).
// Fields written here, in order: stamp, success. `features` is not used by
// the visible statements.
3044 void EImportFinish::encode(bufferlist
& bl
, uint64_t features
) const
3046 ENCODE_START(3, 3, bl
);
3047 ::encode(stamp
, bl
);
3049 ::encode(success
, bl
);
// Deserialize an EImportFinish from `bl` (legacy-compatible framing,
// version 3). Fields read here, in order: stamp, success.
3053 void EImportFinish::decode(bufferlist::iterator
&bl
)
3055 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
3057 ::decode(stamp
, bl
);
3059 ::decode(success
, bl
);
// Dump this event to formatter `f`: the base dirfrag and the success flag
// (rendered as the string "true" or "false").
3063 void EImportFinish::dump(Formatter
*f
) const
3065 f
->dump_stream("base dirfrag") << base
;
3066 f
->dump_string("success", success
? "true" : "false");
// Append test instances to `ls`: one default-constructed EImportFinish,
// and a second with success set to true.
3068 void EImportFinish::generate_test_instances(list
<EImportFinish
*>& ls
)
3070 ls
.push_back(new EImportFinish
);
3071 ls
.push_back(new EImportFinish
);
3072 ls
.back()->success
= true;
3076 // ------------------------
// Serialize this EResetJournal into `bl` as struct version 2 (compat 2);
// the only field written is stamp. `features` is not used by the visible
// statements.
3079 void EResetJournal::encode(bufferlist
& bl
, uint64_t features
) const
3081 ENCODE_START(2, 2, bl
);
3082 ::encode(stamp
, bl
);
// Deserialize an EResetJournal from `bl` (legacy-compatible framing,
// version 2); the only field read is stamp.
3086 void EResetJournal::decode(bufferlist::iterator
&bl
)
3088 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
3089 ::decode(stamp
, bl
);
// Dump the event's stamp to formatter `f` as a stream named "timestamp".
3093 void EResetJournal::dump(Formatter
*f
) const
3095 f
->dump_stream("timestamp") << stamp
;
// Append test instances to `ls`: a single default-constructed
// EResetJournal.
3098 void EResetJournal::generate_test_instances(list
<EResetJournal
*>& ls
)
3100 ls
.push_back(new EResetJournal());
// Replay a journal reset:
//  - wipe the session map and reset the inode table's replay state,
//  - if this MDS is the root rank in the MDS map, open (or fetch) the root
//    inode's base dirfrag and claim subtree authority over it,
//  - do the same for this MDS's own "my" inode dirfrag,
//  - recalculate auth bits and log the resulting subtree layout.
3103 void EResetJournal::replay(MDSRank
*mds
)
3105 dout(1) << "EResetJournal" << dendl
;
3107 mds
->sessionmap
.wipe();
3108 mds
->inotable
->replay_reset();
3110 if (mds
->mdsmap
->get_root() == mds
->get_nodeid()) {
3111 CDir
*rootdir
= mds
->mdcache
->get_root()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3112 mds
->mdcache
->adjust_subtree_auth(rootdir
, mds
->get_nodeid());
3115 CDir
*mydir
= mds
->mdcache
->get_myin()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3116 mds
->mdcache
->adjust_subtree_auth(mydir
, mds
->get_nodeid());
3118 mds
->mdcache
->recalc_auth_bits(true);
3120 mds
->mdcache
->show_subtrees();
// Serialize this ENoOp into `bl` as struct version 2 (compat 2): pad_size,
// followed by a loop that runs pad_size times with a 0xff pad byte.
// `features` is not used by the visible statements.
// NOTE(review): the loop body (presumably encoding `pad` once per
// iteration) is not visible in this excerpt -- confirm.
3124 void ENoOp::encode(bufferlist
&bl
, uint64_t features
) const
3126 ENCODE_START(2, 2, bl
);
3127 ::encode(pad_size
, bl
);
3128 uint8_t const pad
= 0xff;
3129 for (unsigned int i
= 0; i
< pad_size
; ++i
) {
// Deserialize an ENoOp from `bl` (version 2): read pad_size, require that
// exactly pad_size bytes remain, then skip over them.
// Throws buffer::end_of_buffer when the remaining length does not match
// pad_size, so that a malformed entry is reported rather than asserted on.
3136 void ENoOp::decode(bufferlist::iterator
&bl
)
3138 DECODE_START(2, bl
);
3139 ::decode(pad_size
, bl
);
3140 if (bl
.get_remaining() != pad_size
) {
3141 // This is spiritually an assertion, but expressing in a way that will let
3142 // journal debug tools catch it and recognise a malformed entry.
3143 throw buffer::end_of_buffer();
3145 bl
.advance(pad_size
);
// Replaying an ENoOp changes no state; it only logs how many padding bytes
// were skipped in the journal.
3151 void ENoOp::replay(MDSRank
*mds
)
3153 dout(4) << "ENoOp::replay, " << pad_size
<< " bytes skipped in journal" << dendl
;
3157 * If re-formatting an old journal that used absolute log position
3158 * references as segment sequence numbers, use this function to update
3162 * MDSRank instance, just used for logging
3164 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3167 * True if the event was modified.
3169 bool EMetaBlob::rewrite_truncate_finish(MDSRank
const *mds
,
3170 std::map
<log_segment_seq_t
, log_segment_seq_t
> const &old_to_new
)
3172 bool modified
= false;
3173 map
<inodeno_t
, log_segment_seq_t
> new_trunc_finish
;
3174 for (std::map
<inodeno_t
, log_segment_seq_t
>::iterator i
= truncate_finish
.begin();
3175 i
!= truncate_finish
.end(); ++i
) {
3176 if (old_to_new
.count(i
->second
)) {
3177 dout(20) << __func__
<< " applying segment seq mapping "
3178 << i
->second
<< " -> " << old_to_new
.find(i
->second
)->second
<< dendl
;
3179 new_trunc_finish
[i
->first
] = old_to_new
.find(i
->second
)->second
;
3182 dout(20) << __func__
<< " no segment seq mapping found for "
3183 << i
->second
<< dendl
;
3184 new_trunc_finish
[i
->first
] = i
->second
;
3187 truncate_finish
= new_trunc_finish
;