1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
25 #include "events/EUpdate.h"
26 #include "events/ESlaveUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
30 #include "events/EExport.h"
31 #include "events/EImportStart.h"
32 #include "events/EImportFinish.h"
33 #include "events/EFragment.h"
35 #include "events/ETableClient.h"
36 #include "events/ETableServer.h"
38 #include "include/stringify.h"
40 #include "LogSegment.h"
50 #include "MDSTableClient.h"
51 #include "MDSTableServer.h"
55 #define dout_context g_ceph_context
56 #define dout_subsys ceph_subsys_mds
58 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
61 // -----------------------
// LogSegment::try_to_expire
//
// Walks everything this log segment still references and, for each item that
// is not yet durable, starts the required flush/commit/save and hands it a
// fresh gather_bld.new_sub() completion. The caller can then inspect
// gather_bld to learn whether anything is still outstanding.
//
// mds        - rank whose mdcache, locker, mdlog, inotable, sessionmap and
//              table clients/servers are consulted.
// gather_bld - gather builder that collects one sub-context per pending item.
// op_prio    - priority forwarded to CDir::commit() and
//              CInode::store_backtrace().
//
// The mds_kill_journal_expire_at asserts mark numbered stages (1..5) of the
// procedure; each one fires when the config value equals that stage number.
64 void LogSegment::try_to_expire(MDSRank
*mds
, MDSGatherBuilder
&gather_bld
, int op_prio
)
68 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire" << dendl
;
70 assert(g_conf
->mds_kill_journal_expire_at
!= 1);
// Log every new/dirty dirfrag, dentry and inode and assert we are auth for it.
// Dirty dentries insert their dir into 'commit'.
73 for (elist
<CDir
*>::iterator p
= new_dirfrags
.begin(); !p
.end(); ++p
) {
74 dout(20) << " new_dirfrag " << **p
<< dendl
;
75 assert((*p
)->is_auth());
78 for (elist
<CDir
*>::iterator p
= dirty_dirfrags
.begin(); !p
.end(); ++p
) {
79 dout(20) << " dirty_dirfrag " << **p
<< dendl
;
80 assert((*p
)->is_auth());
83 for (elist
<CDentry
*>::iterator p
= dirty_dentries
.begin(); !p
.end(); ++p
) {
84 dout(20) << " dirty_dentry " << **p
<< dendl
;
85 assert((*p
)->is_auth());
86 commit
.insert((*p
)->get_dir());
// Dirty inodes: base inodes are stored directly with their own gather sub;
// the parent dentry's dir is what gets inserted into 'commit'.
88 for (elist
<CInode
*>::iterator p
= dirty_inodes
.begin(); !p
.end(); ++p
) {
89 dout(20) << " dirty_inode " << **p
<< dendl
;
90 assert((*p
)->is_auth());
91 if ((*p
)->is_base()) {
92 (*p
)->store(gather_bld
.new_sub());
94 commit
.insert((*p
)->get_parent_dn()->get_dir());
// Commit the collected dirs: commit now when the dir can be auth-pinned;
// the WAIT_UNFREEZE waiter is the alternative path (see the dout text).
97 if (!commit
.empty()) {
98 for (set
<CDir
*>::iterator p
= commit
.begin();
102 assert(dir
->is_auth());
103 if (dir
->can_auth_pin()) {
104 dout(15) << "try_to_expire committing " << *dir
<< dendl
;
105 dir
->commit(0, gather_bld
.new_sub(), false, op_prio
);
107 dout(15) << "try_to_expire waiting for unfreeze on " << *dir
<< dendl
;
108 dir
->add_waiter(CDir::WAIT_UNFREEZE
, gather_bld
.new_sub());
113 // master ops with possibly uncommitted slaves
114 for (set
<metareqid_t
>::iterator p
= uncommitted_masters
.begin();
115 p
!= uncommitted_masters
.end();
117 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p
<< dendl
;
118 mds
->mdcache
->wait_for_uncommitted_master(*p
, gather_bld
.new_sub());
121 // uncommitted fragments
122 for (set
<dirfrag_t
>::iterator p
= uncommitted_fragments
.begin();
123 p
!= uncommitted_fragments
.end();
125 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p
<< dendl
;
126 mds
->mdcache
->wait_for_uncommitted_fragment(*p
, gather_bld
.new_sub());
// Three scatterlock lists follow; each inode's filelock, dirfragtreelock and
// nestlock respectively is nudged with its own gather sub.
129 // nudge scatterlocks
130 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dir
.begin(); !p
.end(); ++p
) {
132 dout(10) << "try_to_expire waiting for dirlock flush on " << *in
<< dendl
;
133 mds
->locker
->scatter_nudge(&in
->filelock
, gather_bld
.new_sub());
135 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_dirfragtree
.begin(); !p
.end(); ++p
) {
137 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in
<< dendl
;
138 mds
->locker
->scatter_nudge(&in
->dirfragtreelock
, gather_bld
.new_sub());
140 for (elist
<CInode
*>::iterator p
= dirty_dirfrag_nest
.begin(); !p
.end(); ++p
) {
142 dout(10) << "try_to_expire waiting for nest flush on " << *in
<< dendl
;
143 mds
->locker
->scatter_nudge(&in
->nestlock
, gather_bld
.new_sub());
146 assert(g_conf
->mds_kill_journal_expire_at
!= 2);
// Open files are re-journaled into the current segment 'ls' via an EOpen
// event 'le'; inodes that no longer qualify are removed from the list.
148 // open files and snap inodes
149 if (!open_files
.empty()) {
150 assert(!mds
->mdlog
->is_capped()); // hmm FIXME
152 LogSegment
*ls
= mds
->mdlog
->get_current_segment();
154 elist
<CInode
*>::iterator p
= open_files
.begin(member_offset(CInode
, item_open_file
));
// Head (CEPH_NOSNAP) auth inodes with caps: requeue when caps are wanted.
158 if (in
->last
== CEPH_NOSNAP
&& in
->is_auth() &&
159 !in
->is_ambiguous_auth() && in
->is_any_caps()) {
160 if (in
->is_any_caps_wanted()) {
161 dout(20) << "try_to_expire requeueing open file " << *in
<< dendl
;
163 le
= new EOpen(mds
->mdlog
);
164 mds
->mdlog
->start_entry(le
);
166 le
->add_clean_inode(in
);
167 ls
->open_files
.push_back(&in
->item_open_file
);
169 // drop inodes that aren't wanted
170 dout(20) << "try_to_expire not requeueing and delisting unwanted file " << *in
<< dendl
;
171 in
->item_open_file
.remove_myself();
173 } else if (in
->last
!= CEPH_NOSNAP
&& !in
->client_snap_caps
.empty()) {
174 // journal snap inodes that need flush. This simplifies the mds failover handling
175 dout(20) << "try_to_expire requeueing snap needflush inode " << *in
<< dendl
;
177 le
= new EOpen(mds
->mdlog
);
178 mds
->mdlog
->start_entry(le
);
180 le
->add_clean_inode(in
);
181 ls
->open_files
.push_back(&in
->item_open_file
);
184 * we can get a capless inode here if we replay an open file, the client fails to
185 * reconnect it, but does REPLAY an open request (that adds it to the logseg). AFAICS
186 * it's ok for the client to replay an open on a file it doesn't have in it's cache
189 * this makes the mds less sensitive to strict open_file consistency, although it does
190 * make it easier to miss subtle problems.
192 dout(20) << "try_to_expire not requeueing and delisting capless file " << *in
<< dendl
;
193 in
->item_open_file
.remove_myself();
// Submit the EOpen event and wait until the log reports it safe.
197 mds
->mdlog
->submit_entry(le
);
198 mds
->mdlog
->wait_for_safe(gather_bld
.new_sub());
199 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl
;
203 assert(g_conf
->mds_kill_journal_expire_at
!= 3);
205 // backtraces to be stored/updated
206 for (elist
<CInode
*>::iterator p
= dirty_parent_inodes
.begin(); !p
.end(); ++p
) {
208 assert(in
->is_auth());
209 if (in
->can_auth_pin()) {
210 dout(15) << "try_to_expire waiting for storing backtrace on " << *in
<< dendl
;
211 in
->store_backtrace(gather_bld
.new_sub(), op_prio
);
213 dout(15) << "try_to_expire waiting for unfreeze on " << *in
<< dendl
;
214 in
->add_waiter(CInode::WAIT_UNFREEZE
, gather_bld
.new_sub());
218 assert(g_conf
->mds_kill_journal_expire_at
!= 4);
// Slave updates: each must not have a waiter yet; give it a gather sub.
221 for (elist
<MDSlaveUpdate
*>::iterator p
= slave_updates
.begin(member_offset(MDSlaveUpdate
,
224 MDSlaveUpdate
*su
= *p
;
225 dout(10) << "try_to_expire waiting on slave update " << su
<< dendl
;
226 assert(su
->waiter
== 0);
227 su
->waiter
= gather_bld
.new_sub();
// Save the inotable when this segment needs a version newer than committed.
231 if (inotablev
> mds
->inotable
->get_committed_version()) {
232 dout(10) << "try_to_expire saving inotable table, need " << inotablev
233 << ", committed is " << mds
->inotable
->get_committed_version()
234 << " (" << mds
->inotable
->get_committing_version() << ")"
236 mds
->inotable
->save(gather_bld
.new_sub(), inotablev
);
// Same check for the sessionmap.
240 if (sessionmapv
> mds
->sessionmap
.get_committed()) {
241 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
242 << ", committed is " << mds
->sessionmap
.get_committed()
243 << " (" << mds
->sessionmap
.get_committing() << ")"
245 mds
->sessionmap
.save(gather_bld
.new_sub(), sessionmapv
);
248 // updates to sessions for completed_requests
249 mds
->sessionmap
.save_if_dirty(touched_sessions
, &gather_bld
);
250 touched_sessions
.clear();
// For every table (keyed by p->first) wait for the ack of each pending tid;
// a tid in this list must not already be committed on the client.
252 // pending commit atids
253 for (map
<int, ceph::unordered_set
<version_t
> >::iterator p
= pending_commit_tids
.begin();
254 p
!= pending_commit_tids
.end();
256 MDSTableClient
*client
= mds
->get_table_client(p
->first
);
258 for (ceph::unordered_set
<version_t
>::iterator q
= p
->second
.begin();
259 q
!= p
->second
.end();
261 dout(10) << "try_to_expire " << get_mdstable_name(p
->first
) << " transaction " << *q
262 << " pending commit (not yet acked), waiting" << dendl
;
263 assert(!client
->has_committed(*q
));
264 client
->wait_for_ack(*q
, gather_bld
.new_sub());
// Table servers: save any table whose committed version is behind tablev.
269 for (map
<int, version_t
>::iterator p
= tablev
.begin();
272 MDSTableServer
*server
= mds
->get_table_server(p
->first
);
274 if (p
->second
> server
->get_committed_version()) {
275 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p
->first
)
276 << " to save, need " << p
->second
<< dendl
;
277 server
->save(gather_bld
.new_sub());
// Truncating inodes: wait on WAIT_TRUNC for each.
282 for (set
<CInode
*>::iterator p
= truncating_inodes
.begin();
283 p
!= truncating_inodes
.end();
285 dout(10) << "try_to_expire waiting for truncate of " << **p
<< dendl
;
286 (*p
)->add_waiter(CInode::WAIT_TRUNC
, gather_bld
.new_sub());
// Report the outcome: "waiting" when any sub was registered; the "success"
// message is preceded by the stage-5 kill assert.
289 if (gather_bld
.has_subs()) {
290 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire waiting" << dendl
;
293 assert(g_conf
->mds_kill_journal_expire_at
!= 5);
294 dout(6) << "LogSegment(" << seq
<< "/" << offset
<< ").try_to_expire success" << dendl
;
299 // -----------------------
302 EMetaBlob::EMetaBlob(MDLog
*mdlog
) : opened_ino(0), renamed_dirino(0),
303 inotablev(0), sessionmapv(0), allocated_ino(0),
304 last_subtree_map(0), event_seq(0)
// EMetaBlob::add_dir_context
//
// Collects the ancestor dentries that give 'dir' its context in this blob.
// Starting from dir, each step looks at the dir's inode (diri) and its
// projected parent dentry, files that dentry under either 'parents'
// ("definitely" needed) or 'maybe' (possibly redundant), and then moves up to
// parent->get_dir(). At the end the collected dentries are added to the blob
// in order via add_dentry(*p, false).
//
// dir  - dirfrag whose ancestry should be journaled.
// mode - when equal to TO_AUTH_SUBTREE_ROOT, the subtree-root checks below
//        are applied.
307 void EMetaBlob::add_dir_context(CDir
*dir
, int mode
)
309 MDSRank
*mds
= dir
->cache
->mds
;
311 list
<CDentry
*> parents
;
313 // it may be okay not to include the maybe items, if
314 // - we journaled the maybe child inode in this segment
315 // - that subtree turns out to be unambiguously auth
316 list
<CDentry
*> maybe
;
317 bool maybenot
= false;
320 // already have this dir? (we must always add in order)
321 if (lump_map
.count(dir
->dirfrag())) {
322 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") have lump " << dir
->dirfrag() << dendl
;
326 // stop at root/stray
327 CInode
*diri
= dir
->get_inode();
328 CDentry
*parent
= diri
->get_projected_parent_dn();
// Subtree-root handling: only non-export-bound subtree roots are examined.
330 if (mode
== TO_AUTH_SUBTREE_ROOT
) {
332 if (dir
->is_subtree_root() &&
333 !dir
->state_test(CDir::STATE_EXPORTBOUND
)) {
334 if (dir
->is_auth() && !dir
->is_ambiguous_auth() ) {
335 if (dir
->state_test(CDir::STATE_AUXSUBTREE
) &&
336 dir
->get_dir_auth().first
== diri
->authority().first
) {
337 // auxiliary subtree. treat it as normal dirfrag
338 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") auxiliary subtree " << dendl
;
340 // it's an auth subtree, we don't need maybe (if any), and we're done.
341 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached unambig auth subtree, don't need " << maybe
342 << " at " << *dir
<< dendl
;
347 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") reached ambig or !auth subtree, need " << maybe
348 << " at " << *dir
<< dendl
;
349 // we need the maybe list after all!
350 parents
.splice(parents
.begin(), maybe
);
355 // was the inode journaled in this blob?
356 if (event_seq
&& diri
->last_journaled
== event_seq
) {
357 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri this blob " << *diri
<< dendl
;
361 // have we journaled this inode since the last subtree map?
362 if (!maybenot
&& last_subtree_map
&& diri
->last_journaled
>= last_subtree_map
) {
363 dout(20) << "EMetaBlob::add_dir_context(" << dir
<< ") already have diri in this segment ("
364 << diri
->last_journaled
<< " >= " << last_subtree_map
<< "), setting maybenot flag "
// The parent dentry goes to the front of 'maybe' or 'parents', so both lists
// end up ordered from the topmost ancestor downwards.
374 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") maybe " << *parent
<< dendl
;
375 maybe
.push_front(parent
);
377 dout(25) << "EMetaBlob::add_dir_context(" << dir
<< ") definitely " << *parent
<< dendl
;
378 parents
.push_front(parent
);
// Step up one level.
381 dir
= parent
->get_dir();
// Prepend whatever is still in 'maybe' to 'parents'.
384 parents
.splice(parents
.begin(), maybe
);
// Every collected dentry must have a primary projected linkage.
386 dout(20) << "EMetaBlob::add_dir_context final: " << parents
<< dendl
;
387 for (list
<CDentry
*>::iterator p
= parents
.begin(); p
!= parents
.end(); ++p
) {
388 assert((*p
)->get_projected_linkage()->is_primary());
389 add_dentry(*p
, false);
// EMetaBlob::update_segment
//
// Copies this blob's inotablev and sessionmapv into the given log segment.
// The remaining comments record items that are handled elsewhere.
//
// ls - log segment whose inotablev / sessionmapv fields are written.
393 void EMetaBlob::update_segment(LogSegment
*ls
)
395 // dirty inode mtimes
396 // -> handled directly by Server.cc, replay()
398 // alloc table update?
400 ls
->inotablev
= inotablev
;
402 ls
->sessionmapv
= sessionmapv
;
405 // -> handled directly by Server.cc
408 // note the newest request per client
409 //if (!client_reqs.empty())
410 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
413 // EMetaBlob::fullbit
// EMetaBlob::fullbit::encode
//
// Serializes this fullbit into bl inside an ENCODE_START(8, 5, ...) envelope.
// The symlink target is written only for symlink inodes; dirfragtree and
// snapbl are written for directory inodes. inode and old_inodes are encoded
// with the peer feature bits.
//
// bl       - destination buffer.
// features - feature bits forwarded to the inode / old_inodes encoders.
415 void EMetaBlob::fullbit::encode(bufferlist
& bl
, uint64_t features
) const {
416 ENCODE_START(8, 5, bl
);
418 ::encode(dnfirst
, bl
);
419 ::encode(dnlast
, bl
);
421 ::encode(inode
, bl
, features
);
422 ::encode(xattrs
, bl
);
423 if (inode
.is_symlink())
424 ::encode(symlink
, bl
);
425 if (inode
.is_dir()) {
426 ::encode(dirfragtree
, bl
);
427 ::encode(snapbl
, bl
);
// old_inodes handling depends on whether the map is empty.
430 if (old_inodes
.empty()) {
434 ::encode(old_inodes
, bl
, features
);
437 ::encode(snapbl
, bl
);
438 ::encode(oldest_snap
, bl
);
// EMetaBlob::fullbit::decode
//
// Deserializes a fullbit from bl. Fields are read in the same order the
// encoder writes them, with extra handling keyed on struct_v for older
// encodings (e.g. the dir_layout_exists flag for struct_v 2 and 3).
//
// NOTE(review): the envelope here is DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5)
// while encode() uses ENCODE_START(8, 5); confirm the version difference is
// intentional.
//
// bl - iterator positioned at the start of an encoded fullbit.
442 void EMetaBlob::fullbit::decode(bufferlist::iterator
&bl
) {
443 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
445 ::decode(dnfirst
, bl
);
446 ::decode(dnlast
, bl
);
449 ::decode(xattrs
, bl
);
450 if (inode
.is_symlink())
451 ::decode(symlink
, bl
);
452 if (inode
.is_dir()) {
453 ::decode(dirfragtree
, bl
);
454 ::decode(snapbl
, bl
);
// struct_v 2 and 3 carried an optional, separately versioned dir layout.
455 if ((struct_v
== 2) || (struct_v
== 3)) {
456 bool dir_layout_exists
;
457 ::decode(dir_layout_exists
, bl
);
458 if (dir_layout_exists
) {
460 ::decode(dir_struct_v
, bl
); // default_file_layout version
461 ::decode(inode
.layout
, bl
); // and actual layout, that we care about
// Map the 'dirty' flag onto the state bitmask.
470 state
= dirty
? EMetaBlob::fullbit::STATE_DIRTY
: 0;
474 bool old_inodes_present
;
475 ::decode(old_inodes_present
, bl
);
476 if (old_inodes_present
) {
477 ::decode(old_inodes
, bl
);
480 if (!inode
.is_dir()) {
482 ::decode(snapbl
, bl
);
// oldest_snap is either decoded or set to CEPH_NOSNAP.
485 ::decode(oldest_snap
, bl
);
487 oldest_snap
= CEPH_NOSNAP
;
// EMetaBlob::fullbit::dump
//
// Writes a structured description of this fullbit to the formatter: dentry
// name, snapid range, dentry version, an "inode" section, the xattrs,
// type-specific fields (symlink target, or frag tree / snap blob / layout
// marker for directories), the state string, and any old inodes.
//
// f - formatter receiving the output.
492 void EMetaBlob::fullbit::dump(Formatter
*f
) const
494 f
->dump_string("dentry", dn
);
495 f
->dump_stream("snapid.first") << dnfirst
;
496 f
->dump_stream("snapid.last") << dnlast
;
497 f
->dump_int("dentry version", dnv
);
498 f
->open_object_section("inode");
500 f
->close_section(); // inode
// Each xattr value is copied into a string of its exact byte length.
501 f
->open_object_section("xattrs");
502 for (map
<string
, bufferptr
>::const_iterator iter
= xattrs
.begin();
503 iter
!= xattrs
.end(); ++iter
) {
504 string
s(iter
->second
.c_str(), iter
->second
.length());
505 f
->dump_string(iter
->first
.c_str(), s
);
507 f
->close_section(); // xattrs
508 if (inode
.is_symlink()) {
509 f
->dump_string("symlink", symlink
);
511 if (inode
.is_dir()) {
512 f
->dump_stream("frag tree") << dirfragtree
;
513 f
->dump_string("has_snapbl", snapbl
.length() ? "true" : "false");
// Only the presence of a layout is reported, via a fixed placeholder string.
514 if (inode
.has_layout()) {
515 f
->open_object_section("file layout policy");
517 f
->dump_string("layout", "the layout exists");
518 f
->close_section(); // file layout policy
521 f
->dump_string("state", state_string());
522 if (!old_inodes
.empty()) {
523 f
->open_array_section("old inodes");
524 for (old_inodes_t::const_iterator iter
= old_inodes
.begin();
525 iter
!= old_inodes
.end();
527 f
->open_object_section("inode");
528 f
->dump_int("snapid", iter
->first
);
529 iter
->second
.dump(f
);
530 f
->close_section(); // inode
532 f
->close_section(); // old inodes
// EMetaBlob::fullbit::generate_test_instances
//
// Appends one heap-allocated sample fullbit (dentry "/testdn", empty xattrs
// and snap blob) to ls.
//
// ls - list that receives the sample pointer.
536 void EMetaBlob::fullbit::generate_test_instances(list
<EMetaBlob::fullbit
*>& ls
)
540 map
<string
,bufferptr
> empty_xattrs
;
541 bufferlist empty_snapbl
;
542 fullbit
*sample
= new fullbit("/testdn", 0, 0, 0,
543 inode
, fragtree
, empty_xattrs
, "", 0, empty_snapbl
,
545 ls
.push_back(sample
);
// EMetaBlob::fullbit::update_inode
//
// Applies the state recorded in this fullbit to the in-memory inode 'in':
// dirfragtree (for directories) or symlink target (for symlinks), old_inodes,
// oldest_snap and the snap blob. Ends with a layout sanity check that logs a
// cluster error and aborts on an invalid layout.
//
// mds - rank used for mdcache queries and cluster logging.
// in  - inode to update.
548 void EMetaBlob::fullbit::update_inode(MDSRank
*mds
, CInode
*in
)
552 if (in
->inode
.is_dir()) {
// Only touch the fragtree when it actually differs.
553 if (!(in
->dirfragtree
== dirfragtree
)) {
554 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in
->dirfragtree
<< " -> "
555 << dirfragtree
<< " on " << *in
<< dendl
;
556 in
->dirfragtree
= dirfragtree
;
557 in
->force_dirfrags();
// With undefined authority, close dirfrags that are empty and trimmable.
558 if (in
->has_dirfrags() && in
->authority() == CDIR_AUTH_UNDEF
) {
560 in
->get_nested_dirfrags(ls
);
561 for (list
<CDir
*>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
563 if (dir
->get_num_any() == 0 &&
564 mds
->mdcache
->can_trim_non_auth_dirfrag(dir
)) {
565 dout(10) << " closing empty non-auth dirfrag " << *dir
<< dendl
;
566 in
->close_dirfrag(dir
->get_frag());
571 } else if (in
->inode
.is_symlink()) {
572 in
->symlink
= symlink
;
// Raise in->first to one past the newest old_inodes key if it is lower.
574 in
->old_inodes
= old_inodes
;
575 if (!in
->old_inodes
.empty()) {
576 snapid_t min_first
= in
->old_inodes
.rbegin()->first
+ 1;
577 if (min_first
> in
->first
)
578 in
->first
= min_first
;
582 * we can do this before linking hte inode bc the split_at would
583 * be a no-op.. we have no children (namely open snaprealms) to
586 in
->oldest_snap
= oldest_snap
;
587 in
->decode_snap_blob(snapbl
);
590 * In case there was anything malformed in the journal that we are
591 * replaying, do sanity checks on the inodes we're replaying and
592 * go damaged instead of letting any trash into a live cache
595 // Files must have valid layouts with a pool set
596 if (in
->inode
.layout
.pool_id
== -1 || !in
->inode
.layout
.is_valid()) {
597 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
598 << ": " << in
->inode
.layout
<< dendl
;
// Build the cluster-log message with the inode number in hex.
599 std::ostringstream oss
;
600 oss
<< "Invalid layout for inode 0x" << std::hex
<< in
->inode
.ino
601 << std::dec
<< " in journal";
602 mds
->clog
->error() << oss
.str();
604 ceph_abort(); // Should be unreachable because damaged() calls respawn()
609 // EMetaBlob::remotebit
// EMetaBlob::remotebit::encode
//
// Serializes this remotebit into bl inside an ENCODE_START(2, 2, ...)
// envelope; dnfirst, dnlast and d_type are among the fields written.
//
// bl - destination buffer.
611 void EMetaBlob::remotebit::encode(bufferlist
& bl
) const
613 ENCODE_START(2, 2, bl
);
615 ::encode(dnfirst
, bl
);
616 ::encode(dnlast
, bl
);
619 ::encode(d_type
, bl
);
// EMetaBlob::remotebit::decode
//
// Deserializes a remotebit from bl using
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, ...), reading fields in the same
// order encode() writes them.
//
// bl - iterator positioned at the start of an encoded remotebit.
624 void EMetaBlob::remotebit::decode(bufferlist::iterator
&bl
)
626 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
628 ::decode(dnfirst
, bl
);
629 ::decode(dnlast
, bl
);
632 ::decode(d_type
, bl
);
// EMetaBlob::remotebit::dump
//
// Writes this remotebit's fields to the formatter. The dentry type is
// converted with DTTOIF(d_type) & S_IFMT and reported as a short type name;
// an unrecognised type trips the "unknown d_type!" assert.
//
// f - formatter receiving the output.
637 void EMetaBlob::remotebit::dump(Formatter
*f
) const
639 f
->dump_string("dentry", dn
);
640 f
->dump_int("snapid.first", dnfirst
);
641 f
->dump_int("snapid.last", dnlast
);
642 f
->dump_int("dentry version", dnv
);
643 f
->dump_int("inodeno", ino
);
644 uint32_t type
= DTTOIF(d_type
) & S_IFMT
; // convert to type entries
// One assignment per recognised file type.
648 type_string
= "file"; break;
650 type_string
= "symlink"; break;
652 type_string
= "directory"; break;
654 type_string
= "fifo"; break;
656 type_string
= "chr"; break;
658 type_string
= "blk"; break;
660 type_string
= "sock"; break;
662 assert (0 == "unknown d_type!");
664 f
->dump_string("d_type", type_string
);
665 f
->dump_string("dirty", dirty
? "true" : "false");
668 void EMetaBlob::remotebit::
669 generate_test_instances(list
<EMetaBlob::remotebit
*>& ls
)
671 remotebit
*remote
= new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG
), false);
672 ls
.push_back(remote
);
675 // EMetaBlob::nullbit
// EMetaBlob::nullbit::encode
//
// Serializes this nullbit into bl inside an ENCODE_START(2, 2, ...) envelope;
// dnfirst and dnlast are among the fields written.
//
// bl - destination buffer.
677 void EMetaBlob::nullbit::encode(bufferlist
& bl
) const
679 ENCODE_START(2, 2, bl
);
681 ::encode(dnfirst
, bl
);
682 ::encode(dnlast
, bl
);
// EMetaBlob::nullbit::decode
//
// Deserializes a nullbit from bl using
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, ...), reading fields in the same
// order encode() writes them.
//
// bl - iterator positioned at the start of an encoded nullbit.
688 void EMetaBlob::nullbit::decode(bufferlist::iterator
&bl
)
690 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
);
692 ::decode(dnfirst
, bl
);
693 ::decode(dnlast
, bl
);
699 void EMetaBlob::nullbit::dump(Formatter
*f
) const
701 f
->dump_string("dentry", dn
);
702 f
->dump_int("snapid.first", dnfirst
);
703 f
->dump_int("snapid.last", dnlast
);
704 f
->dump_int("dentry version", dnv
);
705 f
->dump_string("dirty", dirty
? "true" : "false");
708 void EMetaBlob::nullbit::generate_test_instances(list
<nullbit
*>& ls
)
710 nullbit
*sample
= new nullbit("/test/dentry", 0, 10, 15, false);
711 nullbit
*sample2
= new nullbit("/test/dirty", 10, 20, 25, true);
712 ls
.push_back(sample
);
713 ls
.push_back(sample2
);
716 // EMetaBlob::dirlump
// EMetaBlob::dirlump::encode
//
// Serializes this dirlump into bl inside an ENCODE_START(2, 2, ...) envelope.
// _encode_bits(features) is called before finishing, so the per-dentry bits
// are encoded with the given feature set.
//
// bl       - destination buffer.
// features - feature bits forwarded to _encode_bits().
718 void EMetaBlob::dirlump::encode(bufferlist
& bl
, uint64_t features
) const
720 ENCODE_START(2, 2, bl
);
724 ::encode(nremote
, bl
);
726 _encode_bits(features
);
// EMetaBlob::dirlump::decode
//
// Deserializes the dirlump header from bl using
// DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, ...). The per-dentry bits are not
// decoded here; dn_decoded is cleared so they can be decoded on demand.
//
// bl - iterator positioned at the start of an encoded dirlump.
731 void EMetaBlob::dirlump::decode(bufferlist::iterator
&bl
)
733 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl
)
737 ::decode(nremote
, bl
);
740 dn_decoded
= false; // don't decode bits unless we need them.
// EMetaBlob::dirlump::dump
//
// Writes this dirlump to the formatter: an "fnode" section, the state
// string, the nfull/nremote/nnull counters, and one array each for the full,
// remote and null bits.
//
// f - formatter receiving the output.
744 void EMetaBlob::dirlump::dump(Formatter
*f
) const
// Non-const alias of this object.
// NOTE(review): presumably used to decode the bits lazily -- confirm.
747 dirlump
*me
= const_cast<dirlump
*>(this);
750 f
->open_object_section("fnode");
752 f
->close_section(); // fnode
753 f
->dump_string("state", state_string());
754 f
->dump_int("nfull", nfull
);
755 f
->dump_int("nremote", nremote
);
756 f
->dump_int("nnull", nnull
);
758 f
->open_array_section("full bits");
759 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
760 iter
= dfull
.begin(); iter
!= dfull
.end(); ++iter
) {
761 f
->open_object_section("fullbit");
763 f
->close_section(); // fullbit
765 f
->close_section(); // full bits
766 f
->open_array_section("remote bits");
767 for (list
<remotebit
>::const_iterator
768 iter
= dremote
.begin(); iter
!= dremote
.end(); ++iter
) {
769 f
->open_object_section("remotebit");
771 f
->close_section(); // remotebit
773 f
->close_section(); // remote bits
774 f
->open_array_section("null bits");
775 for (list
<nullbit
>::const_iterator
776 iter
= dnull
.begin(); iter
!= dnull
.end(); ++iter
) {
777 f
->open_object_section("null bit");
779 f
->close_section(); // null bit
781 f
->close_section(); // null bits
784 void EMetaBlob::dirlump::generate_test_instances(list
<dirlump
*>& ls
)
786 ls
.push_back(new dirlump());
// EMetaBlob::encode
//
// Serializes the whole metablob into bl inside an ENCODE_START(8, 5, ...)
// envelope. Fields are written in a fixed order that decode() must mirror;
// lump_map and roots are encoded with the peer feature bits.
//
// bl       - destination buffer.
// features - feature bits forwarded to the lump_map / roots encoders.
792 void EMetaBlob::encode(bufferlist
& bl
, uint64_t features
) const
794 ENCODE_START(8, 5, bl
);
795 ::encode(lump_order
, bl
);
796 ::encode(lump_map
, bl
, features
);
797 ::encode(roots
, bl
, features
);
798 ::encode(table_tids
, bl
);
799 ::encode(opened_ino
, bl
);
800 ::encode(allocated_ino
, bl
);
801 ::encode(used_preallocated_ino
, bl
);
802 ::encode(preallocated_inos
, bl
);
803 ::encode(client_name
, bl
);
804 ::encode(inotablev
, bl
);
805 ::encode(sessionmapv
, bl
);
806 ::encode(truncate_start
, bl
);
807 ::encode(truncate_finish
, bl
);
808 ::encode(destroyed_inodes
, bl
);
809 ::encode(client_reqs
, bl
);
810 ::encode(renamed_dirino
, bl
);
811 ::encode(renamed_dir_frags
, bl
);
813 // make MDSRank use v6 format happy
819 ::encode(client_flushes
, bl
);
// EMetaBlob::decode
//
// Deserializes a metablob from bl, reading fields in the order encode()
// writes them. A legacy form of the root inode (a raw buffer, rootbl) is
// converted into a single fullbit appended to roots, and a legacy form of
// client_reqs is converted into (metareqid_t, 0) pairs.
//
// NOTE(review): the envelope here is DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5)
// while encode() uses ENCODE_START(8, 5); confirm the version difference is
// intentional.
//
// bl - iterator positioned at the start of an encoded EMetaBlob.
822 void EMetaBlob::decode(bufferlist::iterator
&bl
)
824 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl
);
825 ::decode(lump_order
, bl
);
826 ::decode(lump_map
, bl
);
// Legacy root encoding: a non-empty buffer holds one encoded fullbit.
831 ::decode(rootbl
, bl
);
832 if (rootbl
.length()) {
833 bufferlist::iterator p
= rootbl
.begin();
834 roots
.push_back(ceph::shared_ptr
<fullbit
>(new fullbit(p
)));
837 ::decode(table_tids
, bl
);
838 ::decode(opened_ino
, bl
);
839 ::decode(allocated_ino
, bl
);
840 ::decode(used_preallocated_ino
, bl
);
841 ::decode(preallocated_inos
, bl
);
842 ::decode(client_name
, bl
);
843 ::decode(inotablev
, bl
);
844 ::decode(sessionmapv
, bl
);
845 ::decode(truncate_start
, bl
);
846 ::decode(truncate_finish
, bl
);
847 ::decode(destroyed_inodes
, bl
);
849 ::decode(client_reqs
, bl
);
// Legacy client_reqs entries carry no second value; pair each with 0.
854 client_reqs
.push_back(pair
<metareqid_t
,uint64_t>(r
.front(), 0));
859 ::decode(renamed_dirino
, bl
);
860 ::decode(renamed_dir_frags
, bl
);
870 ::decode(client_flushes
, bl
);
877 * Get all inodes touched by this metablob. Includes the 'bits' within
878 * dirlumps, and the inodes of the dirs themselves.
// EMetaBlob::get_inodes
//
// Inserts into 'inodes' the inode number of every dirlump's directory plus
// the inode numbers carried by that dirlump's fullbits and remotebits.
// Existing contents of 'inodes' are kept.
//
// inodes - output set, extended in place.
880 void EMetaBlob::get_inodes(
881 std::set
<inodeno_t
> &inodes
) const
883 // For all dirlumps in this metablob
884 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
885 // Record inode of dirlump
886 inodeno_t
const dir_ino
= i
->first
.ino
;
887 inodes
.insert(dir_ino
);
889 // Decode dirlump bits
890 dirlump
const &dl
= i
->second
;
893 // Record inodes of fullbits
894 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
895 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
896 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
897 inodes
.insert((*iter
)->inode
.ino
);
900 // Record inodes of remotebits
901 list
<remotebit
> const &rb_list
= dl
.get_dremote();
902 for (list
<remotebit
>::const_iterator
903 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
904 inodes
.insert(iter
->ino
);
911 * Get a map of dirfrag to set of dentries in that dirfrag which are
912 * touched in this operation.
// EMetaBlob::get_dentries
//
// For each dirlump, inserts the dentry name of every fullbit, nullbit and
// remotebit into dentries[dirfrag]. Existing contents of 'dentries' are kept.
//
// dentries - output map from dirfrag to the set of dentry names, extended in
//            place.
914 void EMetaBlob::get_dentries(std::map
<dirfrag_t
, std::set
<std::string
> > &dentries
) const
916 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
917 dirlump
const &dl
= i
->second
;
918 dirfrag_t
const &df
= i
->first
;
922 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
923 list
<nullbit
> const &nb_list
= dl
.get_dnull();
924 list
<remotebit
> const &rb_list
= dl
.get_dremote();
926 // For all bits, store dentry
927 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
928 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
929 dentries
[df
].insert((*iter
)->dn
);
932 for (list
<nullbit
>::const_iterator
933 iter
= nb_list
.begin(); iter
!= nb_list
.end(); ++iter
) {
934 dentries
[df
].insert(iter
->dn
);
936 for (list
<remotebit
>::const_iterator
937 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
938 dentries
[df
].insert(iter
->dn
);
946 * Calculate all paths that we can infer are touched by this metablob. Only uses
947 * information local to this metablob so it may only be the path within the
// EMetaBlob::get_paths
//
// Appends to 'paths' the paths that can be inferred from this metablob alone.
// Works in three passes over lump_map:
//   1. record, per directory inode, the dentry names seen under it
//      ('children') and, per fullbit inode, where it lives ('ino_locations');
//   2. collect "leaf" locations: fullbits whose inode has no entry in
//      'children', plus every nullbit and remotebit;
//   3. for each leaf, build a path string by prepending dentry names looked
//      up in 'ino_locations'.
// If lump_map is empty but roots is not, "/" is pushed.
//
// paths - output vector, extended in place.
950 void EMetaBlob::get_paths(
951 std::vector
<std::string
> &paths
) const
953 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
954 typedef std::pair
<inodeno_t
, std::string
> Location
;
956 // Whenever we see a dentry within a dirlump, we remember it as a child of
957 // the dirlump's inode
958 std::map
<inodeno_t
, std::list
<std::string
> > children
;
960 // Whenever we see a location for an inode, remember it: this allows us to
961 // build a path given an inode
962 std::map
<inodeno_t
, Location
> ino_locations
;
964 // Special case: operations on root inode populate roots but not dirlumps
965 if (lump_map
.empty() && !roots
.empty()) {
966 paths
.push_back("/");
972 // Build a tiny local metadata cache for the path structure in this metablob
973 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
974 inodeno_t
const dir_ino
= i
->first
.ino
;
975 dirlump
const &dl
= i
->second
;
978 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
979 list
<nullbit
> const &nb_list
= dl
.get_dnull();
980 list
<remotebit
> const &rb_list
= dl
.get_dremote();
// Fullbits contribute both a child name and an inode location.
982 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
983 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
984 std::string
const &dentry
= (*iter
)->dn
;
985 children
[dir_ino
].push_back(dentry
);
986 ino_locations
[(*iter
)->inode
.ino
] = Location(dir_ino
, dentry
);
// Nullbits and remotebits contribute a child name only.
989 for (list
<nullbit
>::const_iterator
990 iter
= nb_list
.begin(); iter
!= nb_list
.end(); ++iter
) {
991 std::string
const &dentry
= iter
->dn
;
992 children
[dir_ino
].push_back(dentry
);
995 for (list
<remotebit
>::const_iterator
996 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
997 std::string
const &dentry
= iter
->dn
;
998 children
[dir_ino
].push_back(dentry
);
1002 std::vector
<Location
> leaf_locations
;
1006 // Output paths for all childless nodes in the metablob
1007 for (std::map
<dirfrag_t
, dirlump
>::const_iterator i
= lump_map
.begin(); i
!= lump_map
.end(); ++i
) {
1008 inodeno_t
const dir_ino
= i
->first
.ino
;
1009 dirlump
const &dl
= i
->second
;
// NOTE(review): fullbit children and ino_locations are recorded again here,
// so 'children' gets duplicate names -- confirm whether this is intended.
1012 list
<ceph::shared_ptr
<fullbit
> > const &fb_list
= dl
.get_dfull();
1013 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator
1014 iter
= fb_list
.begin(); iter
!= fb_list
.end(); ++iter
) {
1015 std::string
const &dentry
= (*iter
)->dn
;
1016 children
[dir_ino
].push_back(dentry
);
1017 ino_locations
[(*iter
)->inode
.ino
] = Location(dir_ino
, dentry
);
// A fullbit is a leaf when its inode has no recorded children.
1018 if (children
.find((*iter
)->inode
.ino
) == children
.end()) {
1019 leaf_locations
.push_back(Location(dir_ino
, dentry
));
// Nullbits and remotebits are always leaves.
1024 list
<nullbit
> const &nb_list
= dl
.get_dnull();
1025 for (list
<nullbit
>::const_iterator
1026 iter
= nb_list
.begin(); iter
!= nb_list
.end(); ++iter
) {
1027 std::string
const &dentry
= iter
->dn
;
1028 leaf_locations
.push_back(Location(dir_ino
, dentry
));
1031 list
<remotebit
> const &rb_list
= dl
.get_dremote();
1032 for (list
<remotebit
>::const_iterator
1033 iter
= rb_list
.begin(); iter
!= rb_list
.end(); ++iter
) {
1034 std::string
const &dentry
= iter
->dn
;
1035 leaf_locations
.push_back(Location(dir_ino
, dentry
));
1039 // For all the leaf locations identified, generate paths
1040 for (std::vector
<Location
>::iterator i
= leaf_locations
.begin(); i
!= leaf_locations
.end(); ++i
) {
1041 Location
const &loc
= *i
;
1042 std::string path
= loc
.second
;
1043 inodeno_t ino
= loc
.first
;
// Prepend ancestor names while the current ino has a known location.
// Note the inner 'loc' shadows the outer one.
1044 while(ino_locations
.find(ino
) != ino_locations
.end()) {
1045 Location
const &loc
= ino_locations
[ino
];
1046 if (!path
.empty()) {
1047 path
= loc
.second
+ "/" + path
;
1049 path
= loc
.second
+ path
;
1054 paths
.push_back(path
);
// Write the contents of this metablob to the Formatter `f`.
// Sections are emitted in this order: "lumps" (one entry per dirfrag in
// lump_order), "roots", the table client transactions, the renamed
// directory inode and fragments, table versions and inode allocations,
// the client name, truncate start/finish lists, destroyed inodes and
// client requests.
// NOTE(review): the embedded source numbering skips in places in this view
// (for example 1079), so a few statements may not be visible here.
1059 void EMetaBlob::dump(Formatter
*f
) const
1061 f
->open_array_section("lumps");
// Lumps are walked in lump_order; each dirlump is fetched with
// lump_map.at(), which does not insert on a missing key.
1062 for (list
<dirfrag_t
>::const_iterator i
= lump_order
.begin();
1063 i
!= lump_order
.end(); ++i
) {
1064 f
->open_object_section("lump");
1065 f
->open_object_section("dirfrag");
1066 f
->dump_stream("dirfrag") << *i
;
1067 f
->close_section(); // dirfrag
1068 f
->open_object_section("dirlump");
1069 lump_map
.at(*i
).dump(f
);
1070 f
->close_section(); // dirlump
1071 f
->close_section(); // lump
1073 f
->close_section(); // lumps
1075 f
->open_array_section("roots");
1076 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator i
= roots
.begin();
1077 i
!= roots
.end(); ++i
) {
1078 f
->open_object_section("root");
// NOTE(review): numbering jumps from 1078 to 1080; as visible here the
// "root" section is opened and closed with nothing dumped in between.
// Confirm whether the per-root dump call was lost.
1080 f
->close_section(); // root
1082 f
->close_section(); // roots
// NOTE(review): the section name below is spelled "tranactions". It is
// emitted at runtime, so it is left unchanged here.
1084 f
->open_array_section("tableclient tranactions");
1085 for (list
<pair
<__u8
,version_t
> >::const_iterator i
= table_tids
.begin();
1086 i
!= table_tids
.end(); ++i
) {
1087 f
->open_object_section("transaction");
1088 f
->dump_int("tid", i
->first
);
1089 f
->dump_int("version", i
->second
);
1090 f
->close_section(); // transaction
1092 f
->close_section(); // tableclient transactions
1094 f
->dump_int("renamed directory inodeno", renamed_dirino
);
1096 f
->open_array_section("renamed directory fragments");
1097 for (list
<frag_t
>::const_iterator i
= renamed_dir_frags
.begin();
1098 i
!= renamed_dir_frags
.end(); ++i
) {
1099 f
->dump_int("frag", *i
);
1101 f
->close_section(); // renamed directory fragments
1103 f
->dump_int("inotable version", inotablev
);
1104 f
->dump_int("SessionMap version", sessionmapv
);
1105 f
->dump_int("allocated ino", allocated_ino
);
1107 f
->dump_stream("preallocated inos") << preallocated_inos
;
1108 f
->dump_int("used preallocated ino", used_preallocated_ino
);
1110 f
->open_object_section("client name");
1111 client_name
.dump(f
);
1112 f
->close_section(); // client name
1114 f
->open_array_section("inodes starting a truncate");
1115 for(list
<inodeno_t
>::const_iterator i
= truncate_start
.begin();
1116 i
!= truncate_start
.end(); ++i
) {
1117 f
->dump_int("inodeno", *i
);
1119 f
->close_section(); // truncate inodes
1120 f
->open_array_section("inodes finishing a truncated");
// truncate_finish maps an inode number to a 64-bit value that is dumped
// under the key "truncate starting segment".
1121 for(map
<inodeno_t
,uint64_t>::const_iterator i
= truncate_finish
.begin();
1122 i
!= truncate_finish
.end(); ++i
) {
1123 f
->open_object_section("inode+segment");
1124 f
->dump_int("inodeno", i
->first
);
1125 f
->dump_int("truncate starting segment", i
->second
);
1126 f
->close_section(); // truncated inode
1128 f
->close_section(); // truncate finish inodes
1130 f
->open_array_section("destroyed inodes");
1131 for(vector
<inodeno_t
>::const_iterator i
= destroyed_inodes
.begin();
1132 i
!= destroyed_inodes
.end(); ++i
) {
1133 f
->dump_int("inodeno", *i
);
1135 f
->close_section(); // destroyed inodes
1137 f
->open_array_section("client requests");
// Each client request is a (metareqid_t, uint64_t) pair: the request id is
// streamed, the second member is dumped as "oldest request on client".
1138 for(list
<pair
<metareqid_t
,uint64_t> >::const_iterator i
= client_reqs
.begin();
1139 i
!= client_reqs
.end(); ++i
) {
1140 f
->open_object_section("Client request");
1141 f
->dump_stream("request ID") << i
->first
;
1142 f
->dump_int("oldest request on client", i
->second
);
1143 f
->close_section(); // request
1145 f
->close_section(); // client requests
1148 void EMetaBlob::generate_test_instances(list
<EMetaBlob
*>& ls
)
1150 ls
.push_back(new EMetaBlob());
// Replay this metablob into the MDS cache and tables.
//
// Parameters:
//   mds     - the rank whose mdcache, inotable, sessionmap, mdlog and clog
//             are updated.
//   logseg  - log segment that replayed dirty items and open files are
//             attached to.
//   slaveup - optional; when set, old directories and unlinked inodes are
//             recorded on it instead of being trimmed or removed here.
//
// Visible phases, in order: roots; lookup of the renamed directory inode; a
// first pass over lump_order; a second pass that replays each dirlump (full
// dentry+inode pairs, remote dentries, null dentries); rename subtree
// fix-ups; table client transactions; opened inodes; inotable and sessionmap
// version handling; truncates; destroyed inodes; client requests and
// flushes; finally update_segment(logseg).
//
// NOTE(review): the embedded source numbering skips in many places inside
// this function (e.g. 1164, 1178, 1180, 1186, 1201, 1216, 1279, 1284) and no
// brace-only or else lines are visible in this view, so the exact branch
// structure cannot be read from here. Comments below describe only the
// visible statements.
1153 void EMetaBlob::replay(MDSRank
*mds
, LogSegment
*logseg
, MDSlaveUpdate
*slaveup
)
1155 dout(10) << "EMetaBlob.replay " << lump_map
.size() << " dirlumps by " << client_name
<< dendl
;
// Asserts that mds_kill_journal_replay_at is not 1 (presumably a test
// kill-point; similar asserts for 2, 3 and 4 appear further down).
1159 assert(g_conf
->mds_kill_journal_replay_at
!= 1);
// Roots: look each root inode up by number, apply the journaled state to it
// and log whether it was added or updated.
1161 for (list
<ceph::shared_ptr
<fullbit
> >::iterator p
= roots
.begin(); p
!= roots
.end(); ++p
) {
1162 CInode
*in
= mds
->mdcache
->get_inode((*p
)->inode
.ino
);
1163 bool isnew
= in
? false:true;
1165 in
= new CInode(mds
->mdcache
, false);
1166 (*p
)->update_inode(mds
, in
);
1169 mds
->mdcache
->add_inode(in
);
1170 if ((*p
)->is_dirty()) in
->_mark_dirty(logseg
);
1171 dout(10) << "EMetaBlob.replay " << (isnew
? " added root ":" updated root ") << *in
<< dendl
;
// If a renamed directory inode number is recorded, look it up in the cache.
1174 CInode
*renamed_diri
= 0;
1176 if (renamed_dirino
) {
1177 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1179 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri
<< dendl
;
1181 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino
<< dendl
;
// First pass over the lumps: the visible statements accumulate lump.nnull
// into nnull.
1184 for (list
<dirfrag_t
>::iterator lp
= lump_order
.begin(); lp
!= lump_order
.end(); ++lp
) {
1185 dirlump
&lump
= lump_map
[*lp
];
1187 dout(10) << "EMetaBlob.replay found null dentry in dir " << *lp
<< dendl
;
1188 nnull
+= lump
.nnull
;
1194 // keep track of any inodes we unlink and don't relink elsewhere
1195 map
<CInode
*, CDir
*> unlinked
;
1196 set
<CInode
*> linked
;
1198 // walk through my dirs (in order!)
1199 for (list
<dirfrag_t
>::iterator lp
= lump_order
.begin();
1200 lp
!= lump_order
.end();
1202 dout(10) << "EMetaBlob.replay dir " << *lp
<< dendl
;
1203 dirlump
&lump
= lump_map
[*lp
];
1206 CDir
*dir
= mds
->mdcache
->get_force_dirfrag(*lp
, true);
1208 // hmm. do i have the inode?
1209 CInode
*diri
= mds
->mdcache
->get_inode((*lp
).ino
);
// For an mdsdir inode number (which must not be this rank's own mdsdir) a
// system inode is created and its AUTH state cleared.
1211 if (MDS_INO_IS_MDSDIR(lp
->ino
)) {
1212 assert(MDS_INO_MDSDIR(mds
->get_nodeid()) != lp
->ino
);
1213 diri
= mds
->mdcache
->create_system_inode(lp
->ino
, S_IFDIR
|0755);
1214 diri
->state_clear(CInode::STATE_AUTH
);
1215 dout(10) << "EMetaBlob.replay created base " << *diri
<< dendl
;
1217 dout(0) << "EMetaBlob.replay missing dir ino " << (*lp
).ino
<< dendl
;
1218 mds
->clog
->error() << "failure replaying journal (EMetaBlob)";
1220 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1224 // create the dirfrag
1225 dir
= diri
->get_or_open_dirfrag(mds
->mdcache
, (*lp
).frag
);
1227 if (MDS_INO_IS_BASE(lp
->ino
))
1228 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
);
1230 dout(10) << "EMetaBlob.replay added dir " << *dir
<< dendl
;
// Copy the journaled fnode (and its version) onto the dirfrag.
1232 dir
->set_version( lump
.fnode
.version
);
1233 dir
->fnode
= lump
.fnode
;
1235 if (lump
.is_importing()) {
1236 dir
->state_set(CDir::STATE_AUTH
);
1237 dir
->state_clear(CDir::STATE_COMPLETE
);
1239 if (lump
.is_dirty()) {
1240 dir
->_mark_dirty(logseg
);
// When rstat and accounted_rstat differ, the inode's nestlock is marked
// updated and the inode is queued on the segment's dirty_dirfrag_nest list.
1242 if (!(dir
->fnode
.rstat
== dir
->fnode
.accounted_rstat
)) {
1243 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir
<< dendl
;
1244 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->nestlock
);
1245 logseg
->dirty_dirfrag_nest
.push_back(&dir
->inode
->item_dirty_dirfrag_nest
);
1247 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir
<< dendl
;
// Same treatment for fragstat vs accounted_fragstat, using the filelock and
// the dirty_dirfrag_dir list.
1249 if (!(dir
->fnode
.fragstat
== dir
->fnode
.accounted_fragstat
)) {
1250 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir
<< dendl
;
1251 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->filelock
);
1252 logseg
->dirty_dirfrag_dir
.push_back(&dir
->inode
->item_dirty_dirfrag_dir
);
1254 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir
<< dendl
;
1257 if (lump
.is_dirty_dft()) {
1258 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir
<< dendl
;
1259 dir
->state_set(CDir::STATE_DIRTYDFT
);
1260 mds
->locker
->mark_updated_scatterlock(&dir
->inode
->dirfragtreelock
);
1261 logseg
->dirty_dirfrag_dirfragtree
.push_back(&dir
->inode
->item_dirty_dirfrag_dirfragtree
);
1264 dir
->mark_new(logseg
);
1265 if (lump
.is_complete())
1266 dir
->mark_complete();
1268 dout(10) << "EMetaBlob.replay updated dir " << *dir
<< dendl
;
// _decode_bits() is called before the lump's full/remote/null lists are
// walked below.
1271 lump
._decode_bits();
1273 // full dentry+inode pairs
1274 for (list
<ceph::shared_ptr
<fullbit
> >::const_iterator pp
= lump
.get_dfull().begin();
1275 pp
!= lump
.get_dfull().end();
1277 ceph::shared_ptr
<fullbit
> p
= *pp
;
1278 CDentry
*dn
= dir
->lookup_exact_snap(p
->dn
, p
->dnlast
);
1280 dn
= dir
->add_null_dentry(p
->dn
, p
->dnfirst
, p
->dnlast
);
1281 dn
->set_version(p
->dnv
);
1282 if (p
->is_dirty()) dn
->_mark_dirty(logseg
);
1283 dout(10) << "EMetaBlob.replay added (full) " << *dn
<< dendl
;
1285 dn
->set_version(p
->dnv
);
1286 if (p
->is_dirty()) dn
->_mark_dirty(logseg
);
1287 dout(10) << "EMetaBlob.replay for [" << p
->dnfirst
<< "," << p
->dnlast
<< "] had " << *dn
<< dendl
;
1288 dn
->first
= p
->dnfirst
;
1289 assert(dn
->last
== p
->dnlast
);
1291 if (lump
.is_importing())
1292 dn
->state_set(CDentry::STATE_AUTH
);
// Look the inode up by (ino, dnlast). The statements that follow create and
// link a new inode, or update and relink an existing one; primary linkages
// that get displaced are remembered in `unlinked`.
1294 CInode
*in
= mds
->mdcache
->get_inode(p
->inode
.ino
, p
->dnlast
);
1296 in
= new CInode(mds
->mdcache
, dn
->is_auth(), p
->dnfirst
, p
->dnlast
);
1297 p
->update_inode(mds
, in
);
1298 mds
->mdcache
->add_inode(in
);
1299 if (!dn
->get_linkage()->is_null()) {
1300 if (dn
->get_linkage()->is_primary()) {
1301 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1303 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1304 << " " << *dn
->get_linkage()->get_inode() << " should be " << p
->inode
.ino
;
1305 dout(0) << ss
.str() << dendl
;
1306 mds
->clog
->warn(ss
);
1308 dir
->unlink_inode(dn
);
1309 mds
->mdcache
->touch_dentry_bottom(dn
);
1311 if (unlinked
.count(in
))
1313 dir
->link_primary_inode(dn
, in
);
1314 dout(10) << "EMetaBlob.replay added " << *in
<< dendl
;
1316 in
->first
= p
->dnfirst
;
1317 p
->update_inode(mds
, in
);
1318 if (dn
->get_linkage()->get_inode() != in
&& in
->get_parent_dn()) {
1319 dout(10) << "EMetaBlob.replay unlinking " << *in
<< dendl
;
1320 unlinked
[in
] = in
->get_parent_dir();
// The parent dentry is saved first, since it is still needed for
// touch_dentry_bottom() after the inode has been unlinked from it.
1321 CDentry
*unlinked_dn
= in
->get_parent_dn();
1322 in
->get_parent_dir()->unlink_inode(in
->get_parent_dn());
1323 mds
->mdcache
->touch_dentry_bottom(unlinked_dn
);
1325 if (dn
->get_linkage()->get_inode() != in
) {
1326 if (!dn
->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1327 if (dn
->get_linkage()->is_primary()) {
1328 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1330 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1331 << " " << *dn
->get_linkage()->get_inode() << " should be " << p
->inode
.ino
;
1332 dout(0) << ss
.str() << dendl
;
1333 mds
->clog
->warn(ss
);
1335 dir
->unlink_inode(dn
);
1336 mds
->mdcache
->touch_dentry_bottom(dn
);
1338 if (unlinked
.count(in
))
1340 dir
->link_primary_inode(dn
, in
);
1341 dout(10) << "EMetaBlob.replay linked " << *in
<< dendl
;
1343 dout(10) << "EMetaBlob.replay for [" << p
->dnfirst
<< "," << p
->dnlast
<< "] had " << *in
<< dendl
;
1345 assert(in
->first
== p
->dnfirst
||
1346 (in
->is_multiversion() && in
->first
> p
->dnfirst
));
1349 in
->_mark_dirty(logseg
);
1350 if (p
->is_dirty_parent())
1351 in
->_mark_dirty_parent(logseg
, p
->is_dirty_pool());
1352 if (p
->need_snapflush())
1353 logseg
->open_files
.push_back(&in
->item_open_file
);
1355 in
->state_set(CInode::STATE_AUTH
);
1357 in
->state_clear(CInode::STATE_AUTH
);
1358 assert(g_conf
->mds_kill_journal_replay_at
!= 2);
// Remote dentries of this lump.
1362 for (list
<remotebit
>::const_iterator p
= lump
.get_dremote().begin();
1363 p
!= lump
.get_dremote().end();
1365 CDentry
*dn
= dir
->lookup_exact_snap(p
->dn
, p
->dnlast
);
1367 dn
= dir
->add_remote_dentry(p
->dn
, p
->ino
, p
->d_type
, p
->dnfirst
, p
->dnlast
);
1368 dn
->set_version(p
->dnv
);
1369 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1370 dout(10) << "EMetaBlob.replay added " << *dn
<< dendl
;
1372 if (!dn
->get_linkage()->is_null()) {
1373 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1374 if (dn
->get_linkage()->is_primary()) {
1375 unlinked
[dn
->get_linkage()->get_inode()] = dir
;
1377 ss
<< "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1378 << " " << *dn
->get_linkage()->get_inode() << " should be remote " << p
->ino
;
1379 dout(0) << ss
.str() << dendl
;
1381 dir
->unlink_inode(dn
);
1382 mds
->mdcache
->touch_dentry_bottom(dn
);
1384 dir
->link_remote_inode(dn
, p
->ino
, p
->d_type
);
1385 dn
->set_version(p
->dnv
);
1386 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1387 dout(10) << "EMetaBlob.replay for [" << p
->dnfirst
<< "," << p
->dnlast
<< "] had " << *dn
<< dendl
;
1388 dn
->first
= p
->dnfirst
;
1389 assert(dn
->last
== p
->dnlast
);
1391 if (lump
.is_importing())
1392 dn
->state_set(CDentry::STATE_AUTH
);
// Null dentries of this lump.
1396 for (list
<nullbit
>::const_iterator p
= lump
.get_dnull().begin();
1397 p
!= lump
.get_dnull().end();
1399 CDentry
*dn
= dir
->lookup_exact_snap(p
->dn
, p
->dnlast
);
1401 dn
= dir
->add_null_dentry(p
->dn
, p
->dnfirst
, p
->dnlast
);
1402 dn
->set_version(p
->dnv
);
1403 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1404 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn
<< dendl
;
1406 dn
->first
= p
->dnfirst
;
1407 if (!dn
->get_linkage()->is_null()) {
1408 dout(10) << "EMetaBlob.replay unlinking " << *dn
<< dendl
;
1409 CInode
*in
= dn
->get_linkage()->get_inode();
1410 // For renamed inode, We may call CInode::force_dirfrag() later.
1411 // CInode::force_dirfrag() doesn't work well when inode is detached
1412 // from the hierarchy.
1413 if (!renamed_diri
|| renamed_diri
!= in
) {
1414 if (dn
->get_linkage()->is_primary())
1416 dir
->unlink_inode(dn
);
1417 mds
->mdcache
->touch_dentry_bottom(dn
);
1420 dn
->set_version(p
->dnv
);
1421 if (p
->dirty
) dn
->_mark_dirty(logseg
);
1422 dout(10) << "EMetaBlob.replay had " << *dn
<< dendl
;
1423 assert(dn
->last
== p
->dnlast
);
1426 if (lump
.is_importing())
1427 dn
->state_set(CDentry::STATE_AUTH
);
1429 // Make null dentries the first things we trim
1430 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn
<< dendl
;
1431 mds
->mdcache
->touch_dentry_bottom(dn
);
1435 assert(g_conf
->mds_kill_journal_replay_at
!= 3);
// Rename handling: the renamed directory inode must appear in both
// `unlinked` and `linked`; its old parent dir is taken from `unlinked` and
// the subtree map is adjusted accordingly.
1437 if (renamed_dirino
) {
1439 assert(unlinked
.count(renamed_diri
));
1440 assert(linked
.count(renamed_diri
));
1441 olddir
= unlinked
[renamed_diri
];
1443 // we imported a diri we haven't seen before
1444 renamed_diri
= mds
->mdcache
->get_inode(renamed_dirino
);
1445 assert(renamed_diri
); // it was in the metablob
1449 if (olddir
->authority() != CDIR_AUTH_UNDEF
&&
1450 renamed_diri
->authority() == CDIR_AUTH_UNDEF
) {
1451 assert(slaveup
); // auth to non-auth, must be slave prepare
1452 list
<frag_t
> leaves
;
1453 renamed_diri
->dirfragtree
.get_leaves(leaves
);
1454 for (list
<frag_t
>::iterator p
= leaves
.begin(); p
!= leaves
.end(); ++p
) {
1455 CDir
*dir
= renamed_diri
->get_dirfrag(*p
);
1457 if (dir
->get_dir_auth() == CDIR_AUTH_UNDEF
)
1458 // preserve subtree bound until slave commit
1459 slaveup
->olddirs
.insert(dir
->inode
);
1461 dir
->state_set(CDir::STATE_AUTH
);
1465 mds
->mdcache
->adjust_subtree_after_rename(renamed_diri
, olddir
, false);
1467 // see if we can discard the subtree we renamed out of
1468 CDir
*root
= mds
->mdcache
->get_subtree_root(olddir
);
1469 if (root
->get_dir_auth() == CDIR_AUTH_UNDEF
) {
1470 if (slaveup
) // preserve the old dir until slave commit
1471 slaveup
->olddirs
.insert(olddir
->inode
);
1473 mds
->mdcache
->try_trim_non_auth_subtree(root
);
1477 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1478 if (renamed_diri
->authority() != CDIR_AUTH_UNDEF
) {
1479 for (list
<frag_t
>::iterator p
= renamed_dir_frags
.begin(); p
!= renamed_dir_frags
.end(); ++p
) {
1480 CDir
*dir
= renamed_diri
->get_dirfrag(*p
);
1482 // we already had the inode before, and we already adjusted this subtree accordingly.
1483 dout(10) << " already had+adjusted rename import bound " << *dir
<< dendl
;
1487 dir
= renamed_diri
->get_or_open_dirfrag(mds
->mdcache
, *p
);
1488 dout(10) << " creating new rename import bound " << *dir
<< dendl
;
1489 dir
->state_clear(CDir::STATE_AUTH
);
1490 mds
->mdcache
->adjust_subtree_auth(dir
, CDIR_AUTH_UNDEF
, false);
1494 // rename may overwrite an empty directory and move it into stray dir.
1495 unlinked
.erase(renamed_diri
);
1496 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1497 if (!linked
.count(p
->first
))
1499 assert(p
->first
->is_dir());
1500 mds
->mdcache
->adjust_subtree_after_rename(p
->first
, p
->second
, false);
// Remaining entries of `unlinked` are either recorded on slaveup or removed
// recursively from the cache.
1504 if (!unlinked
.empty()) {
1505 for (set
<CInode
*>::iterator p
= linked
.begin(); p
!= linked
.end(); ++p
)
1507 dout(10) << " unlinked set contains " << unlinked
<< dendl
;
1508 for (map
<CInode
*, CDir
*>::iterator p
= unlinked
.begin(); p
!= unlinked
.end(); ++p
) {
1509 if (slaveup
) // preserve unlinked inodes until slave commit
1510 slaveup
->unlinked
.insert(p
->first
);
1512 mds
->mdcache
->remove_inode_recursive(p
->first
);
1516 // table client transactions
1517 for (list
<pair
<__u8
,version_t
> >::iterator p
= table_tids
.begin();
1518 p
!= table_tids
.end();
1520 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p
->first
)
1521 << " transaction " << p
->second
<< dendl
;
1522 MDSTableClient
*client
= mds
->get_table_client(p
->first
);
1524 client
->got_journaled_agree(p
->second
, logseg
);
// Opened inode: if found in the cache it is added to the segment's
// open_files list.
1529 CInode
*in
= mds
->mdcache
->get_inode(opened_ino
);
1531 dout(10) << "EMetaBlob.replay noting opened inode " << *in
<< dendl
;
1532 logseg
->open_files
.push_back(&in
->item_open_file
);
// inotable: compare the table's version with the journaled inotablev and
// replay the recorded allocations; afterwards the versions must match.
1537 if (mds
->inotable
->get_version() >= inotablev
) {
1538 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1539 << " <= table " << mds
->inotable
->get_version() << dendl
;
1541 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1542 << " - 1 == table " << mds
->inotable
->get_version()
1543 << " allocated+used " << allocated_ino
1544 << " prealloc " << preallocated_inos
1547 mds
->inotable
->replay_alloc_id(allocated_ino
);
1548 if (preallocated_inos
.size())
1549 mds
->inotable
->replay_alloc_ids(preallocated_inos
);
1551 // [repair bad inotable updates]
1552 if (inotablev
> mds
->inotable
->get_version()) {
1553 mds
->clog
->error() << "journal replay inotablev mismatch "
1554 << mds
->inotable
->get_version() << " -> " << inotablev
;
1555 mds
->inotable
->force_replay_version(inotablev
);
1558 assert(inotablev
== mds
->inotable
->get_version());
// sessionmap: three visible cases. The table is already at or past
// sessionmapv; the table is within two versions of sessionmapv; or the gap
// is larger, which is only accepted when mds_wipe_sessions is set.
1562 if (mds
->sessionmap
.get_version() >= sessionmapv
) {
1563 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1564 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
1565 } else if (mds
->sessionmap
.get_version() + 2 >= sessionmapv
) {
1566 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1567 << " -(1|2) == table " << mds
->sessionmap
.get_version()
1568 << " prealloc " << preallocated_inos
1569 << " used " << used_preallocated_ino
1571 Session
*session
= mds
->sessionmap
.get_session(client_name
);
1573 dout(20) << " (session prealloc " << session
->info
.prealloc_inos
<< ")" << dendl
;
1574 if (used_preallocated_ino
) {
1575 if (!session
->info
.prealloc_inos
.empty()) {
1576 inodeno_t next
= session
->next_ino();
1577 inodeno_t i
= session
->take_ino(used_preallocated_ino
);
1579 mds
->clog
->warn() << " replayed op " << client_reqs
<< " used ino " << i
1580 << " but session next is " << next
;
1581 assert(i
== used_preallocated_ino
);
1582 session
->info
.used_inos
.clear();
1584 mds
->sessionmap
.replay_dirty_session(session
);
1586 if (!preallocated_inos
.empty()) {
1587 session
->info
.prealloc_inos
.insert(preallocated_inos
);
1588 mds
->sessionmap
.replay_dirty_session(session
);
1592 dout(10) << "EMetaBlob.replay no session for " << client_name
<< dendl
;
1593 if (used_preallocated_ino
) {
1594 mds
->sessionmap
.replay_advance_version();
1596 if (!preallocated_inos
.empty())
1597 mds
->sessionmap
.replay_advance_version();
1599 assert(sessionmapv
== mds
->sessionmap
.get_version());
1601 mds
->clog
->error() << "journal replay sessionmap v " << sessionmapv
1602 << " -(1|2) > table " << mds
->sessionmap
.get_version();
1603 assert(g_conf
->mds_wipe_sessions
);
1604 mds
->sessionmap
.wipe();
1605 mds
->sessionmap
.set_version(sessionmapv
);
1609 // truncating inodes
1610 for (list
<inodeno_t
>::iterator p
= truncate_start
.begin();
1611 p
!= truncate_start
.end();
1613 CInode
*in
= mds
->mdcache
->get_inode(*p
);
1615 mds
->mdcache
->add_recovered_truncate(in
, logseg
);
// Finished truncates: the map value is passed to mdlog->get_segment() to
// find the segment the truncate is removed from.
1617 for (map
<inodeno_t
,uint64_t>::iterator p
= truncate_finish
.begin();
1618 p
!= truncate_finish
.end();
1620 LogSegment
*ls
= mds
->mdlog
->get_segment(p
->second
);
1622 CInode
*in
= mds
->mdcache
->get_inode(p
->first
);
1624 mds
->mdcache
->remove_recovered_truncate(in
, ls
);
// Destroyed inodes: drop each one that is still cached; its former parent
// dentry must then be null and is moved to the bottom of the LRU.
1629 for (vector
<inodeno_t
>::iterator p
= destroyed_inodes
.begin();
1630 p
!= destroyed_inodes
.end();
1632 CInode
*in
= mds
->mdcache
->get_inode(*p
);
1634 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", dropping " << *in
<< dendl
;
1635 CDentry
*parent
= in
->get_parent_dn();
1636 mds
->mdcache
->remove_inode(in
);
1638 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent
<< dendl
;
1639 assert(parent
->get_linkage()->is_null());
1640 mds
->mdcache
->touch_dentry_bottom(parent
);
1643 dout(10) << "EMetaBlob.replay destroyed " << *p
<< ", not in cache" << dendl
;
// Client requests: for requests from clients, record the completed request
// (with the inode created by it, if any) on the session and trim older
// completed requests.
1648 for (list
<pair
<metareqid_t
, uint64_t> >::iterator p
= client_reqs
.begin();
1649 p
!= client_reqs
.end();
1651 if (p
->first
.name
.is_client()) {
1652 dout(10) << "EMetaBlob.replay request " << p
->first
<< " trim_to " << p
->second
<< dendl
;
1653 inodeno_t created
= allocated_ino
? allocated_ino
: used_preallocated_ino
;
1654 // if we allocated an inode, there should be exactly one client request id.
1655 assert(created
== inodeno_t() || client_reqs
.size() == 1);
1657 Session
*session
= mds
->sessionmap
.get_session(p
->first
.name
);
1659 session
->add_completed_request(p
->first
.tid
, created
);
1661 session
->trim_completed_requests(p
->second
);
// Client flushes: same pattern as client requests, using the completed
// flush list of the session.
1667 for (list
<pair
<metareqid_t
, uint64_t> >::iterator p
= client_flushes
.begin();
1668 p
!= client_flushes
.end();
1670 if (p
->first
.name
.is_client()) {
1671 dout(10) << "EMetaBlob.replay flush " << p
->first
<< " trim_to " << p
->second
<< dendl
;
1672 Session
*session
= mds
->sessionmap
.get_session(p
->first
.name
);
1674 session
->add_completed_flush(p
->first
.tid
);
1676 session
->trim_completed_flushes(p
->second
);
1682 update_segment(logseg
);
1684 assert(g_conf
->mds_kill_journal_replay_at
!= 4);
1687 // -----------------------
1690 void ESession::update_segment()
1692 _segment
->sessionmapv
= cmapv
;
1693 if (inos
.size() && inotablev
)
1694 _segment
->inotablev
= inotablev
;
// Replay a session open/close event against the rank's sessionmap and, when
// the event carries inode numbers, against its inotable.
// Nothing is done for a table whose version is already >= the journaled
// version (cmapv / inotablev); otherwise the change is applied and the
// resulting table version is asserted to equal the journaled one.
// NOTE(review): the embedded numbering skips here (e.g. 1702, 1705-1706,
// 1711, 1717-1718), so the declaration of `session` and the open/close
// branch headers are not visible in this view.
1697 void ESession::replay(MDSRank
*mds
)
1699 if (mds
->sessionmap
.get_version() >= cmapv
) {
1700 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1701 << " >= " << cmapv
<< ", noop" << dendl
;
1703 dout(10) << "ESession.replay sessionmap " << mds
->sessionmap
.get_version()
1704 << " < " << cmapv
<< " " << (open
? "open":"close") << " " << client_inst
<< dendl
;
// Open: get or create the session, mark it STATE_OPEN and attach the
// journaled client metadata.
1707 session
= mds
->sessionmap
.get_or_add_session(client_inst
);
1708 mds
->sessionmap
.set_state(session
, Session::STATE_OPEN
);
1709 session
->set_client_metadata(client_metadata
);
1710 dout(10) << " opened session " << session
->info
.inst
<< dendl
;
// Close: look the session up by name. A session without a connection is
// removed; otherwise it is kept and cleared.
1712 session
= mds
->sessionmap
.get_session(client_inst
.name
);
1713 if (session
) { // there always should be a session, but there's a bug
1714 if (session
->connection
== NULL
) {
1715 dout(10) << " removed session " << session
->info
.inst
<< dendl
;
1716 mds
->sessionmap
.remove_session(session
);
1719 session
->clear(); // the client has reconnected; keep the Session, but reset
1720 dout(10) << " reset session " << session
->info
.inst
<< " (they reconnected)" << dendl
;
1723 mds
->clog
->error() << "replayed stray Session close event for " << client_inst
1724 << " from time " << stamp
<< ", ignoring";
1728 mds
->sessionmap
.replay_dirty_session(session
);
1730 mds
->sessionmap
.replay_advance_version();
1732 assert(mds
->sessionmap
.get_version() == cmapv
);
// inotable part: only considered when the event has inos and a non-zero
// inotablev. Replaying an open with inos is asserted against ("for now").
1735 if (inos
.size() && inotablev
) {
1736 if (mds
->inotable
->get_version() >= inotablev
) {
1737 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1738 << " >= " << inotablev
<< ", noop" << dendl
;
1740 dout(10) << "ESession.replay inotable " << mds
->inotable
->get_version()
1741 << " < " << inotablev
<< " " << (open
? "add":"remove") << dendl
;
1742 assert(!open
); // for now
1743 mds
->inotable
->replay_release_ids(inos
);
1744 assert(mds
->inotable
->get_version() == inotablev
);
// Serialize this event into `bl` using struct version 4 with compat
// version 3. Visible fields, in order: stamp, client_inst (encoded with
// `features`), cmapv, inotablev, client_metadata.
// NOTE(review): the embedded numbering skips 1756, 1758 and 1761, so more
// fields and the end of the ENCODE_START block may exist outside this view.
// Confirm the full field order against decode() before relying on it.
1751 void ESession::encode(bufferlist
&bl
, uint64_t features
) const
1753 ENCODE_START(4, 3, bl
);
1754 ::encode(stamp
, bl
);
1755 ::encode(client_inst
, bl
, features
);
1757 ::encode(cmapv
, bl
);
1759 ::encode(inotablev
, bl
);
1760 ::encode(client_metadata
, bl
);
// Deserialize this event from `bl`. Visible fields, in order: stamp,
// client_inst, cmapv, inotablev, and client_metadata only when the encoded
// struct_v is at least 4.
// NOTE(review): the embedded numbering skips 1767, 1770, 1772 and 1776-1778,
// so further decode steps may exist outside this view.
1764 void ESession::decode(bufferlist::iterator
&bl
)
1766 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl
);
1768 ::decode(stamp
, bl
);
1769 ::decode(client_inst
, bl
);
1771 ::decode(cmapv
, bl
);
1773 ::decode(inotablev
, bl
);
// client_metadata is only read from encodings with struct_v >= 4.
1774 if (struct_v
>= 4) {
1775 ::decode(client_metadata
, bl
);
1780 void ESession::dump(Formatter
*f
) const
1782 f
->dump_stream("client instance") << client_inst
;
1783 f
->dump_string("open", open
? "true" : "false");
1784 f
->dump_int("client map version", cmapv
);
1785 f
->dump_stream("inos") << inos
;
1786 f
->dump_int("inotable version", inotablev
);
1787 f
->open_object_section("client_metadata");
1788 for (map
<string
, string
>::const_iterator i
= client_metadata
.begin();
1789 i
!= client_metadata
.end(); ++i
) {
1790 f
->dump_string(i
->first
.c_str(), i
->second
);
1792 f
->close_section(); // client_metadata
1795 void ESession::generate_test_instances(list
<ESession
*>& ls
)
1797 ls
.push_back(new ESession
);
1800 // -----------------------
// Serialize this event into `bl` using struct version 1 with compat
// version 1. Visible fields, in order: client_map (encoded with
// `features`), cmapv, stamp.
// NOTE(review): the embedded numbering skips 1809, so the end of the
// ENCODE_START block is not visible in this view.
1803 void ESessions::encode(bufferlist
&bl
, uint64_t features
) const
1805 ENCODE_START(1, 1, bl
);
1806 ::encode(client_map
, bl
, features
);
1807 ::encode(cmapv
, bl
);
1808 ::encode(stamp
, bl
);
// Decode the layout that has no DECODE_START version header: client_map,
// cmapv, then stamp.
// NOTE(review): the embedded numbering skips 1816, directly before the
// stamp decode, so the stamp may be read conditionally. Confirm against the
// full source.
1812 void ESessions::decode_old(bufferlist::iterator
&bl
)
1814 ::decode(client_map
, bl
);
1815 ::decode(cmapv
, bl
);
1817 ::decode(stamp
, bl
);
// Decode the versioned layout (DECODE_START with version 1): client_map,
// cmapv, then stamp.
// NOTE(review): the embedded numbering skips 1825 (directly before the
// stamp decode) and 1827, so the stamp may be read conditionally and the
// end of the DECODE_START block is not visible in this view.
1820 void ESessions::decode_new(bufferlist::iterator
&bl
)
1822 DECODE_START(1, bl
);
1823 ::decode(client_map
, bl
);
1824 ::decode(cmapv
, bl
);
1826 ::decode(stamp
, bl
);
1830 void ESessions::dump(Formatter
*f
) const
1832 f
->dump_int("client map version", cmapv
);
1834 f
->open_array_section("client map");
1835 for (map
<client_t
,entity_inst_t
>::const_iterator i
= client_map
.begin();
1836 i
!= client_map
.end(); ++i
) {
1837 f
->open_object_section("client");
1838 f
->dump_int("client id", i
->first
.v
);
1839 f
->dump_stream("client entity") << i
->second
;
1840 f
->close_section(); // client
1842 f
->close_section(); // client map
1845 void ESessions::generate_test_instances(list
<ESessions
*>& ls
)
1847 ls
.push_back(new ESessions());
1850 void ESessions::update_segment()
1852 _segment
->sessionmapv
= cmapv
;
// Replay a bulk session-open event. When the sessionmap is already at or
// past cmapv this is a no-op; otherwise the sessions in client_map are
// opened, the resulting version is asserted to equal cmapv, and the
// projected version is set to the current version.
// NOTE(review): the embedded numbering skips 1860 and 1866-1868, so the
// else header and any trailing statements are not visible in this view.
1855 void ESessions::replay(MDSRank
*mds
)
1857 if (mds
->sessionmap
.get_version() >= cmapv
) {
1858 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1859 << " >= " << cmapv
<< ", noop" << dendl
;
1861 dout(10) << "ESessions.replay sessionmap " << mds
->sessionmap
.get_version()
1862 << " < " << cmapv
<< dendl
;
1863 mds
->sessionmap
.open_sessions(client_map
);
1864 assert(mds
->sessionmap
.get_version() == cmapv
);
1865 mds
->sessionmap
.set_projected(mds
->sessionmap
.get_version());
1871 // -----------------------
// Serialize this event into `bl` using struct version 3 with compat
// version 3. Visible fields, in order: stamp, table, reqid, bymds,
// mutation, version. `features` is not used by any visible statement.
// NOTE(review): the embedded numbering skips 1879, 1883 and 1885, so more
// fields and the end of the ENCODE_START block may exist outside this view.
1874 void ETableServer::encode(bufferlist
& bl
, uint64_t features
) const
1876 ENCODE_START(3, 3, bl
);
1877 ::encode(stamp
, bl
);
1878 ::encode(table
, bl
);
1880 ::encode(reqid
, bl
);
1881 ::encode(bymds
, bl
);
1882 ::encode(mutation
, bl
);
1884 ::encode(version
, bl
);
// Deserialize this event from `bl`. Visible fields, in order: stamp, table,
// reqid, bymds, mutation, version.
// NOTE(review): the embedded numbering skips 1891, 1894, 1898 and 1900, so
// further decode steps may exist outside this view.
1888 void ETableServer::decode(bufferlist::iterator
&bl
)
1890 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
1892 ::decode(stamp
, bl
);
1893 ::decode(table
, bl
);
1895 ::decode(reqid
, bl
);
1896 ::decode(bymds
, bl
);
1897 ::decode(mutation
, bl
);
1899 ::decode(version
, bl
);
1903 void ETableServer::dump(Formatter
*f
) const
1905 f
->dump_int("table id", table
);
1906 f
->dump_int("op", op
);
1907 f
->dump_int("request id", reqid
);
1908 f
->dump_int("by mds", bymds
);
1909 f
->dump_int("tid", tid
);
1910 f
->dump_int("version", version
);
1913 void ETableServer::generate_test_instances(list
<ETableServer
*>& ls
)
1915 ls
.push_back(new ETableServer());
1919 void ETableServer::update_segment()
1921 _segment
->tablev
[table
] = version
;
// Replay a table-server event against the table server for `table`.
// When the server is already at or past `version` only a log message is
// visible; otherwise the server must be exactly one version behind, the
// operation (prepare, commit, rollback or server update) is applied, and
// the server version is asserted to equal `version` afterwards.
// NOTE(review): the embedded numbering skips in several places (e.g.
// 1927-1929, 1935-1937, 1942-1943, 1947, 1951, 1955, 1958-1959), so the
// switch header, the statements between cases and the default label are
// not visible in this view.
1924 void ETableServer::replay(MDSRank
*mds
)
1926 MDSTableServer
*server
= mds
->get_table_server(table
);
1930 if (server
->get_version() >= version
) {
1931 dout(10) << "ETableServer.replay " << get_mdstable_name(table
)
1932 << " " << get_mdstableserver_opname(op
)
1933 << " event " << version
1934 << " <= table " << server
->get_version() << dendl
;
1938 dout(10) << " ETableServer.replay " << get_mdstable_name(table
)
1939 << " " << get_mdstableserver_opname(op
)
1940 << " event " << version
<< " - 1 == table " << server
->get_version() << dendl
;
1941 assert(version
-1 == server
->get_version());
// Each visible case applies the operation and then records it with the
// matching _note_* call; the server-update case has no visible _note_* call.
1944 case TABLESERVER_OP_PREPARE
:
1945 server
->_prepare(mutation
, reqid
, bymds
);
1946 server
->_note_prepare(bymds
, reqid
);
1948 case TABLESERVER_OP_COMMIT
:
1949 server
->_commit(tid
);
1950 server
->_note_commit(tid
);
1952 case TABLESERVER_OP_ROLLBACK
:
1953 server
->_rollback(tid
);
1954 server
->_note_rollback(tid
);
1956 case TABLESERVER_OP_SERVER_UPDATE
:
1957 server
->_server_update(mutation
);
// Error path for an op value that matches none of the visible cases.
1960 mds
->clog
->error() << "invalid tableserver op in ETableServer";
1962 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1965 assert(version
== server
->get_version());
1970 // ---------------------
// Serialize this event into `bl` using struct version 3 with compat
// version 3. Only stamp and table are visible here; `features` is not used
// by any visible statement.
// NOTE(review): the embedded numbering skips 1978-1981, so further fields
// and the end of the ENCODE_START block are not visible in this view.
1973 void ETableClient::encode(bufferlist
& bl
, uint64_t features
) const
1975 ENCODE_START(3, 3, bl
);
1976 ::encode(stamp
, bl
);
1977 ::encode(table
, bl
);
// Deserialize this event from `bl`. Only stamp and table are visible here.
// NOTE(review): the embedded numbering skips 1986 and 1989-1992, so further
// decode steps are not visible in this view.
1983 void ETableClient::decode(bufferlist::iterator
&bl
)
1985 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl
);
1987 ::decode(stamp
, bl
);
1988 ::decode(table
, bl
);
1994 void ETableClient::dump(Formatter
*f
) const
1996 f
->dump_int("table", table
);
1997 f
->dump_int("op", op
);
1998 f
->dump_int("tid", tid
);
2001 void ETableClient::generate_test_instances(list
<ETableClient
*>& ls
)
2003 ls
.push_back(new ETableClient());
// Replay a table-client event: log it, fetch the table client for `table`,
// assert that the operation is TABLESERVER_OP_ACK and pass the tid to
// got_journaled_ack().
// NOTE(review): the embedded numbering skips 2013-2015, between fetching
// the client and the assert, so any guard on `client` is not visible in
// this view.
2006 void ETableClient::replay(MDSRank
*mds
)
2008 dout(10) << " ETableClient.replay " << get_mdstable_name(table
)
2009 << " op " << get_mdstableserver_opname(op
)
2010 << " tid " << tid
<< dendl
;
2012 MDSTableClient
*client
= mds
->get_table_client(table
);
2016 assert(op
== TABLESERVER_OP_ACK
);
2017 client
->got_journaled_ack(tid
);
2021 // -----------------------
2024 void ESnap::update_segment()
2026 _segment->tablev[TABLE_SNAP] = version;
2029 void ESnap::replay(MDSRank *mds)
2031 if (mds->snaptable->get_version() >= version) {
2032 dout(10) << "ESnap.replay event " << version
2033 << " <= table " << mds->snaptable->get_version() << dendl;
2037 dout(10) << " ESnap.replay event " << version
2038 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2039 assert(version-1 == mds->snaptable->get_version());
2043 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2044 assert(s == snap.snapid);
2046 mds->snaptable->remove(snap.snapid);
2049 assert(version == mds->snaptable->get_version());
2055 // -----------------------
// Serialize this event into `bl` using struct version 4 with compat
// version 4. Visible fields, in order: stamp, metablob (encoded with
// `features`), client_map, cmapv, reqid, had_slaves.
// NOTE(review): the embedded numbering skips 2062 and 2068, so one more
// field and the end of the ENCODE_START block may exist outside this view.
2058 void EUpdate::encode(bufferlist
&bl
, uint64_t features
) const
2060 ENCODE_START(4, 4, bl
);
2061 ::encode(stamp
, bl
);
2063 ::encode(metablob
, bl
, features
);
2064 ::encode(client_map
, bl
);
2065 ::encode(cmapv
, bl
);
2066 ::encode(reqid
, bl
);
2067 ::encode(had_slaves
, bl
);
// Deserialize this event from `bl`. Visible fields, in order: stamp,
// metablob, client_map, cmapv, reqid, had_slaves.
// NOTE(review): the embedded numbering skips 2074, 2076, 2079 and 2083, so
// further steps (possibly version guards) are not visible in this view.
2071 void EUpdate::decode(bufferlist::iterator
&bl
)
2073 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl
);
2075 ::decode(stamp
, bl
);
2077 ::decode(metablob
, bl
);
2078 ::decode(client_map
, bl
);
2080 ::decode(cmapv
, bl
);
2081 ::decode(reqid
, bl
);
2082 ::decode(had_slaves
, bl
);
// Write this event to the formatter `f`: a "metablob" object section, then
// the type string, the encoded client map length, the client map version,
// the request id and whether there were slaves (as "true" or "false").
2086 void EUpdate::dump(Formatter
*f
) const
2088 f
->open_object_section("metablob");
// NOTE(review): numbering jumps from 2088 to 2090; as visible here the
// "metablob" section is opened and closed with nothing dumped in between.
// Confirm whether the metablob dump call was lost.
2090 f
->close_section(); // metablob
2092 f
->dump_string("type", type
);
// Only the length of the encoded client map is reported, not its contents.
2093 f
->dump_int("client map length", client_map
.length());
2094 f
->dump_int("client map version", cmapv
);
2095 f
->dump_stream("reqid") << reqid
;
2096 f
->dump_string("had slaves", had_slaves
? "true" : "false");
2099 void EUpdate::generate_test_instances(list
<EUpdate
*>& ls
)
2101 ls
.push_back(new EUpdate());
2105 void EUpdate::update_segment()
2107 metablob
.update_segment(_segment
);
2109 if (client_map
.length())
2110 _segment
->sessionmapv
= cmapv
;
2113 _segment
->uncommitted_masters
.insert(reqid
);
2116 void EUpdate::replay(MDSRank
*mds
)
2118 metablob
.replay(mds
, _segment
);
2121 dout(10) << "EUpdate.replay " << reqid
<< " had slaves, expecting a matching ECommitted" << dendl
;
2122 _segment
->uncommitted_masters
.insert(reqid
);
2123 set
<mds_rank_t
> slaves
;
2124 mds
->mdcache
->add_uncommitted_master(reqid
, _segment
, slaves
, true);
2127 if (client_map
.length()) {
2128 if (mds
->sessionmap
.get_version() >= cmapv
) {
2129 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2130 << " <= table " << mds
->sessionmap
.get_version() << dendl
;
2132 dout(10) << "EUpdate.replay sessionmap " << mds
->sessionmap
.get_version()
2133 << " < " << cmapv
<< dendl
;
2134 // open client sessions?
2135 map
<client_t
,entity_inst_t
> cm
;
2136 bufferlist::iterator blp
= client_map
.begin();
2138 mds
->sessionmap
.open_sessions(cm
);
2140 assert(mds
->sessionmap
.get_version() == cmapv
);
2141 mds
->sessionmap
.set_projected(mds
->sessionmap
.get_version());
2148 // ------------------------
// Serialize this EOpen into bl as a versioned structure (v4, compat 3).
// Statement order is the on-disk field order and must match EOpen::decode().
void EOpen::encode(bufferlist &bl, uint64_t features) const {
  ENCODE_START(4, 3, bl);
  ::encode(stamp, bl);
  ::encode(metablob, bl, features);
  ::encode(inos, bl);
  ::encode(snap_inos, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an EOpen from bl.  stamp is only read for struct_v >= 2 and
// snap_inos only for struct_v >= 4.
// NOTE(review): the decode version argument is 3 while encode() writes v4;
// looks intentional-or-harmless given compat 3, but confirm.
void EOpen::decode(bufferlist::iterator &bl) {
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    ::decode(stamp, bl);
  ::decode(metablob, bl);
  ::decode(inos, bl);
  if (struct_v >= 4)
    ::decode(snap_inos, bl);
  DECODE_FINISH(bl);
}
2171 void EOpen::dump(Formatter
*f
) const
2173 f
->open_object_section("metablob");
2175 f
->close_section(); // metablob
2176 f
->open_array_section("inos involved");
2177 for (vector
<inodeno_t
>::const_iterator i
= inos
.begin();
2178 i
!= inos
.end(); ++i
) {
2179 f
->dump_int("ino", *i
);
2181 f
->close_section(); // inos
2184 void EOpen::generate_test_instances(list
<EOpen
*>& ls
)
2186 ls
.push_back(new EOpen());
2187 ls
.push_back(new EOpen());
2188 ls
.back()->add_ino(0);
2191 void EOpen::update_segment()
2196 void EOpen::replay(MDSRank
*mds
)
2198 dout(10) << "EOpen.replay " << dendl
;
2199 metablob
.replay(mds
, _segment
);
2201 // note which segments inodes belong to, so we don't have to start rejournaling them
2202 for (const auto &ino
: inos
) {
2203 CInode
*in
= mds
->mdcache
->get_inode(ino
);
2205 dout(0) << "EOpen.replay ino " << ino
<< " not in metablob" << dendl
;
2208 _segment
->open_files
.push_back(&in
->item_open_file
);
2210 for (const auto &vino
: snap_inos
) {
2211 CInode
*in
= mds
->mdcache
->get_inode(vino
);
2213 dout(0) << "EOpen.replay ino " << vino
<< " not in metablob" << dendl
;
2216 _segment
->open_files
.push_back(&in
->item_open_file
);
2221 // -----------------------
2224 void ECommitted::replay(MDSRank
*mds
)
2226 if (mds
->mdcache
->uncommitted_masters
.count(reqid
)) {
2227 dout(10) << "ECommitted.replay " << reqid
<< dendl
;
2228 mds
->mdcache
->uncommitted_masters
[reqid
].ls
->uncommitted_masters
.erase(reqid
);
2229 mds
->mdcache
->uncommitted_masters
.erase(reqid
);
2231 dout(10) << "ECommitted.replay " << reqid
<< " -- didn't see original op" << dendl
;
// Serialize this ECommitted into bl (v3, compat 3): stamp, then reqid.
// features is not used by this encoder.
void ECommitted::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  ::encode(stamp, bl);
  ::encode(reqid, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an ECommitted from bl; stamp is only read for struct_v >= 2.
void ECommitted::decode(bufferlist::iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    ::decode(stamp, bl);
  ::decode(reqid, bl);
  DECODE_FINISH(bl);
}
// Dump the two fields of this event ("stamp" and "reqid") as streams.
void ECommitted::dump(Formatter *f) const {
  f->dump_stream("stamp") << stamp;
  f->dump_stream("reqid") << reqid;
}
2257 void ECommitted::generate_test_instances(list
<ECommitted
*>& ls
)
2259 ls
.push_back(new ECommitted
);
2260 ls
.push_back(new ECommitted
);
2261 ls
.back()->stamp
= utime_t(1, 2);
2262 ls
.back()->reqid
= metareqid_t(entity_name_t::CLIENT(123), 456);
2265 // -----------------------
// Serialize this link_rollback into bl (v2, compat 2).  Statement order is
// the encoded field order and must match link_rollback::decode().
void link_rollback::encode(bufferlist &bl) const
{
  ENCODE_START(2, 2, bl);
  ::encode(reqid, bl);
  ::encode(ino, bl);
  ::encode(was_inc, bl);
  ::encode(old_ctime, bl);
  ::encode(old_dir_mtime, bl);
  ::encode(old_dir_rctime, bl);
  ENCODE_FINISH(bl);
}
// Deserialize a link_rollback from bl, reading the fields in the order
// link_rollback::encode() wrote them.
void link_rollback::decode(bufferlist::iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  ::decode(reqid, bl);
  ::decode(ino, bl);
  ::decode(was_inc, bl);
  ::decode(old_ctime, bl);
  ::decode(old_dir_mtime, bl);
  ::decode(old_dir_rctime, bl);
  DECODE_FINISH(bl);
}
2292 void link_rollback::dump(Formatter
*f
) const
2294 f
->dump_stream("metareqid") << reqid
;
2295 f
->dump_int("ino", ino
);
2296 f
->dump_string("was incremented", was_inc
? "true" : "false");
2297 f
->dump_stream("old_ctime") << old_ctime
;
2298 f
->dump_stream("old_dir_mtime") << old_dir_mtime
;
2299 f
->dump_stream("old_dir_rctime") << old_dir_rctime
;
2302 void link_rollback::generate_test_instances(list
<link_rollback
*>& ls
)
2304 ls
.push_back(new link_rollback());
// Serialize this rmdir_rollback into bl (v2, compat 2): request id, then the
// source and destination directory/dentry-name pairs.
void rmdir_rollback::encode(bufferlist& bl) const
{
  ENCODE_START(2, 2, bl);
  ::encode(reqid, bl);
  ::encode(src_dir, bl);
  ::encode(src_dname, bl);
  ::encode(dest_dir, bl);
  ::encode(dest_dname, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an rmdir_rollback from bl, in the field order written by
// rmdir_rollback::encode().
void rmdir_rollback::decode(bufferlist::iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  ::decode(reqid, bl);
  ::decode(src_dir, bl);
  ::decode(src_dname, bl);
  ::decode(dest_dir, bl);
  ::decode(dest_dname, bl);
  DECODE_FINISH(bl);
}
// Dump the request id and the source/destination directory and dentry name
// of this rollback record.
void rmdir_rollback::dump(Formatter *f) const
{
  f->dump_stream("metareqid") << reqid;
  f->dump_stream("source directory") << src_dir;
  f->dump_string("source dname", src_dname);
  f->dump_stream("destination directory") << dest_dir;
  f->dump_string("destination dname", dest_dname);
}
2338 void rmdir_rollback::generate_test_instances(list
<rmdir_rollback
*>& ls
)
2340 ls
.push_back(new rmdir_rollback());
// Serialize this dentry record into bl (v2, compat 2).  Statement order is
// the encoded field order and must match drec::decode().
void rename_rollback::drec::encode(bufferlist &bl) const
{
  ENCODE_START(2, 2, bl);
  ::encode(dirfrag, bl);
  ::encode(dirfrag_old_mtime, bl);
  ::encode(dirfrag_old_rctime, bl);
  ::encode(ino, bl);
  ::encode(remote_ino, bl);
  ::encode(dname, bl);
  ::encode(remote_d_type, bl);
  ::encode(old_ctime, bl);
  ENCODE_FINISH(bl);
}
// Deserialize a dentry record from bl, in the field order written by
// drec::encode().
void rename_rollback::drec::decode(bufferlist::iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  ::decode(dirfrag, bl);
  ::decode(dirfrag_old_mtime, bl);
  ::decode(dirfrag_old_rctime, bl);
  ::decode(ino, bl);
  ::decode(remote_ino, bl);
  ::decode(dname, bl);
  ::decode(remote_d_type, bl);
  ::decode(old_ctime, bl);
  DECODE_FINISH(bl);
}
2371 void rename_rollback::drec::dump(Formatter
*f
) const
2373 f
->dump_stream("directory fragment") << dirfrag
;
2374 f
->dump_stream("directory old mtime") << dirfrag_old_mtime
;
2375 f
->dump_stream("directory old rctime") << dirfrag_old_rctime
;
2376 f
->dump_int("ino", ino
);
2377 f
->dump_int("remote ino", remote_ino
);
2378 f
->dump_string("dname", dname
);
2379 uint32_t type
= DTTOIF(remote_d_type
) & S_IFMT
; // convert to type entries
2383 type_string
= "file"; break;
2385 type_string
= "symlink"; break;
2387 type_string
= "directory"; break;
2389 type_string
= "UNKNOWN-" + stringify((int)type
); break;
2391 f
->dump_string("remote dtype", type_string
);
2392 f
->dump_stream("old ctime") << old_ctime
;
2395 void rename_rollback::drec::generate_test_instances(list
<drec
*>& ls
)
2397 ls
.push_back(new drec());
2398 ls
.back()->remote_d_type
= IFTODT(S_IFREG
);
// Serialize this rename_rollback into bl (v2, compat 2): request id, the
// three dentry records (orig_src, orig_dest, stray), then ctime.
void rename_rollback::encode(bufferlist &bl) const
{
  ENCODE_START(2, 2, bl);
  ::encode(reqid, bl);
  encode(orig_src, bl);
  encode(orig_dest, bl);
  encode(stray, bl);
  ::encode(ctime, bl);
  ENCODE_FINISH(bl);
}
// Deserialize a rename_rollback from bl, in the field order written by
// rename_rollback::encode().
void rename_rollback::decode(bufferlist::iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  ::decode(reqid, bl);
  decode(orig_src, bl);
  decode(orig_dest, bl);
  decode(stray, bl);
  ::decode(ctime, bl);
  DECODE_FINISH(bl);
}
2423 void rename_rollback::dump(Formatter
*f
) const
2425 f
->dump_stream("request id") << reqid
;
2426 f
->open_object_section("original src drec");
2428 f
->close_section(); // original src drec
2429 f
->open_object_section("original dest drec");
2431 f
->close_section(); // original dest drec
2432 f
->open_object_section("stray drec");
2434 f
->close_section(); // stray drec
2435 f
->dump_stream("ctime") << ctime
;
2438 void rename_rollback::generate_test_instances(list
<rename_rollback
*>& ls
)
2440 ls
.push_back(new rename_rollback());
2441 ls
.back()->orig_src
.remote_d_type
= IFTODT(S_IFREG
);
2442 ls
.back()->orig_dest
.remote_d_type
= IFTODT(S_IFREG
);
2443 ls
.back()->stray
.remote_d_type
= IFTODT(S_IFREG
);
// Serialize this ESlaveUpdate into bl (v3, compat 3).  Statement order is
// the encoded field order and must match ESlaveUpdate::decode().
void ESlaveUpdate::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  ::encode(stamp, bl);
  ::encode(type, bl);
  ::encode(reqid, bl);
  ::encode(master, bl);
  ::encode(op, bl);
  ::encode(origop, bl);
  ::encode(commit, bl, features);  // the commit blob is the only field encoded with the feature bits
  ::encode(rollback, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an ESlaveUpdate from bl; stamp is only read for struct_v >= 2,
// the remaining fields follow the order of ESlaveUpdate::encode().
void ESlaveUpdate::decode(bufferlist::iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    ::decode(stamp, bl);
  ::decode(type, bl);
  ::decode(reqid, bl);
  ::decode(master, bl);
  ::decode(op, bl);
  ::decode(origop, bl);
  ::decode(commit, bl);
  ::decode(rollback, bl);
  DECODE_FINISH(bl);
}
// Dump this event to the formatter: the commit blob inside a "metablob"
// section, then the rollback buffer length and the scalar fields.
void ESlaveUpdate::dump(Formatter *f) const
{
  f->open_object_section("metablob");
  commit.dump(f);
  f->close_section(); // metablob

  f->dump_int("rollback length", rollback.length());
  f->dump_string("type", type);
  f->dump_stream("metareqid") << reqid;
  f->dump_int("master", master);
  f->dump_int("op", op);
  f->dump_int("original op", origop);
}
2489 void ESlaveUpdate::generate_test_instances(list
<ESlaveUpdate
*>& ls
)
2491 ls
.push_back(new ESlaveUpdate());
2495 void ESlaveUpdate::replay(MDSRank
*mds
)
2499 case ESlaveUpdate::OP_PREPARE
:
2500 dout(10) << "ESlaveUpdate.replay prepare " << reqid
<< " for mds." << master
2501 << ": applying commit, saving rollback info" << dendl
;
2502 su
= new MDSlaveUpdate(origop
, rollback
, _segment
->slave_updates
);
2503 commit
.replay(mds
, _segment
, su
);
2504 mds
->mdcache
->add_uncommitted_slave_update(reqid
, master
, su
);
2507 case ESlaveUpdate::OP_COMMIT
:
2508 su
= mds
->mdcache
->get_uncommitted_slave_update(reqid
, master
);
2510 dout(10) << "ESlaveUpdate.replay commit " << reqid
<< " for mds." << master
<< dendl
;
2511 mds
->mdcache
->finish_uncommitted_slave_update(reqid
, master
);
2513 dout(10) << "ESlaveUpdate.replay commit " << reqid
<< " for mds." << master
2514 << ": ignoring, no previously saved prepare" << dendl
;
2518 case ESlaveUpdate::OP_ROLLBACK
:
2519 dout(10) << "ESlaveUpdate.replay abort " << reqid
<< " for mds." << master
2520 << ": applying rollback commit blob" << dendl
;
2521 commit
.replay(mds
, _segment
);
2522 su
= mds
->mdcache
->get_uncommitted_slave_update(reqid
, master
);
2524 mds
->mdcache
->finish_uncommitted_slave_update(reqid
, master
);
2528 mds
->clog
->error() << "invalid op in ESlaveUpdate";
2530 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2535 // -----------------------
// Serialize this ESubtreeMap into bl (v6, compat 5).  Statement order is the
// encoded field order and must match ESubtreeMap::decode().
void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(6, 5, bl);
  ::encode(stamp, bl);
  ::encode(metablob, bl, features);
  ::encode(subtrees, bl);
  ::encode(ambiguous_subtrees, bl);
  ::encode(expire_pos, bl);
  ::encode(event_seq, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an ESubtreeMap from bl.  Fields added in later versions are
// guarded by struct_v: stamp (>= 2), ambiguous_subtrees (>= 4),
// expire_pos (>= 3) and event_seq (>= 6).
void ESubtreeMap::decode(bufferlist::iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
  if (struct_v >= 2)
    ::decode(stamp, bl);
  ::decode(metablob, bl);
  ::decode(subtrees, bl);
  if (struct_v >= 4)
    ::decode(ambiguous_subtrees, bl);
  if (struct_v >= 3)
    ::decode(expire_pos, bl);
  if (struct_v >= 6)
    ::decode(event_seq, bl);
  DECODE_FINISH(bl);
}
2566 void ESubtreeMap::dump(Formatter
*f
) const
2568 f
->open_object_section("metablob");
2570 f
->close_section(); // metablob
2572 f
->open_array_section("subtrees");
2573 for(map
<dirfrag_t
,vector
<dirfrag_t
> >::const_iterator i
= subtrees
.begin();
2574 i
!= subtrees
.end(); ++i
) {
2575 f
->open_object_section("tree");
2576 f
->dump_stream("root dirfrag") << i
->first
;
2577 for (vector
<dirfrag_t
>::const_iterator j
= i
->second
.begin();
2578 j
!= i
->second
.end(); ++j
) {
2579 f
->dump_stream("bound dirfrag") << *j
;
2581 f
->close_section(); // tree
2583 f
->close_section(); // subtrees
2585 f
->open_array_section("ambiguous subtrees");
2586 for(set
<dirfrag_t
>::const_iterator i
= ambiguous_subtrees
.begin();
2587 i
!= ambiguous_subtrees
.end(); ++i
) {
2588 f
->dump_stream("dirfrag") << *i
;
2590 f
->close_section(); // ambiguous subtrees
2592 f
->dump_int("expire position", expire_pos
);
2595 void ESubtreeMap::generate_test_instances(list
<ESubtreeMap
*>& ls
)
2597 ls
.push_back(new ESubtreeMap());
// Replay an ESubtreeMap.
//
// First advances the journaler's expire position if this event carries a
// later one.  Then one of two things happens:
//  - if the cache already has subtrees, the journaled map is only verified
//    against the cache; every mismatch is reported to the cluster log and
//    counted, and the function returns without modifying the subtree map;
//  - otherwise the metablob is replayed and the subtree (and ambiguous
//    import) state is rebuilt from the journaled map.
void ESubtreeMap::replay(MDSRank *mds)
{
  if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos())
    mds->mdlog->journaler->set_expire_pos(expire_pos);

  // suck up the subtree map?
  if (mds->mdcache->is_subtrees()) {
    dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
    int errors = 0;

    // pass 1: every journaled subtree must exist in the cache, be a subtree
    // root owned by this rank, and have exactly the journaled bounds
    for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
         p != subtrees.end();
         ++p) {
      CDir *dir = mds->mdcache->get_dirfrag(p->first);
      if (!dir) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " subtree root " << p->first << " not in cache";
        ++errors;
        continue;
      }

      if (!mds->mdcache->is_subtree(dir)) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " subtree root " << p->first << " not a subtree in cache";
        ++errors;
        continue;
      }
      if (dir->get_dir_auth().first != mds->get_nodeid()) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " subtree root " << p->first
                           << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
        ++errors;
        continue;
      }

      for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
        mds->mdcache->get_force_dirfrag(*q, true);

      // tick off each journaled bound against the cache's bounds; whatever
      // is left in `bounds` afterwards is an extra bound only the cache has
      set<CDir*> bounds;
      mds->mdcache->get_subtree_bounds(dir, bounds);
      for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
        CDir *b = mds->mdcache->get_dirfrag(*q);
        if (!b) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                             << " subtree " << p->first << " bound " << *q << " not in cache";
          ++errors;
          continue;
        }
        if (bounds.count(b) == 0) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                             << " subtree " << p->first << " bound " << *q << " not a bound in cache";
          ++errors;
          continue;
        }
        bounds.erase(b);
      }
      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " subtree " << p->first << " has extra bound in cache " << (*q)->dirfrag();
        ++errors;
      }

      // the journal and the cache must agree on whether the import is ambiguous
      if (ambiguous_subtrees.count(p->first)) {
        if (!mds->mdcache->have_ambiguous_import(p->first)) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                             << " subtree " << p->first << " is ambiguous but is not in our cache";
          ++errors;
        }
      } else {
        if (mds->mdcache->have_ambiguous_import(p->first)) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                             << " subtree " << p->first << " is not ambiguous but is in our cache";
          ++errors;
        }
      }
    }

    // pass 2: every subtree the cache holds for this rank must be journaled
    list<CDir*> subs;
    mds->mdcache->list_subtrees(subs);
    for (list<CDir*>::iterator p = subs.begin(); p != subs.end(); ++p) {
      CDir *dir = *p;
      if (dir->get_dir_auth().first != mds->get_nodeid())
        continue;
      if (subtrees.count(dir->dirfrag()) == 0) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " does not include cache subtree " << dir->dirfrag();
        ++errors;
      }
    }

    if (errors) {
      dout(0) << "journal subtrees: " << subtrees << dendl;
      dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
      mds->mdcache->show_subtrees();
      // mismatches are only fatal when mds_debug_subtrees is set
      assert(!g_conf->mds_debug_subtrees || errors == 0);
    }
    return;
  }

  dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;

  // first, stick the spanning tree in my cache
  //metablob.print(*_dout);
  metablob.replay(mds, _segment);

  // restore import/export maps
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = mds->mdcache->get_dirfrag(p->first);
    assert(dir);
    if (ambiguous_subtrees.count(p->first)) {
      // ambiguous!
      mds->mdcache->add_ambiguous_import(p->first, p->second);
      mds->mdcache->adjust_bounded_subtree_auth(dir, p->second,
                                                mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
    } else {
      // not ambiguous
      mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
    }
  }

  mds->mdcache->recalc_auth_bits(true);

  mds->mdcache->show_subtrees();
}
2729 // -----------------------
// Replay a fragmentation event.  Any dirfrags already in the cache are
// refragmented according to op before the metablob is replayed:
//  - OP_PREPARE records the uncommitted fragment and applies the split/merge;
//  - OP_ROLLBACK restores the original fragments and rolls back the record;
//  - OP_COMMIT / OP_FINISH complete the uncommitted fragment record.
void EFragment::replay(MDSRank *mds)
{
  dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;

  list<CDir*> resultfrags;
  list<MDSInternalContextBase*> waiters;
  list<frag_t> old_frags;

  // in may be NULL if it wasn't in our cache yet.  if it's a prepare
  // it will be once we replay the metablob , but first we need to
  // refragment anything we already have in the cache.
  CInode *in = mds->mdcache->get_inode(ino);

  switch (op) {
  case OP_PREPARE:
    mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, _segment, &rollback);

    if (in)
      mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters, true);
    break;

  case OP_ROLLBACK:
    if (in) {
      // remember the current leaves so the rollback can be told what it replaced
      in->dirfragtree.get_leaves_under(basefrag, old_frags);
      if (orig_frags.empty()) {
        // old format EFragment
        mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true);
      } else {
        for (list<frag_t>::iterator p = orig_frags.begin(); p != orig_frags.end(); ++p)
          mds->mdcache->force_dir_fragment(in, *p);
      }
    }
    mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), old_frags);
    break;

  case OP_COMMIT:
  case OP_FINISH:
    mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
    break;

  default:
    ceph_abort();
  }

  metablob.replay(mds, _segment);
  // optional consistency check, enabled by the mds_debug_frag config option
  if (in && g_conf->mds_debug_frag)
    in->verify_dirfrags();
}
// Serialize this EFragment into bl (v5, compat 4).  Statement order is the
// encoded field order and must match EFragment::decode().
void EFragment::encode(bufferlist &bl, uint64_t features) const {
  ENCODE_START(5, 4, bl);
  ::encode(stamp, bl);
  ::encode(op, bl);
  ::encode(ino, bl);
  ::encode(basefrag, bl);
  ::encode(bits, bl);
  ::encode(metablob, bl, features);
  ::encode(orig_frags, bl);
  ::encode(rollback, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an EFragment from bl.  stamp (struct_v >= 2), op (>= 3) and
// the orig_frags/rollback pair (>= 5) are only present in newer encodings.
void EFragment::decode(bufferlist::iterator &bl) {
  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
  if (struct_v >= 2)
    ::decode(stamp, bl);
  if (struct_v >= 3)
    ::decode(op, bl);
  ::decode(ino, bl);
  ::decode(basefrag, bl);
  ::decode(bits, bl);
  ::decode(metablob, bl);
  if (struct_v >= 5) {
    ::decode(orig_frags, bl);
    ::decode(rollback, bl);
  }
  DECODE_FINISH(bl);
}
// Dump the op name, inode, base frag and bits of this event.  The metablob
// is not dumped (see the disabled code below).
void EFragment::dump(Formatter *f) const
{
  /*f->open_object_section("Metablob");
  metablob.dump(f); // sadly we don't have this; dunno if we'll get it
  f->close_section();*/
  f->dump_string("op", op_name(op));
  f->dump_stream("ino") << ino;
  f->dump_stream("base frag") << basefrag;
  f->dump_int("bits", bits);
}
2822 void EFragment::generate_test_instances(list
<EFragment
*>& ls
)
2824 ls
.push_back(new EFragment
);
2825 ls
.push_back(new EFragment
);
2826 ls
.back()->op
= OP_PREPARE
;
2828 ls
.back()->bits
= 5;
// Serialize this dirfrag_rollback into bl (v1, compat 1); the only field is
// fnode.
void dirfrag_rollback::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  ::encode(fnode, bl);
  ENCODE_FINISH(bl);
}
// Deserialize a dirfrag_rollback from bl; the only field is fnode.
void dirfrag_rollback::decode(bufferlist::iterator &bl)
{
  DECODE_START(1, bl);
  ::decode(fnode, bl);
  DECODE_FINISH(bl);
}
2847 // =========================================================================
2849 // -----------------------
2852 void EExport::replay(MDSRank
*mds
)
2854 dout(10) << "EExport.replay " << base
<< dendl
;
2855 metablob
.replay(mds
, _segment
);
2857 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2860 set
<CDir
*> realbounds
;
2861 for (set
<dirfrag_t
>::iterator p
= bounds
.begin();
2864 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2866 realbounds
.insert(bd
);
2870 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
, CDIR_AUTH_UNDEF
);
2872 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
// Serialize this EExport into bl (v3, compat 3): stamp, metablob, base
// dirfrag, then the bounds.
void EExport::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  ::encode(stamp, bl);
  ::encode(metablob, bl, features);
  ::encode(base, bl);
  ::encode(bounds, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an EExport from bl; stamp is only read for struct_v >= 2.
void EExport::decode(bufferlist::iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    ::decode(stamp, bl);
  ::decode(metablob, bl);
  ::decode(base, bl);
  ::decode(bounds, bl);
  DECODE_FINISH(bl);
}
2896 void EExport::dump(Formatter
*f
) const
2898 f
->dump_float("stamp", (double)stamp
);
2899 /*f->open_object_section("Metablob");
2900 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2901 f->close_section();*/
2902 f
->dump_stream("base dirfrag") << base
;
2903 f
->open_array_section("bounds dirfrags");
2904 for (set
<dirfrag_t
>::const_iterator i
= bounds
.begin();
2905 i
!= bounds
.end(); ++i
) {
2906 f
->dump_stream("dirfrag") << *i
;
2908 f
->close_section(); // bounds dirfrags
2911 void EExport::generate_test_instances(list
<EExport
*>& ls
)
2913 EExport
*sample
= new EExport();
2914 ls
.push_back(sample
);
2918 // -----------------------
// Record this event's session map version (cmapv) on the owning log segment.
void EImportStart::update_segment()
{
  _segment->sessionmapv = cmapv;
}
2926 void EImportStart::replay(MDSRank
*mds
)
2928 dout(10) << "EImportStart.replay " << base
<< " bounds " << bounds
<< dendl
;
2929 //metablob.print(*_dout);
2930 metablob
.replay(mds
, _segment
);
2932 // put in ambiguous import list
2933 mds
->mdcache
->add_ambiguous_import(base
, bounds
);
2935 // set auth partially to us so we don't trim it
2936 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
2939 set
<CDir
*> realbounds
;
2940 for (vector
<dirfrag_t
>::iterator p
= bounds
.begin();
2943 CDir
*bd
= mds
->mdcache
->get_dirfrag(*p
);
2945 if (!bd
->is_subtree_root())
2946 bd
->state_clear(CDir::STATE_AUTH
);
2947 realbounds
.insert(bd
);
2950 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, realbounds
,
2951 mds_authority_t(mds
->get_nodeid(), mds
->get_nodeid()));
2953 // open client sessions?
2954 if (mds
->sessionmap
.get_version() >= cmapv
) {
2955 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2956 << " >= " << cmapv
<< ", noop" << dendl
;
2958 dout(10) << "EImportStart.replay sessionmap " << mds
->sessionmap
.get_version()
2959 << " < " << cmapv
<< dendl
;
2960 map
<client_t
,entity_inst_t
> cm
;
2961 bufferlist::iterator blp
= client_map
.begin();
2963 mds
->sessionmap
.open_sessions(cm
);
2964 assert(mds
->sessionmap
.get_version() == cmapv
);
2965 mds
->sessionmap
.set_projected(mds
->sessionmap
.get_version());
// Serialize this EImportStart into bl (v3, compat 3).  Statement order is
// the encoded field order and must match EImportStart::decode().
void EImportStart::encode(bufferlist &bl, uint64_t features) const {
  ENCODE_START(3, 3, bl);
  ::encode(stamp, bl);
  ::encode(base, bl);
  ::encode(metablob, bl, features);
  ::encode(bounds, bl);
  ::encode(cmapv, bl);
  ::encode(client_map, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an EImportStart from bl; stamp is only read for struct_v >= 2.
void EImportStart::decode(bufferlist::iterator &bl) {
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    ::decode(stamp, bl);
  ::decode(base, bl);
  ::decode(metablob, bl);
  ::decode(bounds, bl);
  ::decode(cmapv, bl);
  ::decode(client_map, bl);
  DECODE_FINISH(bl);
}
2993 void EImportStart::dump(Formatter
*f
) const
2995 f
->dump_stream("base dirfrag") << base
;
2996 f
->open_array_section("boundary dirfrags");
2997 for (vector
<dirfrag_t
>::const_iterator iter
= bounds
.begin();
2998 iter
!= bounds
.end(); ++iter
) {
2999 f
->dump_stream("frag") << *iter
;
3004 void EImportStart::generate_test_instances(list
<EImportStart
*>& ls
)
3006 ls
.push_back(new EImportStart
);
3009 // -----------------------
3012 void EImportFinish::replay(MDSRank
*mds
)
3014 if (mds
->mdcache
->have_ambiguous_import(base
)) {
3015 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
<< dendl
;
3017 mds
->mdcache
->finish_ambiguous_import(base
);
3019 CDir
*dir
= mds
->mdcache
->get_dirfrag(base
);
3021 vector
<dirfrag_t
> bounds
;
3022 mds
->mdcache
->get_ambiguous_import_bounds(base
, bounds
);
3023 mds
->mdcache
->adjust_bounded_subtree_auth(dir
, bounds
, CDIR_AUTH_UNDEF
);
3024 mds
->mdcache
->cancel_ambiguous_import(dir
);
3025 mds
->mdcache
->try_trim_non_auth_subtree(dir
);
3028 // this shouldn't happen unless this is an old journal
3029 dout(10) << "EImportFinish.replay " << base
<< " success=" << success
3030 << " on subtree not marked as ambiguous"
3032 mds
->clog
->error() << "failure replaying journal (EImportFinish)";
3034 ceph_abort(); // Should be unreachable because damaged() calls respawn()
// Serialize this EImportFinish into bl (v3, compat 3): stamp, base dirfrag,
// then the success flag.
void EImportFinish::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  ::encode(stamp, bl);
  ::encode(base, bl);
  ::encode(success, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an EImportFinish from bl; stamp is only read for struct_v >= 2.
void EImportFinish::decode(bufferlist::iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    ::decode(stamp, bl);
  ::decode(base, bl);
  ::decode(success, bl);
  DECODE_FINISH(bl);
}
3057 void EImportFinish::dump(Formatter
*f
) const
3059 f
->dump_stream("base dirfrag") << base
;
3060 f
->dump_string("success", success
? "true" : "false");
3062 void EImportFinish::generate_test_instances(list
<EImportFinish
*>& ls
)
3064 ls
.push_back(new EImportFinish
);
3065 ls
.push_back(new EImportFinish
);
3066 ls
.back()->success
= true;
3070 // ------------------------
// Serialize this EResetJournal into bl (v2, compat 2); the only field is
// stamp.
void EResetJournal::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(2, 2, bl);
  ::encode(stamp, bl);
  ENCODE_FINISH(bl);
}
// Deserialize an EResetJournal from bl; the only field is stamp.
void EResetJournal::decode(bufferlist::iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  ::decode(stamp, bl);
  DECODE_FINISH(bl);
}
// Dump the event's stamp under the key "timestamp".
void EResetJournal::dump(Formatter *f) const
{
  f->dump_stream("timestamp") << stamp;
}
3092 void EResetJournal::generate_test_instances(list
<EResetJournal
*>& ls
)
3094 ls
.push_back(new EResetJournal());
3097 void EResetJournal::replay(MDSRank
*mds
)
3099 dout(1) << "EResetJournal" << dendl
;
3101 mds
->sessionmap
.wipe();
3102 mds
->inotable
->replay_reset();
3104 if (mds
->mdsmap
->get_root() == mds
->get_nodeid()) {
3105 CDir
*rootdir
= mds
->mdcache
->get_root()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3106 mds
->mdcache
->adjust_subtree_auth(rootdir
, mds
->get_nodeid());
3109 CDir
*mydir
= mds
->mdcache
->get_myin()->get_or_open_dirfrag(mds
->mdcache
, frag_t());
3110 mds
->mdcache
->adjust_subtree_auth(mydir
, mds
->get_nodeid());
3112 mds
->mdcache
->recalc_auth_bits(true);
3114 mds
->mdcache
->show_subtrees();
3118 void ENoOp::encode(bufferlist
&bl
, uint64_t features
) const
3120 ENCODE_START(2, 2, bl
);
3121 ::encode(pad_size
, bl
);
3122 uint8_t const pad
= 0xff;
3123 for (unsigned int i
= 0; i
< pad_size
; ++i
) {
3130 void ENoOp::decode(bufferlist::iterator
&bl
)
3132 DECODE_START(2, bl
);
3133 ::decode(pad_size
, bl
);
3134 if (bl
.get_remaining() != pad_size
) {
3135 // This is spiritually an assertion, but expressing in a way that will let
3136 // journal debug tools catch it and recognise a malformed entry.
3137 throw buffer::end_of_buffer();
3139 bl
.advance(pad_size
);
// Replaying a no-op changes nothing; it only logs how many padding bytes
// the entry occupied.
void ENoOp::replay(MDSRank *mds)
{
  dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
}
3151 * If re-formatting an old journal that used absolute log position
3152 * references as segment sequence numbers, use this function to update
3156 * MDSRank instance, just used for logging
3158 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3161 * True if the event was modified.
3163 bool EMetaBlob::rewrite_truncate_finish(MDSRank
const *mds
,
3164 std::map
<log_segment_seq_t
, log_segment_seq_t
> const &old_to_new
)
3166 bool modified
= false;
3167 map
<inodeno_t
, log_segment_seq_t
> new_trunc_finish
;
3168 for (std::map
<inodeno_t
, log_segment_seq_t
>::iterator i
= truncate_finish
.begin();
3169 i
!= truncate_finish
.end(); ++i
) {
3170 if (old_to_new
.count(i
->second
)) {
3171 dout(20) << __func__
<< " applying segment seq mapping "
3172 << i
->second
<< " -> " << old_to_new
.find(i
->second
)->second
<< dendl
;
3173 new_trunc_finish
[i
->first
] = old_to_new
.find(i
->second
)->second
;
3176 dout(20) << __func__
<< " no segment seq mapping found for "
3177 << i
->second
<< dendl
;
3178 new_trunc_finish
[i
->first
] = i
->second
;
3181 truncate_finish
= new_trunc_finish
;