]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/journal.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / mds / journal.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
20
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
24
25 #include "events/EUpdate.h"
26 #include "events/EPeerUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
29 #include "events/EPurged.h"
30
31 #include "events/EExport.h"
32 #include "events/EImportStart.h"
33 #include "events/EImportFinish.h"
34 #include "events/EFragment.h"
35
36 #include "events/ETableClient.h"
37 #include "events/ETableServer.h"
38
39 #include "include/stringify.h"
40
41 #include "LogSegment.h"
42
43 #include "MDSRank.h"
44 #include "MDLog.h"
45 #include "MDCache.h"
46 #include "Server.h"
47 #include "Migrator.h"
48 #include "Mutation.h"
49
50 #include "InoTable.h"
51 #include "MDSTableClient.h"
52 #include "MDSTableServer.h"
53
54 #include "Locker.h"
55
56 #define dout_context g_ceph_context
57 #define dout_subsys ceph_subsys_mds
58 #undef dout_prefix
59 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
60
61 using std::list;
62 using std::map;
63 using std::ostream;
64 using std::pair;
65 using std::set;
66 using std::string;
67 using std::vector;
68
69 // -----------------------
70 // LogSegment
71
72 struct BatchStoredBacktrace : public MDSIOContext {
73 MDSContext *fin;
74 std::vector<CInodeCommitOperations> ops_vec;
75
76 BatchStoredBacktrace(MDSRank *m, MDSContext *f,
77 std::vector<CInodeCommitOperations>&& ops) :
78 MDSIOContext(m), fin(f), ops_vec(std::move(ops)) {}
79 void finish(int r) override {
80 for (auto& op : ops_vec) {
81 op.in->_stored_backtrace(r, op.version, nullptr);
82 }
83 fin->complete(r);
84 }
85 void print(ostream& out) const override {
86 out << "batch backtrace_store";
87 }
88 };
89
90 struct BatchCommitBacktrace : public Context {
91 MDSRank *mds;
92 MDSContext *fin;
93 std::vector<CInodeCommitOperations> ops_vec;
94
95 BatchCommitBacktrace(MDSRank *m, MDSContext *f,
96 std::vector<CInodeCommitOperations>&& ops) :
97 mds(m), fin(f), ops_vec(std::move(ops)) {}
98 void finish(int r) override {
99 C_GatherBuilder gather(g_ceph_context);
100
101 for (auto &op : ops_vec) {
102 op.in->_commit_ops(r, gather, op.ops_vec, op.bt);
103 op.ops_vec.clear();
104 op.bt.clear();
105 }
106 ceph_assert(gather.has_subs());
107 gather.set_finisher(new C_OnFinisher(
108 new BatchStoredBacktrace(mds, fin, std::move(ops_vec)),
109 mds->finisher));
110 gather.activate();
111 }
112 };
113
114 void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
115 {
116 set<CDir*> commit;
117
118 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl;
119
120 ceph_assert(g_conf()->mds_kill_journal_expire_at != 1);
121
122 // commit dirs
123 for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) {
124 dout(20) << " new_dirfrag " << **p << dendl;
125 ceph_assert((*p)->is_auth());
126 commit.insert(*p);
127 }
128 for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) {
129 dout(20) << " dirty_dirfrag " << **p << dendl;
130 ceph_assert((*p)->is_auth());
131 commit.insert(*p);
132 }
133 for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) {
134 dout(20) << " dirty_dentry " << **p << dendl;
135 ceph_assert((*p)->is_auth());
136 commit.insert((*p)->get_dir());
137 }
138 for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) {
139 dout(20) << " dirty_inode " << **p << dendl;
140 ceph_assert((*p)->is_auth());
141 if ((*p)->is_base()) {
142 (*p)->store(gather_bld.new_sub());
143 } else
144 commit.insert((*p)->get_parent_dn()->get_dir());
145 }
146
147 if (!commit.empty()) {
148 for (set<CDir*>::iterator p = commit.begin();
149 p != commit.end();
150 ++p) {
151 CDir *dir = *p;
152 ceph_assert(dir->is_auth());
153 if (dir->can_auth_pin()) {
154 dout(15) << "try_to_expire committing " << *dir << dendl;
155 dir->commit(0, gather_bld.new_sub(), false, op_prio);
156 } else {
157 dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
158 dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
159 }
160 }
161 }
162
163 // leader ops with possibly uncommitted peers
164 for (set<metareqid_t>::iterator p = uncommitted_leaders.begin();
165 p != uncommitted_leaders.end();
166 ++p) {
167 dout(10) << "try_to_expire waiting for peers to ack commit on " << *p << dendl;
168 mds->mdcache->wait_for_uncommitted_leader(*p, gather_bld.new_sub());
169 }
170
171 // peer ops that haven't been committed
172 for (set<metareqid_t>::iterator p = uncommitted_peers.begin();
173 p != uncommitted_peers.end();
174 ++p) {
175 dout(10) << "try_to_expire waiting for leader to ack OP_FINISH on " << *p << dendl;
176 mds->mdcache->wait_for_uncommitted_peer(*p, gather_bld.new_sub());
177 }
178
179 // uncommitted fragments
180 for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin();
181 p != uncommitted_fragments.end();
182 ++p) {
183 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl;
184 mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub());
185 }
186
187 // nudge scatterlocks
188 for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
189 CInode *in = *p;
190 dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
191 mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub());
192 }
193 for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) {
194 CInode *in = *p;
195 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl;
196 mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub());
197 }
198 for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) {
199 CInode *in = *p;
200 dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl;
201 mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub());
202 }
203
204 ceph_assert(g_conf()->mds_kill_journal_expire_at != 2);
205
206 // open files and snap inodes
207 if (!open_files.empty()) {
208 ceph_assert(!mds->mdlog->is_capped()); // hmm FIXME
209 EOpen *le = 0;
210 LogSegment *ls = mds->mdlog->get_current_segment();
211 ceph_assert(ls != this);
212 elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file));
213 while (!p.end()) {
214 CInode *in = *p;
215 ++p;
216 if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) {
217 // journal snap inodes that need flush. This simplify the mds failover hanlding
218 dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
219 if (!le) {
220 le = new EOpen(mds->mdlog);
221 mds->mdlog->start_entry(le);
222 }
223 le->add_clean_inode(in);
224 ls->open_files.push_back(&in->item_open_file);
225 } else {
226 // open files are tracked by open file table, no need to journal them again
227 in->item_open_file.remove_myself();
228 }
229 }
230 if (le) {
231 mds->mdlog->submit_entry(le);
232 mds->mdlog->wait_for_safe(gather_bld.new_sub());
233 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
234 }
235 }
236
237 ceph_assert(g_conf()->mds_kill_journal_expire_at != 3);
238
239 size_t count = 0;
240 for (elist<CInode*>::iterator it = dirty_parent_inodes.begin(); !it.end(); ++it)
241 count++;
242
243 std::vector<CInodeCommitOperations> ops_vec;
244 ops_vec.reserve(count);
245 // backtraces to be stored/updated
246 for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
247 CInode *in = *p;
248 ceph_assert(in->is_auth());
249 if (in->can_auth_pin()) {
250 dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
251 ops_vec.resize(ops_vec.size() + 1);
252 in->store_backtrace(ops_vec.back(), op_prio);
253 } else {
254 dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
255 in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
256 }
257 }
258 if (!ops_vec.empty())
259 mds->finisher->queue(new BatchCommitBacktrace(mds, gather_bld.new_sub(), std::move(ops_vec)));
260
261 ceph_assert(g_conf()->mds_kill_journal_expire_at != 4);
262
263 // idalloc
264 if (inotablev > mds->inotable->get_committed_version()) {
265 dout(10) << "try_to_expire saving inotable table, need " << inotablev
266 << ", committed is " << mds->inotable->get_committed_version()
267 << " (" << mds->inotable->get_committing_version() << ")"
268 << dendl;
269 mds->inotable->save(gather_bld.new_sub(), inotablev);
270 }
271
272 // sessionmap
273 if (sessionmapv > mds->sessionmap.get_committed()) {
274 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
275 << ", committed is " << mds->sessionmap.get_committed()
276 << " (" << mds->sessionmap.get_committing() << ")"
277 << dendl;
278 mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
279 }
280
281 // updates to sessions for completed_requests
282 mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
283 touched_sessions.clear();
284
285 // pending commit atids
286 for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
287 p != pending_commit_tids.end();
288 ++p) {
289 MDSTableClient *client = mds->get_table_client(p->first);
290 ceph_assert(client);
291 for (ceph::unordered_set<version_t>::iterator q = p->second.begin();
292 q != p->second.end();
293 ++q) {
294 dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q
295 << " pending commit (not yet acked), waiting" << dendl;
296 ceph_assert(!client->has_committed(*q));
297 client->wait_for_ack(*q, gather_bld.new_sub());
298 }
299 }
300
301 // table servers
302 for (map<int, version_t>::iterator p = tablev.begin();
303 p != tablev.end();
304 ++p) {
305 MDSTableServer *server = mds->get_table_server(p->first);
306 ceph_assert(server);
307 if (p->second > server->get_committed_version()) {
308 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first)
309 << " to save, need " << p->second << dendl;
310 server->save(gather_bld.new_sub());
311 }
312 }
313
314 // truncating
315 for (set<CInode*>::iterator p = truncating_inodes.begin();
316 p != truncating_inodes.end();
317 ++p) {
318 dout(10) << "try_to_expire waiting for truncate of " << **p << dendl;
319 (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub());
320 }
321 // purge inodes
322 dout(10) << "try_to_expire waiting for purge of " << purging_inodes << dendl;
323 if (purging_inodes.size())
324 set_purged_cb(gather_bld.new_sub());
325
326 if (gather_bld.has_subs()) {
327 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl;
328 mds->mdlog->flush();
329 } else {
330 ceph_assert(g_conf()->mds_kill_journal_expire_at != 5);
331 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl;
332 }
333 }
334
335 // -----------------------
336 // EMetaBlob
337
338 void EMetaBlob::add_dir_context(CDir *dir, int mode)
339 {
340 MDSRank *mds = dir->mdcache->mds;
341
342 list<CDentry*> parents;
343
344 // it may be okay not to include the maybe items, if
345 // - we journaled the maybe child inode in this segment
346 // - that subtree turns out to be unambiguously auth
347 list<CDentry*> maybe;
348 bool maybenot = false;
349
350 while (true) {
351 // already have this dir? (we must always add in order)
352 if (lump_map.count(dir->dirfrag())) {
353 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl;
354 break;
355 }
356
357 // stop at root/stray
358 CInode *diri = dir->get_inode();
359 CDentry *parent = diri->get_projected_parent_dn();
360
361 if (mode == TO_AUTH_SUBTREE_ROOT) {
362 // subtree root?
363 if (dir->is_subtree_root()) {
364 // match logic in MDCache::create_subtree_map()
365 if (dir->get_dir_auth().first == mds->get_nodeid()) {
366 mds_authority_t parent_auth = parent ? parent->authority() : CDIR_AUTH_UNDEF;
367 if (parent_auth.first == dir->get_dir_auth().first) {
368 if (parent_auth.second == CDIR_AUTH_UNKNOWN &&
369 !dir->is_ambiguous_dir_auth() &&
370 !dir->state_test(CDir::STATE_EXPORTBOUND) &&
371 !dir->state_test(CDir::STATE_AUXSUBTREE) &&
372 !diri->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
373 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir << dendl;
374 ceph_abort();
375 }
376 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") ambiguous or transient subtree " << dendl;
377 } else {
378 // it's an auth subtree, we don't need maybe (if any), and we're done.
379 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
380 << " at " << *dir << dendl;
381 maybe.clear();
382 break;
383 }
384 } else {
385 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
386 << " at " << *dir << dendl;
387 // we need the maybe list after all!
388 parents.splice(parents.begin(), maybe);
389 maybenot = false;
390 }
391 }
392
393 // was the inode journaled in this blob?
394 if (event_seq && diri->last_journaled == event_seq) {
395 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl;
396 break;
397 }
398
399 // have we journaled this inode since the last subtree map?
400 if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) {
401 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment ("
402 << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag "
403 << *diri << dendl;
404 maybenot = true;
405 }
406 }
407
408 if (!parent)
409 break;
410
411 if (maybenot) {
412 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl;
413 maybe.push_front(parent);
414 } else {
415 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl;
416 parents.push_front(parent);
417 }
418
419 dir = parent->get_dir();
420 }
421
422 parents.splice(parents.begin(), maybe);
423
424 dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
425 for (const auto& dentry : parents) {
426 ceph_assert(dentry->get_projected_linkage()->is_primary());
427 add_dentry(dentry, false);
428 }
429 }
430
431 void EMetaBlob::update_segment(LogSegment *ls)
432 {
433 // dirty inode mtimes
434 // -> handled directly by Server.cc, replay()
435
436 // alloc table update?
437 if (inotablev)
438 ls->inotablev = inotablev;
439 if (sessionmapv)
440 ls->sessionmapv = sessionmapv;
441
442 // truncated inodes
443 // -> handled directly by Server.cc
444
445 // client requests
446 // note the newest request per client
447 //if (!client_reqs.empty())
448 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
449 }
450
451 // EMetaBlob::fullbit
452
453 void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const {
454 ENCODE_START(9, 5, bl);
455 encode(dn, bl);
456 encode(dnfirst, bl);
457 encode(dnlast, bl);
458 encode(dnv, bl);
459 encode(*inode, bl, features);
460 if (xattrs)
461 encode(*xattrs, bl);
462 else
463 encode((__u32)0, bl);
464
465 if (inode->is_symlink())
466 encode(symlink, bl);
467 if (inode->is_dir()) {
468 encode(dirfragtree, bl);
469 encode(snapbl, bl);
470 }
471 encode(state, bl);
472 if (!old_inodes || old_inodes->empty()) {
473 encode(false, bl);
474 } else {
475 encode(true, bl);
476 encode(*old_inodes, bl, features);
477 }
478 if (!inode->is_dir())
479 encode(snapbl, bl);
480 encode(oldest_snap, bl);
481 encode(alternate_name, bl);
482 ENCODE_FINISH(bl);
483 }
484
485 void EMetaBlob::fullbit::decode(bufferlist::const_iterator &bl) {
486 DECODE_START(9, bl);
487 decode(dn, bl);
488 decode(dnfirst, bl);
489 decode(dnlast, bl);
490 decode(dnv, bl);
491 {
492 auto _inode = CInode::allocate_inode();
493 decode(*_inode, bl);
494 inode = std::move(_inode);
495 }
496 {
497 CInode::mempool_xattr_map tmp;
498 decode_noshare(tmp, bl);
499 if (!tmp.empty())
500 xattrs = CInode::allocate_xattr_map(std::move(tmp));
501 }
502 if (inode->is_symlink())
503 decode(symlink, bl);
504 if (inode->is_dir()) {
505 decode(dirfragtree, bl);
506 decode(snapbl, bl);
507 }
508 decode(state, bl);
509 bool old_inodes_present;
510 decode(old_inodes_present, bl);
511 if (old_inodes_present) {
512 auto _old_inodes = CInode::allocate_old_inode_map();
513 decode(*_old_inodes, bl);
514 old_inodes = std::move(_old_inodes);
515 }
516 if (!inode->is_dir()) {
517 decode(snapbl, bl);
518 }
519 decode(oldest_snap, bl);
520 if (struct_v >= 9) {
521 decode(alternate_name, bl);
522 }
523 DECODE_FINISH(bl);
524 }
525
526 void EMetaBlob::fullbit::dump(Formatter *f) const
527 {
528 f->dump_string("dentry", dn);
529 f->dump_stream("snapid.first") << dnfirst;
530 f->dump_stream("snapid.last") << dnlast;
531 f->dump_int("dentry version", dnv);
532 f->open_object_section("inode");
533 inode->dump(f);
534 f->close_section(); // inode
535 f->open_object_section("xattrs");
536 if (xattrs) {
537 for (const auto &p : *xattrs) {
538 std::string s(p.second.c_str(), p.second.length());
539 f->dump_string(p.first.c_str(), s);
540 }
541 }
542 f->close_section(); // xattrs
543 if (inode->is_symlink()) {
544 f->dump_string("symlink", symlink);
545 }
546 if (inode->is_dir()) {
547 f->dump_stream("frag tree") << dirfragtree;
548 f->dump_string("has_snapbl", snapbl.length() ? "true" : "false");
549 if (inode->has_layout()) {
550 f->open_object_section("file layout policy");
551 // FIXME
552 f->dump_string("layout", "the layout exists");
553 f->close_section(); // file layout policy
554 }
555 }
556 f->dump_string("state", state_string());
557 if (old_inodes && !old_inodes->empty()) {
558 f->open_array_section("old inodes");
559 for (const auto &p : *old_inodes) {
560 f->open_object_section("inode");
561 f->dump_int("snapid", p.first);
562 p.second.dump(f);
563 f->close_section(); // inode
564 }
565 f->close_section(); // old inodes
566 }
567 f->dump_string("alternate_name", alternate_name);
568 }
569
570 void EMetaBlob::fullbit::generate_test_instances(std::list<EMetaBlob::fullbit*>& ls)
571 {
572 auto _inode = CInode::allocate_inode();
573 fragtree_t fragtree;
574 auto _xattrs = CInode::allocate_xattr_map();
575 bufferlist empty_snapbl;
576 fullbit *sample = new fullbit("/testdn", "", 0, 0, 0,
577 _inode, fragtree, _xattrs, "", 0, empty_snapbl,
578 false, NULL);
579 ls.push_back(sample);
580 }
581
582 void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
583 {
584 in->reset_inode(std::move(inode));
585 in->reset_xattrs(std::move(xattrs));
586 if (in->is_dir()) {
587 if (is_export_ephemeral_random()) {
588 dout(15) << "random ephemeral pin on " << *in << dendl;
589 in->set_ephemeral_pin(false, true);
590 }
591 in->maybe_export_pin();
592 if (!(in->dirfragtree == dirfragtree)) {
593 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
594 << dirfragtree << " on " << *in << dendl;
595 in->dirfragtree = std::move(dirfragtree);
596 in->force_dirfrags();
597 if (in->get_num_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) {
598 auto&& ls = in->get_nested_dirfrags();
599 for (const auto& dir : ls) {
600 if (dir->get_num_any() == 0 &&
601 mds->mdcache->can_trim_non_auth_dirfrag(dir)) {
602 dout(10) << " closing empty non-auth dirfrag " << *dir << dendl;
603 in->close_dirfrag(dir->get_frag());
604 }
605 }
606 }
607 }
608 } else if (in->is_symlink()) {
609 in->symlink = symlink;
610 }
611 in->reset_old_inodes(std::move(old_inodes));
612 if (in->is_any_old_inodes()) {
613 snapid_t min_first = in->get_old_inodes()->rbegin()->first + 1;
614 if (min_first > in->first)
615 in->first = min_first;
616 }
617
618 /*
619 * we can do this before linking hte inode bc the split_at would
620 * be a no-op.. we have no children (namely open snaprealms) to
621 * divy up
622 */
623 in->oldest_snap = oldest_snap;
624 in->decode_snap_blob(snapbl);
625
626 /*
627 * In case there was anything malformed in the journal that we are
628 * replaying, do sanity checks on the inodes we're replaying and
629 * go damaged instead of letting any trash into a live cache
630 */
631 if (in->is_file()) {
632 // Files must have valid layouts with a pool set
633 if (in->get_inode()->layout.pool_id == -1 ||
634 !in->get_inode()->layout.is_valid()) {
635 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
636 << ": " << in->get_inode()->layout << dendl;
637 CachedStackStringStream css;
638 *css << "Invalid layout for inode " << in->ino() << " in journal";
639 mds->clog->error() << css->strv();
640 mds->damaged();
641 ceph_abort(); // Should be unreachable because damaged() calls respawn()
642 }
643 }
644 }
645
646 // EMetaBlob::remotebit
647
648 void EMetaBlob::remotebit::encode(bufferlist& bl) const
649 {
650 ENCODE_START(3, 2, bl);
651 encode(dn, bl);
652 encode(dnfirst, bl);
653 encode(dnlast, bl);
654 encode(dnv, bl);
655 encode(ino, bl);
656 encode(d_type, bl);
657 encode(dirty, bl);
658 encode(alternate_name, bl);
659 ENCODE_FINISH(bl);
660 }
661
662 void EMetaBlob::remotebit::decode(bufferlist::const_iterator &bl)
663 {
664 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
665 decode(dn, bl);
666 decode(dnfirst, bl);
667 decode(dnlast, bl);
668 decode(dnv, bl);
669 decode(ino, bl);
670 decode(d_type, bl);
671 decode(dirty, bl);
672 if (struct_v >= 3)
673 decode(alternate_name, bl);
674 DECODE_FINISH(bl);
675 }
676
677 void EMetaBlob::remotebit::dump(Formatter *f) const
678 {
679 f->dump_string("dentry", dn);
680 f->dump_int("snapid.first", dnfirst);
681 f->dump_int("snapid.last", dnlast);
682 f->dump_int("dentry version", dnv);
683 f->dump_int("inodeno", ino);
684 uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries
685 string type_string;
686 switch(type) {
687 case S_IFREG:
688 type_string = "file"; break;
689 case S_IFLNK:
690 type_string = "symlink"; break;
691 case S_IFDIR:
692 type_string = "directory"; break;
693 case S_IFIFO:
694 type_string = "fifo"; break;
695 case S_IFCHR:
696 type_string = "chr"; break;
697 case S_IFBLK:
698 type_string = "blk"; break;
699 case S_IFSOCK:
700 type_string = "sock"; break;
701 default:
702 assert (0 == "unknown d_type!");
703 }
704 f->dump_string("d_type", type_string);
705 f->dump_string("dirty", dirty ? "true" : "false");
706 f->dump_string("alternate_name", alternate_name);
707 }
708
709 void EMetaBlob::remotebit::
710 generate_test_instances(std::list<EMetaBlob::remotebit*>& ls)
711 {
712 remotebit *remote = new remotebit("/test/dn", "", 0, 10, 15, 1, IFTODT(S_IFREG), false);
713 ls.push_back(remote);
714 remote = new remotebit("/test/dn2", "foo", 0, 10, 15, 1, IFTODT(S_IFREG), false);
715 ls.push_back(remote);
716 }
717
718 // EMetaBlob::nullbit
719
720 void EMetaBlob::nullbit::encode(bufferlist& bl) const
721 {
722 ENCODE_START(2, 2, bl);
723 encode(dn, bl);
724 encode(dnfirst, bl);
725 encode(dnlast, bl);
726 encode(dnv, bl);
727 encode(dirty, bl);
728 ENCODE_FINISH(bl);
729 }
730
731 void EMetaBlob::nullbit::decode(bufferlist::const_iterator &bl)
732 {
733 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
734 decode(dn, bl);
735 decode(dnfirst, bl);
736 decode(dnlast, bl);
737 decode(dnv, bl);
738 decode(dirty, bl);
739 DECODE_FINISH(bl);
740 }
741
742 void EMetaBlob::nullbit::dump(Formatter *f) const
743 {
744 f->dump_string("dentry", dn);
745 f->dump_int("snapid.first", dnfirst);
746 f->dump_int("snapid.last", dnlast);
747 f->dump_int("dentry version", dnv);
748 f->dump_string("dirty", dirty ? "true" : "false");
749 }
750
751 void EMetaBlob::nullbit::generate_test_instances(std::list<nullbit*>& ls)
752 {
753 nullbit *sample = new nullbit("/test/dentry", 0, 10, 15, false);
754 nullbit *sample2 = new nullbit("/test/dirty", 10, 20, 25, true);
755 ls.push_back(sample);
756 ls.push_back(sample2);
757 }
758
759 // EMetaBlob::dirlump
760
761 void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const
762 {
763 ENCODE_START(2, 2, bl);
764 encode(*fnode, bl);
765 encode(state, bl);
766 encode(nfull, bl);
767 encode(nremote, bl);
768 encode(nnull, bl);
769 _encode_bits(features);
770 encode(dnbl, bl);
771 ENCODE_FINISH(bl);
772 }
773
774 void EMetaBlob::dirlump::decode(bufferlist::const_iterator &bl)
775 {
776 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
777 {
778 auto _fnode = CDir::allocate_fnode();
779 decode(*_fnode, bl);
780 fnode = std::move(_fnode);
781 }
782 decode(state, bl);
783 decode(nfull, bl);
784 decode(nremote, bl);
785 decode(nnull, bl);
786 decode(dnbl, bl);
787 dn_decoded = false; // don't decode bits unless we need them.
788 DECODE_FINISH(bl);
789 }
790
791 void EMetaBlob::dirlump::dump(Formatter *f) const
792 {
793 if (!dn_decoded) {
794 dirlump *me = const_cast<dirlump*>(this);
795 me->_decode_bits();
796 }
797 f->open_object_section("fnode");
798 fnode->dump(f);
799 f->close_section(); // fnode
800 f->dump_string("state", state_string());
801 f->dump_int("nfull", nfull);
802 f->dump_int("nremote", nremote);
803 f->dump_int("nnull", nnull);
804
805 f->open_array_section("full bits");
806 for (const auto& iter : dfull) {
807 f->open_object_section("fullbit");
808 iter.dump(f);
809 f->close_section(); // fullbit
810 }
811 f->close_section(); // full bits
812 f->open_array_section("remote bits");
813 for (const auto& iter : dremote) {
814 f->open_object_section("remotebit");
815 iter.dump(f);
816 f->close_section(); // remotebit
817 }
818 f->close_section(); // remote bits
819 f->open_array_section("null bits");
820 for (const auto& iter : dnull) {
821 f->open_object_section("null bit");
822 iter.dump(f);
823 f->close_section(); // null bit
824 }
825 f->close_section(); // null bits
826 }
827
828 void EMetaBlob::dirlump::generate_test_instances(std::list<dirlump*>& ls)
829 {
830 auto dl = new dirlump();
831 dl->fnode = CDir::allocate_fnode();
832 ls.push_back(dl);
833 }
834
835 /**
836 * EMetaBlob proper
837 */
838 void EMetaBlob::encode(bufferlist& bl, uint64_t features) const
839 {
840 ENCODE_START(8, 5, bl);
841 encode(lump_order, bl);
842 encode(lump_map, bl, features);
843 encode(roots, bl, features);
844 encode(table_tids, bl);
845 encode(opened_ino, bl);
846 encode(allocated_ino, bl);
847 encode(used_preallocated_ino, bl);
848 encode(preallocated_inos, bl);
849 encode(client_name, bl);
850 encode(inotablev, bl);
851 encode(sessionmapv, bl);
852 encode(truncate_start, bl);
853 encode(truncate_finish, bl);
854 encode(destroyed_inodes, bl);
855 encode(client_reqs, bl);
856 encode(renamed_dirino, bl);
857 encode(renamed_dir_frags, bl);
858 {
859 // make MDSRank use v6 format happy
860 int64_t i = -1;
861 bool b = false;
862 encode(i, bl);
863 encode(b, bl);
864 }
865 encode(client_flushes, bl);
866 ENCODE_FINISH(bl);
867 }
868 void EMetaBlob::decode(bufferlist::const_iterator &bl)
869 {
870 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
871 decode(lump_order, bl);
872 decode(lump_map, bl);
873 if (struct_v >= 4) {
874 decode(roots, bl);
875 } else {
876 bufferlist rootbl;
877 decode(rootbl, bl);
878 if (rootbl.length()) {
879 auto p = rootbl.cbegin();
880 roots.emplace_back(p);
881 }
882 }
883 decode(table_tids, bl);
884 decode(opened_ino, bl);
885 decode(allocated_ino, bl);
886 decode(used_preallocated_ino, bl);
887 decode(preallocated_inos, bl);
888 decode(client_name, bl);
889 decode(inotablev, bl);
890 decode(sessionmapv, bl);
891 decode(truncate_start, bl);
892 decode(truncate_finish, bl);
893 decode(destroyed_inodes, bl);
894 if (struct_v >= 2) {
895 decode(client_reqs, bl);
896 } else {
897 list<metareqid_t> r;
898 decode(r, bl);
899 while (!r.empty()) {
900 client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0));
901 r.pop_front();
902 }
903 }
904 if (struct_v >= 3) {
905 decode(renamed_dirino, bl);
906 decode(renamed_dir_frags, bl);
907 }
908 if (struct_v >= 6) {
909 // ignore
910 int64_t i;
911 bool b;
912 decode(i, bl);
913 decode(b, bl);
914 }
915 if (struct_v >= 8) {
916 decode(client_flushes, bl);
917 }
918 DECODE_FINISH(bl);
919 }
920
921
922 /**
923 * Get all inodes touched by this metablob. Includes the 'bits' within
924 * dirlumps, and the inodes of the dirs themselves.
925 */
926 void EMetaBlob::get_inodes(
927 std::set<inodeno_t> &inodes) const
928 {
929 // For all dirlumps in this metablob
930 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
931 // Record inode of dirlump
932 inodeno_t const dir_ino = i->first.ino;
933 inodes.insert(dir_ino);
934
935 // Decode dirlump bits
936 dirlump const &dl = i->second;
937 dl._decode_bits();
938
939 // Record inodes of fullbits
940 for (const auto& iter : dl.get_dfull()) {
941 inodes.insert(iter.inode->ino);
942 }
943
944 // Record inodes of remotebits
945 for (const auto& iter : dl.get_dremote()) {
946 inodes.insert(iter.ino);
947 }
948 }
949 }
950
951
952 /**
953 * Get a map of dirfrag to set of dentries in that dirfrag which are
954 * touched in this operation.
955 */
956 void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const
957 {
958 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
959 dirlump const &dl = i->second;
960 dirfrag_t const &df = i->first;
961
962 // Get all bits
963 dl._decode_bits();
964
965 // For all bits, store dentry
966 for (const auto& iter : dl.get_dfull()) {
967 dentries[df].insert(iter.dn);
968 }
969 for (const auto& iter : dl.get_dremote()) {
970 dentries[df].insert(iter.dn);
971 }
972 for (const auto& iter : dl.get_dnull()) {
973 dentries[df].insert(iter.dn);
974 }
975 }
976 }
977
978
979
980 /**
981 * Calculate all paths that we can infer are touched by this metablob. Only uses
982 * information local to this metablob so it may only be the path within the
983 * subtree.
984 */
985 void EMetaBlob::get_paths(
986 std::vector<std::string> &paths) const
987 {
988 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
989 typedef std::pair<inodeno_t, std::string> Location;
990
991 // Whenever we see a dentry within a dirlump, we remember it as a child of
992 // the dirlump's inode
993 std::map<inodeno_t, std::vector<std::string> > children;
994
995 // Whenever we see a location for an inode, remember it: this allows us to
996 // build a path given an inode
997 std::map<inodeno_t, Location> ino_locations;
998
999 // Special case: operations on root inode populate roots but not dirlumps
1000 if (lump_map.empty() && !roots.empty()) {
1001 paths.push_back("/");
1002 return;
1003 }
1004
1005 // First pass
1006 // ==========
1007 // Build a tiny local metadata cache for the path structure in this metablob
1008 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
1009 inodeno_t const dir_ino = i->first.ino;
1010 dirlump const &dl = i->second;
1011 dl._decode_bits();
1012
1013 for (const auto& iter : dl.get_dfull()) {
1014 std::string_view dentry = iter.dn;
1015 children[dir_ino].emplace_back(dentry);
1016 ino_locations[iter.inode->ino] = Location(dir_ino, dentry);
1017 }
1018
1019 for (const auto& iter : dl.get_dremote()) {
1020 std::string_view dentry = iter.dn;
1021 children[dir_ino].emplace_back(dentry);
1022 }
1023
1024 for (const auto& iter : dl.get_dnull()) {
1025 std::string_view dentry = iter.dn;
1026 children[dir_ino].emplace_back(dentry);
1027 }
1028 }
1029
1030 std::vector<Location> leaf_locations;
1031
1032 // Second pass
1033 // ===========
1034 // Output paths for all childless nodes in the metablob
1035 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
1036 inodeno_t const dir_ino = i->first.ino;
1037 dirlump const &dl = i->second;
1038 dl._decode_bits();
1039
1040 for (const auto& iter : dl.get_dfull()) {
1041 std::string_view dentry = iter.dn;
1042 if (children.find(iter.inode->ino) == children.end()) {
1043 leaf_locations.push_back(Location(dir_ino, dentry));
1044 }
1045 }
1046
1047 for (const auto& iter : dl.get_dremote()) {
1048 std::string_view dentry = iter.dn;
1049 leaf_locations.push_back(Location(dir_ino, dentry));
1050 }
1051
1052 for (const auto& iter : dl.get_dnull()) {
1053 std::string_view dentry = iter.dn;
1054 leaf_locations.push_back(Location(dir_ino, dentry));
1055 }
1056 }
1057
1058 // For all the leaf locations identified, generate paths
1059 for (std::vector<Location>::iterator i = leaf_locations.begin(); i != leaf_locations.end(); ++i) {
1060 Location const &loc = *i;
1061 std::string path = loc.second;
1062 inodeno_t ino = loc.first;
1063 std::map<inodeno_t, Location>::iterator iter = ino_locations.find(ino);
1064 while(iter != ino_locations.end()) {
1065 Location const &loc = iter->second;
1066 if (!path.empty()) {
1067 path = loc.second + "/" + path;
1068 } else {
1069 path = loc.second + path;
1070 }
1071 iter = ino_locations.find(loc.first);
1072 }
1073
1074 paths.push_back(path);
1075 }
1076 }
1077
1078
1079 void EMetaBlob::dump(Formatter *f) const
1080 {
1081 f->open_array_section("lumps");
1082 for (const auto& d : lump_order) {
1083 f->open_object_section("lump");
1084 f->open_object_section("dirfrag");
1085 f->dump_stream("dirfrag") << d;
1086 f->close_section(); // dirfrag
1087 f->open_object_section("dirlump");
1088 lump_map.at(d).dump(f);
1089 f->close_section(); // dirlump
1090 f->close_section(); // lump
1091 }
1092 f->close_section(); // lumps
1093
1094 f->open_array_section("roots");
1095 for (const auto& iter : roots) {
1096 f->open_object_section("root");
1097 iter.dump(f);
1098 f->close_section(); // root
1099 }
1100 f->close_section(); // roots
1101
1102 f->open_array_section("tableclient tranactions");
1103 for (const auto& p : table_tids) {
1104 f->open_object_section("transaction");
1105 f->dump_int("tid", p.first);
1106 f->dump_int("version", p.second);
1107 f->close_section(); // transaction
1108 }
1109 f->close_section(); // tableclient transactions
1110
1111 f->dump_int("renamed directory inodeno", renamed_dirino);
1112
1113 f->open_array_section("renamed directory fragments");
1114 for (const auto& p : renamed_dir_frags) {
1115 f->dump_int("frag", p);
1116 }
1117 f->close_section(); // renamed directory fragments
1118
1119 f->dump_int("inotable version", inotablev);
1120 f->dump_int("SessionMap version", sessionmapv);
1121 f->dump_int("allocated ino", allocated_ino);
1122
1123 f->dump_stream("preallocated inos") << preallocated_inos;
1124 f->dump_int("used preallocated ino", used_preallocated_ino);
1125
1126 f->open_object_section("client name");
1127 client_name.dump(f);
1128 f->close_section(); // client name
1129
1130 f->open_array_section("inodes starting a truncate");
1131 for(const auto& ino : truncate_start) {
1132 f->dump_int("inodeno", ino);
1133 }
1134 f->close_section(); // truncate inodes
1135 f->open_array_section("inodes finishing a truncated");
1136 for(const auto& p : truncate_finish) {
1137 f->open_object_section("inode+segment");
1138 f->dump_int("inodeno", p.first);
1139 f->dump_int("truncate starting segment", p.second);
1140 f->close_section(); // truncated inode
1141 }
1142 f->close_section(); // truncate finish inodes
1143
1144 f->open_array_section("destroyed inodes");
1145 for(vector<inodeno_t>::const_iterator i = destroyed_inodes.begin();
1146 i != destroyed_inodes.end(); ++i) {
1147 f->dump_int("inodeno", *i);
1148 }
1149 f->close_section(); // destroyed inodes
1150
1151 f->open_array_section("client requests");
1152 for(const auto& p : client_reqs) {
1153 f->open_object_section("Client request");
1154 f->dump_stream("request ID") << p.first;
1155 f->dump_int("oldest request on client", p.second);
1156 f->close_section(); // request
1157 }
1158 f->close_section(); // client requests
1159 }
1160
1161 void EMetaBlob::generate_test_instances(std::list<EMetaBlob*>& ls)
1162 {
1163 ls.push_back(new EMetaBlob());
1164 }
1165
/**
 * Replay this metablob into the MDS cache.
 *
 * The blob is applied in a fixed order: root inodes, each dirlump in
 * lump_order (dirfrag state, then full, remote and null dentries), subtree
 * fix-ups for a renamed directory, disposal of inodes that were unlinked and
 * not relinked, table client transactions, the opened inode, inotable and
 * sessionmap versions, truncates, destroyed inodes, and completed client
 * requests and flushes. The last step is update_segment(logseg).
 *
 * @param mds     rank whose cache, tables and sessionmap are updated
 * @param logseg  segment that dirty items are attached to; asserted non-null
 * @param peerup  may be null; when set, old directories and unlinked inodes
 *                are recorded in it instead of being trimmed or removed
 */
void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDPeerUpdate *peerup)
{
  dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;

  ceph_assert(logseg);

  // Aborts here when mds_kill_journal_replay_at == 1. The same pattern recurs
  // below with 2, 3 and 4 (presumably a fault-injection hook -- confirm).
  ceph_assert(g_conf()->mds_kill_journal_replay_at != 1);

  // root inodes: update the cached inode, or create and add it if missing
  for (auto& p : roots) {
    CInode *in = mds->mdcache->get_inode(p.inode->ino);
    bool isnew = in ? false:true;
    if (!in)
      in = new CInode(mds->mdcache, false, 2, CEPH_NOSNAP);
    p.update_inode(mds, in);

    if (isnew)
      mds->mdcache->add_inode(in);
    if (p.is_dirty()) in->_mark_dirty(logseg);
    dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
  }

  // If this blob records a directory rename, look up the renamed inode now
  // (it may not be in cache yet) and check that the blob holds at most one
  // null dentry across all lumps.
  CInode *renamed_diri = 0;
  CDir *olddir = 0;
  if (renamed_dirino) {
    renamed_diri = mds->mdcache->get_inode(renamed_dirino);
    if (renamed_diri)
      dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl;
    else
      dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl;

    int nnull = 0;
    for (const auto& lp : lump_order) {
      dirlump &lump = lump_map[lp];
      if (lump.nnull) {
        dout(10) << "EMetaBlob.replay found null dentry in dir " << lp << dendl;
        nnull += lump.nnull;
      }
    }
    ceph_assert(nnull <= 1);
  }

  // keep track of any inodes we unlink and don't relink elsewhere
  // (unlinked: inode -> directory it was unlinked from; linked: inodes from
  // `unlinked` that were subsequently linked again)
  map<CInode*, CDir*> unlinked;
  set<CInode*> linked;

  // walk through my dirs (in order!)
  // `count` is a running item counter; every heartbeat_reset_grace() items
  // the MDS heartbeat is reset.
  int count = 0;
  for (const auto& lp : lump_order) {
    dout(10) << "EMetaBlob.replay dir " << lp << dendl;
    dirlump &lump = lump_map[lp];

    // the dir
    CDir *dir = mds->mdcache->get_force_dirfrag(lp, true);
    if (!dir) {
      // hmm.  do i have the inode?
      CInode *diri = mds->mdcache->get_inode((lp).ino);
      if (!diri) {
        if (MDS_INO_IS_MDSDIR(lp.ino)) {
          // an mdsdir inode other than this rank's own: create it as a
          // non-auth system inode
          ceph_assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp.ino);
          diri = mds->mdcache->create_system_inode(lp.ino, S_IFDIR|0755);
          diri->state_clear(CInode::STATE_AUTH);
          dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
        } else {
          // the directory inode is unknown and cannot be synthesized:
          // report the journal as damaged
          dout(0) << "EMetaBlob.replay missing dir ino  " << lp.ino << dendl;
          mds->clog->error() << "failure replaying journal (EMetaBlob)";
          mds->damaged();
          ceph_abort();  // Should be unreachable because damaged() calls respawn()
        }
      }

      // create the dirfrag
      dir = diri->get_or_open_dirfrag(mds->mdcache, lp.frag);

      if (MDS_INO_IS_BASE(lp.ino))
        mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);

      dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;
    }
    // NOTE: lump.fnode is moved from here.
    dir->reset_fnode(std::move(lump.fnode));
    dir->update_projected_version();

    if (lump.is_importing()) {
      dir->state_set(CDir::STATE_AUTH);
      dir->state_clear(CDir::STATE_COMPLETE);
    }
    if (lump.is_dirty()) {
      dir->_mark_dirty(logseg);

      // rstat / fragstat that differ from their accounted_* counterparts mark
      // the corresponding scatterlock as updated and queue the inode on the
      // segment's dirty list
      if (!(dir->get_fnode()->rstat == dir->get_fnode()->accounted_rstat)) {
        dout(10) << "EMetaBlob.replay      dirty nestinfo on " << *dir << dendl;
        mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
        logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
      } else {
        dout(10) << "EMetaBlob.replay      clean nestinfo on " << *dir << dendl;
      }
      if (!(dir->get_fnode()->fragstat == dir->get_fnode()->accounted_fragstat)) {
        dout(10) << "EMetaBlob.replay      dirty fragstat on " << *dir << dendl;
        mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
        logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
      } else {
        dout(10) << "EMetaBlob.replay      clean fragstat on " << *dir << dendl;
      }
    }
    if (lump.is_dirty_dft()) {
      dout(10) << "EMetaBlob.replay      dirty dirfragtree on " << *dir << dendl;
      dir->state_set(CDir::STATE_DIRTYDFT);
      mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock);
      logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree);
    }
    if (lump.is_new())
      dir->mark_new(logseg);
    if (lump.is_complete())
      dir->mark_complete();

    dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;

    // decode bits
    lump._decode_bits();

    // full dentry+inode pairs
    for (auto& fb : lump._get_dfull()) {
      // find or create the dentry, then bring its version/range up to date
      CDentry *dn = dir->lookup_exact_snap(fb.dn, fb.dnlast);
      if (!dn) {
        dn = dir->add_null_dentry(fb.dn, fb.dnfirst, fb.dnlast);
        dn->set_version(fb.dnv);
        if (fb.is_dirty()) dn->_mark_dirty(logseg);
        dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl;
      } else {
        dn->set_version(fb.dnv);
        if (fb.is_dirty()) dn->_mark_dirty(logseg);
        dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *dn << dendl;
        dn->first = fb.dnfirst;
        ceph_assert(dn->last == fb.dnlast);
      }
      if (lump.is_importing())
        dn->mark_auth();

      CInode *in = mds->mdcache->get_inode(fb.inode->ino, fb.dnlast);
      if (!in) {
        // inode not cached: create it and link it as primary under dn,
        // first unlinking whatever dn pointed at before
        in = new CInode(mds->mdcache, dn->is_auth(), fb.dnfirst, fb.dnlast);
        fb.update_inode(mds, in);
        mds->mdcache->add_inode(in);
        if (!dn->get_linkage()->is_null()) {
          if (dn->get_linkage()->is_primary()) {
            unlinked[dn->get_linkage()->get_inode()] = dir;
            CachedStackStringStream css;
            *css << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
               << " " << *dn->get_linkage()->get_inode() << " should be " << in->ino();
            dout(0) << css->strv() << dendl;
            mds->clog->warn() << css->strv();
          }
          dir->unlink_inode(dn, false);
        }
        if (unlinked.count(in))
          linked.insert(in);
        dir->link_primary_inode(dn, in);
        dout(10) << "EMetaBlob.replay added " << *in << dendl;
      } else {
        // inode already cached: refresh it and, if it is not what dn points
        // at, detach it from its current parent and relink it under dn
        in->first = fb.dnfirst;
        fb.update_inode(mds, in);
        if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
          dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
          unlinked[in] = in->get_parent_dir();
          in->get_parent_dir()->unlink_inode(in->get_parent_dn());
        }
        if (dn->get_linkage()->get_inode() != in) {
          if (!dn->get_linkage()->is_null()) { // note: might be remote.  as with stray reintegration.
            if (dn->get_linkage()->is_primary()) {
              unlinked[dn->get_linkage()->get_inode()] = dir;
              CachedStackStringStream css;
              *css << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
                 << " " << *dn->get_linkage()->get_inode() << " should be " << in->ino();
              dout(0) << css->strv() << dendl;
              mds->clog->warn() << css->strv();
            }
            dir->unlink_inode(dn, false);
          }
          if (unlinked.count(in))
            linked.insert(in);
          dir->link_primary_inode(dn, in);
          dout(10) << "EMetaBlob.replay linked " << *in << dendl;
        } else {
          dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *in << dendl;
        }
        ceph_assert(in->first == fb.dnfirst ||
               (in->is_multiversion() && in->first > fb.dnfirst));
      }
      if (fb.is_dirty())
        in->_mark_dirty(logseg);
      if (fb.is_dirty_parent())
        in->mark_dirty_parent(logseg, fb.is_dirty_pool());
      if (fb.need_snapflush())
        logseg->open_files.push_back(&in->item_open_file);
      // the inode's auth bit follows the dentry's
      if (dn->is_auth())
        in->state_set(CInode::STATE_AUTH);
      else
        in->state_clear(CInode::STATE_AUTH);
      ceph_assert(g_conf()->mds_kill_journal_replay_at != 2);

      {
        // Optional corruption injection: when the configured value is > 0,
        // draw a random number from generate_random_number(0.0, 1.0) and, if
        // it is below that value, overwrite dn->first with -10.
        auto do_corruption = mds->get_inject_journal_corrupt_dentry_first();
        if (unlikely(do_corruption > 0.0)) {
          auto r = ceph::util::generate_random_number(0.0, 1.0);
          if (r < do_corruption) {
            dout(0) << "corrupting dn: " << *dn << dendl;
            dn->first = -10;
          }
        }
      }

      if (!(++count % mds->heartbeat_reset_grace()))
        mds->heartbeat_reset();
    }

    // remote dentries
    for (const auto& rb : lump.get_dremote()) {
      CDentry *dn = dir->lookup_exact_snap(rb.dn, rb.dnlast);
      if (!dn) {
        dn = dir->add_remote_dentry(rb.dn, rb.ino, rb.d_type, mempool::mds_co::string(rb.alternate_name), rb.dnfirst, rb.dnlast);
        dn->set_version(rb.dnv);
        if (rb.dirty) dn->_mark_dirty(logseg);
        dout(10) << "EMetaBlob.replay added " << *dn << dendl;
      } else {
        // existing dentry: drop its current linkage (if any), then link the
        // remote inode
        if (!dn->get_linkage()->is_null()) {
          dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
          if (dn->get_linkage()->is_primary()) {
            unlinked[dn->get_linkage()->get_inode()] = dir;
            CachedStackStringStream css;
            *css << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
               << " " << *dn->get_linkage()->get_inode() << " should be remote " << rb.ino;
            // NOTE(review): unlike the full-dentry paths above, this message
            // is only written to dout and not to mds->clog.
            dout(0) << css->strv() << dendl;
          }
          dir->unlink_inode(dn, false);
        }
        dn->set_alternate_name(mempool::mds_co::string(rb.alternate_name));
        dir->link_remote_inode(dn, rb.ino, rb.d_type);
        dn->set_version(rb.dnv);
        if (rb.dirty) dn->_mark_dirty(logseg);
        dout(10) << "EMetaBlob.replay for [" << rb.dnfirst << "," << rb.dnlast << "] had " << *dn << dendl;
        dn->first = rb.dnfirst;
        ceph_assert(dn->last == rb.dnlast);
      }
      if (lump.is_importing())
        dn->mark_auth();

      if (!(++count % mds->heartbeat_reset_grace()))
        mds->heartbeat_reset();
    }

    // null dentries
    for (const auto& nb : lump.get_dnull()) {
      CDentry *dn = dir->lookup_exact_snap(nb.dn, nb.dnlast);
      if (!dn) {
        dn = dir->add_null_dentry(nb.dn, nb.dnfirst, nb.dnlast);
        dn->set_version(nb.dnv);
        if (nb.dirty) dn->_mark_dirty(logseg);
        dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl;
      } else {
        dn->first = nb.dnfirst;
        if (!dn->get_linkage()->is_null()) {
          dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
          CInode *in = dn->get_linkage()->get_inode();
          // For renamed inode, We may call CInode::force_dirfrag() later.
          // CInode::force_dirfrag() doesn't work well when inode is detached
          // from the hierarchy.
          if (!renamed_diri || renamed_diri != in) {
            if (dn->get_linkage()->is_primary())
              unlinked[in] = dir;
            dir->unlink_inode(dn);
          }
        }
        dn->set_version(nb.dnv);
        if (nb.dirty) dn->_mark_dirty(logseg);
        dout(10) << "EMetaBlob.replay had " << *dn << dendl;
        ceph_assert(dn->last == nb.dnlast);
      }
      // remember the directory holding the null dentry; used as the rename
      // source dir below when the renamed inode was not cached beforehand
      olddir = dir;
      if (lump.is_importing())
        dn->mark_auth();

      // Make null dentries the first things we trim
      // NOTE(review): only the log message follows; no LRU call is made in
      // this function.
      dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl;

      if (!(++count % mds->heartbeat_reset_grace()))
        mds->heartbeat_reset();
    }
  }

  ceph_assert(g_conf()->mds_kill_journal_replay_at != 3);

  // directory rename: fix up subtrees now that all lumps are applied
  if (renamed_dirino) {
    if (renamed_diri) {
      // we had the inode before replaying: it must have been unlinked and
      // relinked above; the dir it was unlinked from is the old dir
      ceph_assert(unlinked.count(renamed_diri));
      ceph_assert(linked.count(renamed_diri));
      olddir = unlinked[renamed_diri];
    } else {
      // we imported a diri we haven't seen before
      renamed_diri = mds->mdcache->get_inode(renamed_dirino);
      ceph_assert(renamed_diri);  // it was in the metablob
    }

    if (olddir) {
      if (olddir->authority() != CDIR_AUTH_UNDEF &&
          renamed_diri->authority() == CDIR_AUTH_UNDEF) {
        ceph_assert(peerup); // auth to non-auth, must be peer prepare
        frag_vec_t leaves;
        renamed_diri->dirfragtree.get_leaves(leaves);
        for (const auto& leaf : leaves) {
          CDir *dir = renamed_diri->get_dirfrag(leaf);
          ceph_assert(dir);
          if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
            // preserve subtree bound until peer commit
            peerup->olddirs.insert(dir->inode);
          else
            dir->state_set(CDir::STATE_AUTH);

          if (!(++count % mds->heartbeat_reset_grace()))
            mds->heartbeat_reset();
        }
      }

      mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);

      // see if we can discard the subtree we renamed out of
      CDir *root = mds->mdcache->get_subtree_root(olddir);
      if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
        if (peerup) // preserve the old dir until peer commit
          peerup->olddirs.insert(olddir->inode);
        else
          mds->mdcache->try_trim_non_auth_subtree(root);
      }
    }

    // if we are the srci importer, we'll also have some dirfrags we have to open up...
    if (renamed_diri->authority() != CDIR_AUTH_UNDEF) {
      for (const auto& p : renamed_dir_frags) {
        CDir *dir = renamed_diri->get_dirfrag(p);
        if (dir) {
          // we already had the inode before, and we already adjusted this subtree accordingly.
          dout(10) << " already had+adjusted rename import bound " << *dir << dendl;
          ceph_assert(olddir);
          continue;
        }
        dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, p);
        dout(10) << " creating new rename import bound " << *dir << dendl;
        dir->state_clear(CDir::STATE_AUTH);
        mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);

        if (!(++count % mds->heartbeat_reset_grace()))
          mds->heartbeat_reset();
      }
    }

    // rename may overwrite an empty directory and move it into stray dir.
    // (every other inode that was unlinked and relinked is asserted to be a
    // directory and gets the same subtree adjustment)
    unlinked.erase(renamed_diri);
    for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
      if (!linked.count(p->first))
        continue;
      ceph_assert(p->first->is_dir());
      mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);

      if (!(++count % mds->heartbeat_reset_grace()))
        mds->heartbeat_reset();
    }
  }

  // Whatever is still in `unlinked` after removing the relinked inodes was
  // unlinked for good: hand it to peerup if present, otherwise remove it.
  if (!unlinked.empty()) {
    for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p)
      unlinked.erase(*p);
    dout(10) << " unlinked set contains " << unlinked << dendl;
    for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
      CInode *in = p->first;
      if (peerup) { // preserve unlinked inodes until peer commit
        peerup->unlinked.insert(in);
        if (in->snaprealm)
          in->snaprealm->adjust_parent();
      } else
        mds->mdcache->remove_inode_recursive(in);

      if (!(++count % mds->heartbeat_reset_grace()))
        mds->heartbeat_reset();
    }
  }

  // table client transactions
  for (const auto& p : table_tids) {
    dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p.first)
             << " transaction " << p.second << dendl;
    MDSTableClient *client = mds->get_table_client(p.first);
    if (client)
      client->got_journaled_agree(p.second, logseg);

    if (!(++count % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }

  // opened ino?
  if (opened_ino) {
    CInode *in = mds->mdcache->get_inode(opened_ino);
    ceph_assert(in);
    dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl;
    logseg->open_files.push_back(&in->item_open_file);
  }

  // allocated_inos
  // Only applied when the in-memory inotable is older than the journaled
  // version; afterwards the table version must equal inotablev.
  if (inotablev) {
    if (mds->inotable->get_version() >= inotablev) {
      dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
               << " <= table " << mds->inotable->get_version() << dendl;
    } else {
      dout(10) << "EMetaBlob.replay inotable v " << inotablev
               << " - 1 == table " << mds->inotable->get_version()
               << " allocated+used " << allocated_ino
               << " prealloc " << preallocated_inos
               << dendl;
      if (allocated_ino)
        mds->inotable->replay_alloc_id(allocated_ino);
      if (preallocated_inos.size())
        mds->inotable->replay_alloc_ids(preallocated_inos);

      // repair inotable updates in case inotable wasn't persist in time
      if (inotablev > mds->inotable->get_version()) {
        mds->clog->error() << "journal replay inotablev mismatch "
            << mds->inotable->get_version() << " -> " << inotablev
            << ", will force replay it.";
        mds->inotable->force_replay_version(inotablev);
      }

      ceph_assert(inotablev == mds->inotable->get_version());
    }
  }
  // sessionmap: same scheme as the inotable above, keyed on sessionmapv
  if (sessionmapv) {
    if (mds->sessionmap.get_version() >= sessionmapv) {
      dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
               << " <= table " << mds->sessionmap.get_version() << dendl;
    } else {
      dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
               << ", table " << mds->sessionmap.get_version()
               << " prealloc " << preallocated_inos
               << " used " << used_preallocated_ino
               << dendl;
      Session *session = mds->sessionmap.get_session(client_name);
      if (session) {
        dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl;
        if (used_preallocated_ino) {
          if (!session->info.prealloc_inos.empty()) {
            inodeno_t ino = session->take_ino(used_preallocated_ino);
            session->info.prealloc_inos.erase(ino);
            ceph_assert(ino == used_preallocated_ino);
          }
          mds->sessionmap.replay_dirty_session(session);
        }
        if (!preallocated_inos.empty()) {
          session->free_prealloc_inos.insert(preallocated_inos);
          session->info.prealloc_inos.insert(preallocated_inos);
          mds->sessionmap.replay_dirty_session(session);
        }
      } else {
        // no session to update: advance the version once for each change the
        // session branch above would have marked dirty
        dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
        if (used_preallocated_ino)
          mds->sessionmap.replay_advance_version();

        if (!preallocated_inos.empty())
          mds->sessionmap.replay_advance_version();
      }

      // repair sessionmap updates in case sessionmap wasn't persist in time
      if (sessionmapv > mds->sessionmap.get_version()) {
        mds->clog->error() << "EMetaBlob.replay sessionmapv mismatch "
            << sessionmapv << " -> " << mds->sessionmap.get_version()
            << ", will force replay it.";
        if (g_conf()->mds_wipe_sessions) {
          mds->sessionmap.wipe();
        }
        // force replay sessionmap version
        mds->sessionmap.set_version(sessionmapv);
      }
      ceph_assert(sessionmapv == mds->sessionmap.get_version());
    }
  }

  // truncating inodes
  for (const auto& ino : truncate_start) {
    CInode *in = mds->mdcache->get_inode(ino);
    ceph_assert(in);
    mds->mdcache->add_recovered_truncate(in, logseg);

    if (!(++count % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }
  // finished truncates: p.first is the inode, p.second identifies the segment
  // passed to get_segment(); entries whose segment is not found are skipped
  for (const auto& p : truncate_finish) {
    LogSegment *ls = mds->mdlog->get_segment(p.second);
    if (ls) {
      CInode *in = mds->mdcache->get_inode(p.first);
      ceph_assert(in);
      mds->mdcache->remove_recovered_truncate(in, ls);
    }

    if (!(++count % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }

  // destroyed inodes
  if (!destroyed_inodes.empty()) {
    for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
         p != destroyed_inodes.end();
         ++p) {
      CInode *in = mds->mdcache->get_inode(*p);
      if (in) {
        dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
        // fetch the parent dentry before the inode is removed from cache
        CDentry *parent = in->get_parent_dn();
        mds->mdcache->remove_inode(in);
        if (parent) {
          dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
          ceph_assert(parent->get_linkage()->is_null());
        }
      } else {
        dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
      }

      if (!(++count % mds->heartbeat_reset_grace()))
        mds->heartbeat_reset();
    }
    mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes);
  }

  // client requests
  for (const auto& p : client_reqs) {
    if (p.first.name.is_client()) {
      dout(10) << "EMetaBlob.replay request " << p.first << " trim_to " << p.second << dendl;
      inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino;
      // if we allocated an inode, there should be exactly one client request id.
      ceph_assert(created == inodeno_t() || client_reqs.size() == 1);

      Session *session = mds->sessionmap.get_session(p.first.name);
      if (session) {
        session->add_completed_request(p.first.tid, created);
        if (p.second)
          session->trim_completed_requests(p.second);
      }
    }

    if (!(++count % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }

  // client flushes
  for (const auto& p : client_flushes) {
    if (p.first.name.is_client()) {
      dout(10) << "EMetaBlob.replay flush " << p.first << " trim_to " << p.second << dendl;
      Session *session = mds->sessionmap.get_session(p.first.name);
      if (session) {
        session->add_completed_flush(p.first.tid);
        if (p.second)
          session->trim_completed_flushes(p.second);
      }
    }

    if (!(++count % mds->heartbeat_reset_grace()))
      mds->heartbeat_reset();
  }

  // update segment
  update_segment(logseg);

  ceph_assert(g_conf()->mds_kill_journal_replay_at != 4);
}
1733
1734 // -----------------------
1735 // EPurged
1736 void EPurged::update_segment()
1737 {
1738 if (inos.size() && inotablev)
1739 get_segment()->inotablev = inotablev;
1740 return;
1741 }
1742
1743 void EPurged::replay(MDSRank *mds)
1744 {
1745 if (inos.size()) {
1746 LogSegment *ls = mds->mdlog->get_segment(seq);
1747 if (ls)
1748 ls->purging_inodes.subtract(inos);
1749
1750 if (mds->inotable->get_version() >= inotablev) {
1751 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1752 << " >= " << inotablev << ", noop" << dendl;
1753 } else {
1754 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1755 << " < " << inotablev << " " << dendl;
1756 mds->inotable->replay_release_ids(inos);
1757 ceph_assert(mds->inotable->get_version() == inotablev);
1758 }
1759 }
1760 update_segment();
1761 }
1762
// Serialize this event into `bl` inside an ENCODE_START(1, 1, ...) /
// ENCODE_FINISH envelope. Fields are written as inos, inotablev, seq; the
// order must match EPurged::decode. `features` is not referenced here.
void EPurged::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(1, 1, bl);
  encode(inos, bl);
  encode(inotablev, bl);
  encode(seq, bl);
  ENCODE_FINISH(bl);
}
1771
// Deserialize this event from `bl`. Reads inos, inotablev, seq in the same
// order EPurged::encode writes them.
void EPurged::decode(bufferlist::const_iterator& bl)
{
  DECODE_START(1, bl);
  decode(inos, bl);
  decode(inotablev, bl);
  decode(seq, bl);
  DECODE_FINISH(bl);
}
1780
// Write the event's three fields (purged inos, inotable version, segment
// sequence number) to the formatter.
void EPurged::dump(Formatter *f) const
{
  f->dump_stream("inos") << inos;
  f->dump_int("inotable version", inotablev);
  f->dump_int("segment seq", seq);
}
1787
1788 // -----------------------
1789 // ESession
1790
1791 void ESession::update_segment()
1792 {
1793 get_segment()->sessionmapv = cmapv;
1794 if (inos_to_free.size() && inotablev)
1795 get_segment()->inotablev = inotablev;
1796 }
1797
1798 void ESession::replay(MDSRank *mds)
1799 {
1800 if (inos_to_purge.size())
1801 get_segment()->purging_inodes.insert(inos_to_purge);
1802
1803 if (mds->sessionmap.get_version() >= cmapv) {
1804 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1805 << " >= " << cmapv << ", noop" << dendl;
1806 } else if (mds->sessionmap.get_version() + 1 == cmapv) {
1807 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1808 << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
1809 Session *session;
1810 if (open) {
1811 session = mds->sessionmap.get_or_add_session(client_inst);
1812 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1813 session->set_client_metadata(client_metadata);
1814 dout(10) << " opened session " << session->info.inst << dendl;
1815 } else {
1816 session = mds->sessionmap.get_session(client_inst.name);
1817 if (session) { // there always should be a session, but there's a bug
1818 if (session->get_connection() == NULL) {
1819 dout(10) << " removed session " << session->info.inst << dendl;
1820 mds->sessionmap.remove_session(session);
1821 session = NULL;
1822 } else {
1823 session->clear(); // the client has reconnected; keep the Session, but reset
1824 dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
1825 }
1826 } else {
1827 mds->clog->error() << "replayed stray Session close event for " << client_inst
1828 << " from time " << stamp << ", ignoring";
1829 }
1830 }
1831 if (session) {
1832 mds->sessionmap.replay_dirty_session(session);
1833 } else {
1834 mds->sessionmap.replay_advance_version();
1835 }
1836 ceph_assert(mds->sessionmap.get_version() == cmapv);
1837 } else {
1838 mds->clog->error() << "ESession.replay sessionmap v " << cmapv
1839 << " - 1 > table " << mds->sessionmap.get_version();
1840 ceph_assert(g_conf()->mds_wipe_sessions);
1841 mds->sessionmap.wipe();
1842 mds->sessionmap.set_version(cmapv);
1843 }
1844
1845 if (inos_to_free.size() && inotablev) {
1846 if (mds->inotable->get_version() >= inotablev) {
1847 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1848 << " >= " << inotablev << ", noop" << dendl;
1849 } else {
1850 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1851 << " < " << inotablev << " " << (open ? "add":"remove") << dendl;
1852 ceph_assert(!open); // for now
1853 mds->inotable->replay_release_ids(inos_to_free);
1854 ceph_assert(mds->inotable->get_version() == inotablev);
1855 }
1856 }
1857
1858 update_segment();
1859 }
1860
// Serialize this event into `bl` inside an ENCODE_START(6, 5, ...) /
// ENCODE_FINISH envelope. Field order must stay in sync with
// ESession::decode. `features` is forwarded only to the client_inst encoder.
void ESession::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(6, 5, bl);
  encode(stamp, bl);
  encode(client_inst, bl, features);
  encode(open, bl);
  encode(cmapv, bl);
  encode(inos_to_free, bl);
  encode(inotablev, bl);
  encode(client_metadata, bl);
  encode(inos_to_purge, bl);
  ENCODE_FINISH(bl);
}
1874
// Deserialize this event from `bl`, honouring older encodings via struct_v:
//   - stamp is only present from struct_v 2 on;
//   - struct_v 4 carries just client_metadata.kv_map, struct_v >= 5 the whole
//     client_metadata, and earlier versions carry neither;
//   - inos_to_purge is only present from struct_v 6 on.
void ESession::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(client_inst, bl);
  decode(open, bl);
  decode(cmapv, bl);
  decode(inos_to_free, bl);
  decode(inotablev, bl);
  if (struct_v == 4) {
    decode(client_metadata.kv_map, bl);
  } else if (struct_v >= 5) {
    decode(client_metadata, bl);
  }
  if (struct_v >= 6){
    decode(inos_to_purge, bl);
  }

  DECODE_FINISH(bl);
}
1896
1897 void ESession::dump(Formatter *f) const
1898 {
1899 f->dump_stream("client instance") << client_inst;
1900 f->dump_string("open", open ? "true" : "false");
1901 f->dump_int("client map version", cmapv);
1902 f->dump_stream("inos_to_free") << inos_to_free;
1903 f->dump_int("inotable version", inotablev);
1904 f->open_object_section("client_metadata");
1905 f->dump_stream("inos_to_purge") << inos_to_purge;
1906 client_metadata.dump(f);
1907 f->close_section(); // client_metadata
1908 }
1909
1910 void ESession::generate_test_instances(std::list<ESession*>& ls)
1911 {
1912 ls.push_back(new ESession);
1913 }
1914
1915 // -----------------------
1916 // ESessions
1917
// Serialize this event into `bl` inside an ENCODE_START(2, 1, ...) /
// ENCODE_FINISH envelope. Field order must match ESessions::decode_new.
// `features` is forwarded only to the client_map encoder.
void ESessions::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(2, 1, bl);
  encode(client_map, bl, features);
  encode(cmapv, bl);
  encode(stamp, bl);
  encode(client_metadata_map, bl);
  ENCODE_FINISH(bl);
}
1927
// Decode the un-enveloped layout (no DECODE_START/DECODE_FINISH): client_map
// and cmapv, followed by stamp only if any bytes remain in `bl`.
void ESessions::decode_old(bufferlist::const_iterator &bl)
{
  using ceph::decode;
  decode(client_map, bl);
  decode(cmapv, bl);
  if (!bl.end())
    decode(stamp, bl);
}
1936
// Decode the enveloped layout written by ESessions::encode.
// client_metadata_map is only present from struct_v 2 on.
void ESessions::decode_new(bufferlist::const_iterator &bl)
{
  DECODE_START(2, bl);
  decode(client_map, bl);
  decode(cmapv, bl);
  decode(stamp, bl);
  if (struct_v >= 2)
    decode(client_metadata_map, bl);
  DECODE_FINISH(bl);
}
1947
1948 void ESessions::dump(Formatter *f) const
1949 {
1950 f->dump_int("client map version", cmapv);
1951
1952 f->open_array_section("client map");
1953 for (map<client_t,entity_inst_t>::const_iterator i = client_map.begin();
1954 i != client_map.end(); ++i) {
1955 f->open_object_section("client");
1956 f->dump_int("client id", i->first.v);
1957 f->dump_stream("client entity") << i->second;
1958 f->close_section(); // client
1959 }
1960 f->close_section(); // client map
1961 }
1962
1963 void ESessions::generate_test_instances(std::list<ESessions*>& ls)
1964 {
1965 ls.push_back(new ESessions());
1966 }
1967
1968 void ESessions::update_segment()
1969 {
1970 get_segment()->sessionmapv = cmapv;
1971 }
1972
1973 void ESessions::replay(MDSRank *mds)
1974 {
1975 if (mds->sessionmap.get_version() >= cmapv) {
1976 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1977 << " >= " << cmapv << ", noop" << dendl;
1978 } else {
1979 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1980 << " < " << cmapv << dendl;
1981 mds->sessionmap.replay_open_sessions(cmapv, client_map, client_metadata_map);
1982 }
1983 update_segment();
1984 }
1985
1986
1987 // -----------------------
1988 // ETableServer
1989
// Serialize this table-server event using ENCODE_START(3, 3, ...).
// The features argument is not used by any field here.
// The field order must stay in step with ETableServer::decode().
void ETableServer::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  encode(stamp, bl);
  encode(table, bl);
  encode(op, bl);
  encode(reqid, bl);
  encode(bymds, bl);
  encode(mutation, bl);
  encode(tid, bl);
  encode(version, bl);
  ENCODE_FINISH(bl);
}
2003
// Decode a table-server event in the order written by encode().
// stamp is only present from struct version 2 onwards.
void ETableServer::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(table, bl);
  decode(op, bl);
  decode(reqid, bl);
  decode(bymds, bl);
  decode(mutation, bl);
  decode(tid, bl);
  decode(version, bl);
  DECODE_FINISH(bl);
}
2018
// Dump the scalar fields of this event as integers.
// stamp and the mutation payload are not included in the dump.
void ETableServer::dump(Formatter *f) const
{
  f->dump_int("table id", table);
  f->dump_int("op", op);
  f->dump_int("request id", reqid);
  f->dump_int("by mds", bymds);
  f->dump_int("tid", tid);
  f->dump_int("version", version);
}
2028
2029 void ETableServer::generate_test_instances(std::list<ETableServer*>& ls)
2030 {
2031 ls.push_back(new ETableServer());
2032 }
2033
2034
2035 void ETableServer::update_segment()
2036 {
2037 get_segment()->tablev[table] = version;
2038 }
2039
2040 void ETableServer::replay(MDSRank *mds)
2041 {
2042 MDSTableServer *server = mds->get_table_server(table);
2043 if (!server)
2044 return;
2045
2046 if (server->get_version() >= version) {
2047 dout(10) << "ETableServer.replay " << get_mdstable_name(table)
2048 << " " << get_mdstableserver_opname(op)
2049 << " event " << version
2050 << " <= table " << server->get_version() << dendl;
2051 return;
2052 }
2053
2054 dout(10) << " ETableServer.replay " << get_mdstable_name(table)
2055 << " " << get_mdstableserver_opname(op)
2056 << " event " << version << " - 1 == table " << server->get_version() << dendl;
2057 ceph_assert(version-1 == server->get_version());
2058
2059 switch (op) {
2060 case TABLESERVER_OP_PREPARE: {
2061 server->_note_prepare(bymds, reqid, true);
2062 bufferlist out;
2063 server->_prepare(mutation, reqid, bymds, out);
2064 mutation = std::move(out);
2065 break;
2066 }
2067 case TABLESERVER_OP_COMMIT:
2068 server->_commit(tid, ref_t<MMDSTableRequest>());
2069 server->_note_commit(tid, true);
2070 break;
2071 case TABLESERVER_OP_ROLLBACK:
2072 server->_rollback(tid);
2073 server->_note_rollback(tid, true);
2074 break;
2075 case TABLESERVER_OP_SERVER_UPDATE:
2076 server->_server_update(mutation);
2077 server->_note_server_update(mutation, true);
2078 break;
2079 default:
2080 mds->clog->error() << "invalid tableserver op in ETableServer";
2081 mds->damaged();
2082 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2083 }
2084
2085 ceph_assert(version == server->get_version());
2086 update_segment();
2087 }
2088
2089
2090 // ---------------------
2091 // ETableClient
2092
// Serialize this table-client event using ENCODE_START(3, 3, ...).
// The features argument is not used by any field here.
void ETableClient::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  encode(stamp, bl);
  encode(table, bl);
  encode(op, bl);
  encode(tid, bl);
  ENCODE_FINISH(bl);
}
2102
// Decode a table-client event in the order written by encode().
// stamp is only present from struct version 2 onwards.
void ETableClient::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(table, bl);
  decode(op, bl);
  decode(tid, bl);
  DECODE_FINISH(bl);
}
2113
// Dump table, op and tid as integers; stamp is not included.
void ETableClient::dump(Formatter *f) const
{
  f->dump_int("table", table);
  f->dump_int("op", op);
  f->dump_int("tid", tid);
}
2120
2121 void ETableClient::generate_test_instances(std::list<ETableClient*>& ls)
2122 {
2123 ls.push_back(new ETableClient());
2124 }
2125
2126 void ETableClient::replay(MDSRank *mds)
2127 {
2128 dout(10) << " ETableClient.replay " << get_mdstable_name(table)
2129 << " op " << get_mdstableserver_opname(op)
2130 << " tid " << tid << dendl;
2131
2132 MDSTableClient *client = mds->get_table_client(table);
2133 if (!client)
2134 return;
2135
2136 ceph_assert(op == TABLESERVER_OP_ACK);
2137 client->got_journaled_ack(tid);
2138 }
2139
2140
2141 // -----------------------
2142 // ESnap
2143 /*
2144 void ESnap::update_segment()
2145 {
2146 get_segment()->tablev[TABLE_SNAP] = version;
2147 }
2148
2149 void ESnap::replay(MDSRank *mds)
2150 {
2151 if (mds->snaptable->get_version() >= version) {
2152 dout(10) << "ESnap.replay event " << version
2153 << " <= table " << mds->snaptable->get_version() << dendl;
2154 return;
2155 }
2156
2157 dout(10) << " ESnap.replay event " << version
2158 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2159 ceph_assert(version-1 == mds->snaptable->get_version());
2160
2161 if (create) {
2162 version_t v;
2163 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2164 ceph_assert(s == snap.snapid);
2165 } else {
2166 mds->snaptable->remove(snap.snapid);
2167 }
2168
2169 ceph_assert(version == mds->snaptable->get_version());
2170 }
2171 */
2172
2173
2174
2175 // -----------------------
2176 // EUpdate
2177
// Serialize this update event using ENCODE_START(4, 4, ...).
// Only metablob is encoded with the caller-supplied feature bits.
// The field order must stay in step with EUpdate::decode().
void EUpdate::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(4, 4, bl);
  encode(stamp, bl);
  encode(type, bl);
  encode(metablob, bl, features);
  encode(client_map, bl);
  encode(cmapv, bl);
  encode(reqid, bl);
  encode(had_peers, bl);
  ENCODE_FINISH(bl);
}
2190
// Decode an update event in the order written by encode().
// stamp is present from struct version 2 onwards and cmapv from version 3.
void EUpdate::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(type, bl);
  decode(metablob, bl);
  decode(client_map, bl);
  if (struct_v >= 3)
    decode(cmapv, bl);
  decode(reqid, bl);
  decode(had_peers, bl);
  DECODE_FINISH(bl);
}
2205
// Dump the metablob as a nested object, followed by the event's scalar
// fields.  For client_map only the encoded length is reported, not its
// contents.
void EUpdate::dump(Formatter *f) const
{
  f->open_object_section("metablob");
  metablob.dump(f);
  f->close_section(); // metablob

  f->dump_string("type", type);
  f->dump_int("client map length", client_map.length());
  f->dump_int("client map version", cmapv);
  f->dump_stream("reqid") << reqid;
  f->dump_string("had peers", had_peers ? "true" : "false");
}
2218
2219 void EUpdate::generate_test_instances(std::list<EUpdate*>& ls)
2220 {
2221 ls.push_back(new EUpdate());
2222 }
2223
2224
2225 void EUpdate::update_segment()
2226 {
2227 auto&& segment = get_segment();
2228 metablob.update_segment(segment);
2229
2230 if (client_map.length())
2231 segment->sessionmapv = cmapv;
2232
2233 if (had_peers)
2234 segment->uncommitted_leaders.insert(reqid);
2235 }
2236
2237 void EUpdate::replay(MDSRank *mds)
2238 {
2239 auto&& segment = get_segment();
2240 metablob.replay(mds, segment);
2241
2242 if (had_peers) {
2243 dout(10) << "EUpdate.replay " << reqid << " had peers, expecting a matching ECommitted" << dendl;
2244 segment->uncommitted_leaders.insert(reqid);
2245 set<mds_rank_t> peers;
2246 mds->mdcache->add_uncommitted_leader(reqid, segment, peers, true);
2247 }
2248
2249 if (client_map.length()) {
2250 if (mds->sessionmap.get_version() >= cmapv) {
2251 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2252 << " <= table " << mds->sessionmap.get_version() << dendl;
2253 } else {
2254 dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version()
2255 << " < " << cmapv << dendl;
2256 // open client sessions?
2257 map<client_t,entity_inst_t> cm;
2258 map<client_t,client_metadata_t> cmm;
2259 auto blp = client_map.cbegin();
2260 using ceph::decode;
2261 decode(cm, blp);
2262 if (!blp.end())
2263 decode(cmm, blp);
2264 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
2265 }
2266 }
2267 update_segment();
2268 }
2269
2270
2271 // ------------------------
2272 // EOpen
2273
// Serialize this open event using ENCODE_START(4, 3, ...).
// Only metablob is encoded with the caller-supplied feature bits.
// snap_inos is the field EOpen::decode() reads under struct_v >= 4.
void EOpen::encode(bufferlist &bl, uint64_t features) const {
  ENCODE_START(4, 3, bl);
  encode(stamp, bl);
  encode(metablob, bl, features);
  encode(inos, bl);
  encode(snap_inos, bl);
  ENCODE_FINISH(bl);
}
2282
2283 void EOpen::decode(bufferlist::const_iterator &bl) {
2284 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2285 if (struct_v >= 2)
2286 decode(stamp, bl);
2287 decode(metablob, bl);
2288 decode(inos, bl);
2289 if (struct_v >= 4)
2290 decode(snap_inos, bl);
2291 DECODE_FINISH(bl);
2292 }
2293
2294 void EOpen::dump(Formatter *f) const
2295 {
2296 f->open_object_section("metablob");
2297 metablob.dump(f);
2298 f->close_section(); // metablob
2299 f->open_array_section("inos involved");
2300 for (vector<inodeno_t>::const_iterator i = inos.begin();
2301 i != inos.end(); ++i) {
2302 f->dump_int("ino", *i);
2303 }
2304 f->close_section(); // inos
2305 }
2306
2307 void EOpen::generate_test_instances(std::list<EOpen*>& ls)
2308 {
2309 ls.push_back(new EOpen());
2310 ls.push_back(new EOpen());
2311 ls.back()->add_ino(0);
2312 }
2313
// Currently does nothing: no state is recorded on the log segment here.
// The "??" below is the original author's marker and is left in place.
void EOpen::update_segment()
{
  // ??
}
2318
2319 void EOpen::replay(MDSRank *mds)
2320 {
2321 dout(10) << "EOpen.replay " << dendl;
2322 auto&& segment = get_segment();
2323 metablob.replay(mds, segment);
2324
2325 // note which segments inodes belong to, so we don't have to start rejournaling them
2326 for (const auto &ino : inos) {
2327 CInode *in = mds->mdcache->get_inode(ino);
2328 if (!in) {
2329 dout(0) << "EOpen.replay ino " << ino << " not in metablob" << dendl;
2330 ceph_assert(in);
2331 }
2332 segment->open_files.push_back(&in->item_open_file);
2333 }
2334 for (const auto &vino : snap_inos) {
2335 CInode *in = mds->mdcache->get_inode(vino);
2336 if (!in) {
2337 dout(0) << "EOpen.replay ino " << vino << " not in metablob" << dendl;
2338 ceph_assert(in);
2339 }
2340 segment->open_files.push_back(&in->item_open_file);
2341 }
2342 }
2343
2344
2345 // -----------------------
2346 // ECommitted
2347
2348 void ECommitted::replay(MDSRank *mds)
2349 {
2350 if (mds->mdcache->uncommitted_leaders.count(reqid)) {
2351 dout(10) << "ECommitted.replay " << reqid << dendl;
2352 mds->mdcache->uncommitted_leaders[reqid].ls->uncommitted_leaders.erase(reqid);
2353 mds->mdcache->uncommitted_leaders.erase(reqid);
2354 } else {
2355 dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl;
2356 }
2357 }
2358
// Serialize this commit marker using ENCODE_START(3, 3, ...).
// The features argument is not used by any field here.
void ECommitted::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  encode(stamp, bl);
  encode(reqid, bl);
  ENCODE_FINISH(bl);
}
2366
// Decode a commit marker in the order written by encode().
// stamp is only present from struct version 2 onwards.
void ECommitted::decode(bufferlist::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(reqid, bl);
  DECODE_FINISH(bl);
}
2375
// Dump both fields (stamp and reqid) via their stream operators.
void ECommitted::dump(Formatter *f) const {
  f->dump_stream("stamp") << stamp;
  f->dump_stream("reqid") << reqid;
}
2380
2381 void ECommitted::generate_test_instances(std::list<ECommitted*>& ls)
2382 {
2383 ls.push_back(new ECommitted);
2384 ls.push_back(new ECommitted);
2385 ls.back()->stamp = utime_t(1, 2);
2386 ls.back()->reqid = metareqid_t(entity_name_t::CLIENT(123), 456);
2387 }
2388
2389 // -----------------------
2390 // EPeerUpdate
2391
// Serialize this link rollback record using ENCODE_START(3, 2, ...).
// The field order must stay in step with link_rollback::decode().
void link_rollback::encode(bufferlist &bl) const
{
  ENCODE_START(3, 2, bl);
  encode(reqid, bl);
  encode(ino, bl);
  encode(was_inc, bl);
  encode(old_ctime, bl);
  encode(old_dir_mtime, bl);
  encode(old_dir_rctime, bl);
  encode(snapbl, bl);  // read back under struct_v >= 3 in decode()
  ENCODE_FINISH(bl);
}
2404
// Decode a link rollback record in the order written by encode().
// snapbl is only present from struct version 3 onwards.
void link_rollback::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
  decode(reqid, bl);
  decode(ino, bl);
  decode(was_inc, bl);
  decode(old_ctime, bl);
  decode(old_dir_mtime, bl);
  decode(old_dir_rctime, bl);
  if (struct_v >= 3)
    decode(snapbl, bl);
  DECODE_FINISH(bl);
}
2418
// Dump the request id, inode, increment flag and the three saved
// timestamps.  snapbl is not included in the dump.
void link_rollback::dump(Formatter *f) const
{
  f->dump_stream("metareqid") << reqid;
  f->dump_int("ino", ino);
  f->dump_string("was incremented", was_inc ? "true" : "false");
  f->dump_stream("old_ctime") << old_ctime;
  f->dump_stream("old_dir_mtime") << old_dir_mtime;
  f->dump_stream("old_dir_rctime") << old_dir_rctime;
}
2428
2429 void link_rollback::generate_test_instances(std::list<link_rollback*>& ls)
2430 {
2431 ls.push_back(new link_rollback());
2432 }
2433
// Serialize this rmdir rollback record using ENCODE_START(3, 2, ...).
// The field order must stay in step with rmdir_rollback::decode().
void rmdir_rollback::encode(bufferlist& bl) const
{
  ENCODE_START(3, 2, bl);
  encode(reqid, bl);
  encode(src_dir, bl);
  encode(src_dname, bl);
  encode(dest_dir, bl);
  encode(dest_dname, bl);
  encode(snapbl, bl);  // read back under struct_v >= 3 in decode()
  ENCODE_FINISH(bl);
}
2445
// Decode an rmdir rollback record in the order written by encode().
// snapbl is only present from struct version 3 onwards.
void rmdir_rollback::decode(bufferlist::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
  decode(reqid, bl);
  decode(src_dir, bl);
  decode(src_dname, bl);
  decode(dest_dir, bl);
  decode(dest_dname, bl);
  if (struct_v >= 3)
    decode(snapbl, bl);
  DECODE_FINISH(bl);
}
2458
// Dump the request id plus the source and destination directory/dname
// pairs.  snapbl is not included in the dump.
void rmdir_rollback::dump(Formatter *f) const
{
  f->dump_stream("metareqid") << reqid;
  f->dump_stream("source directory") << src_dir;
  f->dump_string("source dname", src_dname);
  f->dump_stream("destination directory") << dest_dir;
  f->dump_string("destination dname", dest_dname);
}
2467
2468 void rmdir_rollback::generate_test_instances(std::list<rmdir_rollback*>& ls)
2469 {
2470 ls.push_back(new rmdir_rollback());
2471 }
2472
// Serialize one dentry record using ENCODE_START(2, 2, ...).
// The field order must stay in step with rename_rollback::drec::decode().
void rename_rollback::drec::encode(bufferlist &bl) const
{
  ENCODE_START(2, 2, bl);
  encode(dirfrag, bl);
  encode(dirfrag_old_mtime, bl);
  encode(dirfrag_old_rctime, bl);
  encode(ino, bl);
  encode(remote_ino, bl);
  encode(dname, bl);
  encode(remote_d_type, bl);
  encode(old_ctime, bl);
  ENCODE_FINISH(bl);
}
2486
// Decode one dentry record in the order written by encode().
// No field is version-gated.
void rename_rollback::drec::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  decode(dirfrag, bl);
  decode(dirfrag_old_mtime, bl);
  decode(dirfrag_old_rctime, bl);
  decode(ino, bl);
  decode(remote_ino, bl);
  decode(dname, bl);
  decode(remote_d_type, bl);
  decode(old_ctime, bl);
  DECODE_FINISH(bl);
}
2500
2501 void rename_rollback::drec::dump(Formatter *f) const
2502 {
2503 f->dump_stream("directory fragment") << dirfrag;
2504 f->dump_stream("directory old mtime") << dirfrag_old_mtime;
2505 f->dump_stream("directory old rctime") << dirfrag_old_rctime;
2506 f->dump_int("ino", ino);
2507 f->dump_int("remote ino", remote_ino);
2508 f->dump_string("dname", dname);
2509 uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries
2510 string type_string;
2511 switch(type) {
2512 case S_IFREG:
2513 type_string = "file"; break;
2514 case S_IFLNK:
2515 type_string = "symlink"; break;
2516 case S_IFDIR:
2517 type_string = "directory"; break;
2518 default:
2519 type_string = "UNKNOWN-" + stringify((int)type); break;
2520 }
2521 f->dump_string("remote dtype", type_string);
2522 f->dump_stream("old ctime") << old_ctime;
2523 }
2524
2525 void rename_rollback::drec::generate_test_instances(std::list<drec*>& ls)
2526 {
2527 ls.push_back(new drec());
2528 ls.back()->remote_d_type = IFTODT(S_IFREG);
2529 }
2530
// Serialize this rename rollback record using ENCODE_START(3, 2, ...).
// The field order must stay in step with rename_rollback::decode().
void rename_rollback::encode(bufferlist &bl) const
{
  ENCODE_START(3, 2, bl);
  encode(reqid, bl);
  encode(orig_src, bl);
  encode(orig_dest, bl);
  encode(stray, bl);
  encode(ctime, bl);
  // the two snap blobs are read back under struct_v >= 3 in decode()
  encode(srci_snapbl, bl);
  encode(desti_snapbl, bl);
  ENCODE_FINISH(bl);
}
2543
// Decode a rename rollback record in the order written by encode().
// srci_snapbl and desti_snapbl are only present from struct version 3.
void rename_rollback::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
  decode(reqid, bl);
  decode(orig_src, bl);
  decode(orig_dest, bl);
  decode(stray, bl);
  decode(ctime, bl);
  if (struct_v >= 3) {
    decode(srci_snapbl, bl);
    decode(desti_snapbl, bl);
  }
  DECODE_FINISH(bl);
}
2558
// Dump the request id, the three dentry records (each as its own nested
// object) and ctime.  The two snap blobs are not included in the dump.
void rename_rollback::dump(Formatter *f) const
{
  f->dump_stream("request id") << reqid;
  f->open_object_section("original src drec");
  orig_src.dump(f);
  f->close_section(); // original src drec
  f->open_object_section("original dest drec");
  orig_dest.dump(f);
  f->close_section(); // original dest drec
  f->open_object_section("stray drec");
  stray.dump(f);
  f->close_section(); // stray drec
  f->dump_stream("ctime") << ctime;
}
2573
2574 void rename_rollback::generate_test_instances(std::list<rename_rollback*>& ls)
2575 {
2576 ls.push_back(new rename_rollback());
2577 ls.back()->orig_src.remote_d_type = IFTODT(S_IFREG);
2578 ls.back()->orig_dest.remote_d_type = IFTODT(S_IFREG);
2579 ls.back()->stray.remote_d_type = IFTODT(S_IFREG);
2580 }
2581
// Serialize this peer update event using ENCODE_START(3, 3, ...).
// Only the commit blob is encoded with the caller-supplied feature bits.
// The field order must stay in step with EPeerUpdate::decode().
void EPeerUpdate::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  encode(stamp, bl);
  encode(type, bl);
  encode(reqid, bl);
  encode(leader, bl);
  encode(op, bl);
  encode(origop, bl);
  encode(commit, bl, features);
  encode(rollback, bl);
  ENCODE_FINISH(bl);
}
2595
// Decode a peer update event in the order written by encode().
// stamp is only present from struct version 2 onwards.
void EPeerUpdate::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(type, bl);
  decode(reqid, bl);
  decode(leader, bl);
  decode(op, bl);
  decode(origop, bl);
  decode(commit, bl);
  decode(rollback, bl);
  DECODE_FINISH(bl);
}
2610
// Dump the commit blob as a nested "metablob" object, followed by the
// event's scalar fields.  For rollback only the encoded length is
// reported, not its contents.
void EPeerUpdate::dump(Formatter *f) const
{
  f->open_object_section("metablob");
  commit.dump(f);
  f->close_section(); // metablob

  f->dump_int("rollback length", rollback.length());
  f->dump_string("type", type);
  f->dump_stream("metareqid") << reqid;
  f->dump_int("leader", leader);
  f->dump_int("op", op);
  f->dump_int("original op", origop);
}
2624
2625 void EPeerUpdate::generate_test_instances(std::list<EPeerUpdate*>& ls)
2626 {
2627 ls.push_back(new EPeerUpdate());
2628 }
2629
2630 void EPeerUpdate::replay(MDSRank *mds)
2631 {
2632 MDPeerUpdate *su;
2633 auto&& segment = get_segment();
2634 switch (op) {
2635 case EPeerUpdate::OP_PREPARE:
2636 dout(10) << "EPeerUpdate.replay prepare " << reqid << " for mds." << leader
2637 << ": applying commit, saving rollback info" << dendl;
2638 su = new MDPeerUpdate(origop, rollback);
2639 commit.replay(mds, segment, su);
2640 mds->mdcache->add_uncommitted_peer(reqid, segment, leader, su);
2641 break;
2642
2643 case EPeerUpdate::OP_COMMIT:
2644 dout(10) << "EPeerUpdate.replay commit " << reqid << " for mds." << leader << dendl;
2645 mds->mdcache->finish_uncommitted_peer(reqid, false);
2646 break;
2647
2648 case EPeerUpdate::OP_ROLLBACK:
2649 dout(10) << "EPeerUpdate.replay abort " << reqid << " for mds." << leader
2650 << ": applying rollback commit blob" << dendl;
2651 commit.replay(mds, segment);
2652 mds->mdcache->finish_uncommitted_peer(reqid, false);
2653 break;
2654
2655 default:
2656 mds->clog->error() << "invalid op in EPeerUpdate";
2657 mds->damaged();
2658 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2659 }
2660 }
2661
2662
2663 // -----------------------
2664 // ESubtreeMap
2665
// Serialize this subtree map event using ENCODE_START(6, 5, ...).
// Only metablob is encoded with the caller-supplied feature bits.
// The field order must stay in step with ESubtreeMap::decode().
void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(6, 5, bl);
  encode(stamp, bl);
  encode(metablob, bl, features);
  encode(subtrees, bl);
  encode(ambiguous_subtrees, bl);
  encode(expire_pos, bl);
  encode(event_seq, bl);
  ENCODE_FINISH(bl);
}
2677
// Decode a subtree map event in the order written by encode().
// Version gates: stamp >= 2, ambiguous_subtrees >= 4, expire_pos >= 3,
// event_seq >= 6.  Note the reads follow the encoded field order, so the
// ">= 4" field is read before the ">= 3" one.
void ESubtreeMap::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(metablob, bl);
  decode(subtrees, bl);
  if (struct_v >= 4)
    decode(ambiguous_subtrees, bl);
  if (struct_v >= 3)
    decode(expire_pos, bl);
  if (struct_v >= 6)
    decode(event_seq, bl);
  DECODE_FINISH(bl);
}
2693
2694 void ESubtreeMap::dump(Formatter *f) const
2695 {
2696 f->open_object_section("metablob");
2697 metablob.dump(f);
2698 f->close_section(); // metablob
2699
2700 f->open_array_section("subtrees");
2701 for(map<dirfrag_t,vector<dirfrag_t> >::const_iterator i = subtrees.begin();
2702 i != subtrees.end(); ++i) {
2703 f->open_object_section("tree");
2704 f->dump_stream("root dirfrag") << i->first;
2705 for (vector<dirfrag_t>::const_iterator j = i->second.begin();
2706 j != i->second.end(); ++j) {
2707 f->dump_stream("bound dirfrag") << *j;
2708 }
2709 f->close_section(); // tree
2710 }
2711 f->close_section(); // subtrees
2712
2713 f->open_array_section("ambiguous subtrees");
2714 for(set<dirfrag_t>::const_iterator i = ambiguous_subtrees.begin();
2715 i != ambiguous_subtrees.end(); ++i) {
2716 f->dump_stream("dirfrag") << *i;
2717 }
2718 f->close_section(); // ambiguous subtrees
2719
2720 f->dump_int("expire position", expire_pos);
2721 }
2722
2723 void ESubtreeMap::generate_test_instances(std::list<ESubtreeMap*>& ls)
2724 {
2725 ls.push_back(new ESubtreeMap());
2726 }
2727
// Replay a subtree map event.
//
// First, the journaler's expire position is advanced if this event carries
// a later one.  Then one of two paths runs:
//  - If the cache already has subtrees, the journaled map is only verified
//    against the cache.  Every mismatch is reported to the cluster log and
//    counted; nothing in the journaled map is applied.
//  - Otherwise the metablob is replayed and the subtree auth state is
//    rebuilt from the journaled map.
void ESubtreeMap::replay(MDSRank *mds)
{
  // only ever move the expire position forward; 0 means "not set"
  if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos())
    mds->mdlog->journaler->set_expire_pos(expire_pos);

  // suck up the subtree map?
  if (mds->mdcache->is_subtrees()) {
    dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
    int errors = 0;

    // pass 1: every journaled subtree must match what the cache holds
    for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
         p != subtrees.end();
         ++p) {
      CDir *dir = mds->mdcache->get_dirfrag(p->first);
      if (!dir) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                          << " subtree root " << p->first << " not in cache";
        ++errors;
        continue;
      }

      if (!mds->mdcache->is_subtree(dir)) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                          << " subtree root " << p->first << " not a subtree in cache";
        ++errors;
        continue;
      }
      if (dir->get_dir_auth().first != mds->get_nodeid()) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                          << " subtree root " << p->first
                          << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
        ++errors;
        continue;
      }

      // called for every journaled bound before the cache's bounds are
      // collected below
      for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
        mds->mdcache->get_force_dirfrag(*q, true);

      // compare journaled bounds with cached bounds; each match is erased
      // from the set, so whatever is left afterwards exists only in the cache
      set<CDir*> bounds;
      mds->mdcache->get_subtree_bounds(dir, bounds);
      for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
        CDir *b = mds->mdcache->get_dirfrag(*q);
        if (!b) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                            << " subtree " << p->first << " bound " << *q << " not in cache";
          ++errors;
          continue;
        }
        if (bounds.count(b) == 0) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                            << " subtree " << p->first << " bound " << *q << " not a bound in cache";
          ++errors;
          continue;
        }
        bounds.erase(b);
      }
      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                          << " subtree " << p->first << " has extra bound in cache " << (*q)->dirfrag();
        ++errors;
      }

      // the journal and the cache must agree on whether the import is ambiguous
      if (ambiguous_subtrees.count(p->first)) {
        if (!mds->mdcache->have_ambiguous_import(p->first)) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                            << " subtree " << p->first << " is ambiguous but is not in our cache";
          ++errors;
        }
      } else {
        if (mds->mdcache->have_ambiguous_import(p->first)) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                            << " subtree " << p->first << " is not ambiguous but is in our cache";
          ++errors;
        }
      }
    }

    // pass 2: every cached subtree whose first auth is this rank must
    // appear in the journaled map
    std::vector<CDir*> dirs;
    mds->mdcache->get_subtrees(dirs);
    for (const auto& dir : dirs) {
      if (dir->get_dir_auth().first != mds->get_nodeid())
        continue;
      if (subtrees.count(dir->dirfrag()) == 0) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                          << " does not include cache subtree " << dir->dirfrag();
        ++errors;
      }
    }

    if (errors) {
      dout(0) << "journal subtrees: " << subtrees << dendl;
      dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
      mds->mdcache->show_subtrees();
      // errors is non-zero here, so this fires exactly when
      // mds_debug_subtrees is enabled
      ceph_assert(!g_conf()->mds_debug_subtrees || errors == 0);
    }
    return;
  }

  dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;

  // first, stick the spanning tree in my cache
  //metablob.print(*_dout);
  metablob.replay(mds, get_segment());

  // restore import/export maps
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = mds->mdcache->get_dirfrag(p->first);
    ceph_assert(dir);
    if (ambiguous_subtrees.count(p->first)) {
      // ambiguous!
      mds->mdcache->add_ambiguous_import(p->first, p->second);
      mds->mdcache->adjust_bounded_subtree_auth(dir, p->second,
                                                mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
    } else {
      // not ambiguous
      mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
    }
  }

  mds->mdcache->recalc_auth_bits(true);

  mds->mdcache->show_subtrees();
}
2853
2854
2855
2856 // -----------------------
2857 // EFragment
2858
// Replay a fragment event.
//
// The op-specific step runs first (register, roll back or finish the
// uncommitted fragment operation, refragmenting whatever is already
// cached), then the metablob is replayed.  An unknown op aborts.
void EFragment::replay(MDSRank *mds)
{
  dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;

  std::vector<CDir*> resultfrags;
  MDSContext::vec waiters;

  // in may be NULL if it wasn't in our cache yet.  if it's a prepare
  // it will be once we replay the metablob , but first we need to
  // refragment anything we already have in the cache.
  CInode *in = mds->mdcache->get_inode(ino);

  auto&& segment = get_segment();
  switch (op) {
  case OP_PREPARE:
    mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, segment, &rollback);

    if (in)
      mds->mdcache->adjust_dir_fragments(in, basefrag, bits, &resultfrags, waiters, true);
    break;

  case OP_ROLLBACK: {
    // leaves under basefrag as they are before the rollback is applied;
    // left empty when the inode is not cached
    frag_vec_t old_frags;
    if (in) {
      in->dirfragtree.get_leaves_under(basefrag, old_frags);
      if (orig_frags.empty()) {
        // old format EFragment
        mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, &resultfrags, waiters, true);
      } else {
        for (const auto& fg : orig_frags)
          mds->mdcache->force_dir_fragment(in, fg);
      }
    }
    mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), std::move(old_frags));
    break;
  }

  case OP_COMMIT:
  case OP_FINISH:
    mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
    break;

  default:
    ceph_abort();
  }

  metablob.replay(mds, segment);
  // in was looked up before the metablob replay, so this check only runs
  // when the inode was already cached at that point
  if (in && g_conf()->mds_debug_frag)
    in->verify_dirfrags();
}
2909
// Serialize this fragment event using ENCODE_START(5, 4, ...).
// Only metablob is encoded with the caller-supplied feature bits.
// The field order must stay in step with EFragment::decode().
void EFragment::encode(bufferlist &bl, uint64_t features) const {
  ENCODE_START(5, 4, bl);
  encode(stamp, bl);
  encode(op, bl);
  encode(ino, bl);
  encode(basefrag, bl);
  encode(bits, bl);
  encode(metablob, bl, features);
  // the next two fields are read back under struct_v >= 5 in decode()
  encode(orig_frags, bl);
  encode(rollback, bl);
  ENCODE_FINISH(bl);
}
2922
// Decode a fragment event in the order written by encode().
// Version gates: stamp >= 2, op >= 3, orig_frags and rollback >= 5.
void EFragment::decode(bufferlist::const_iterator &bl) {
  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
  if (struct_v >= 2)
    decode(stamp, bl);
  if (struct_v >= 3)
    decode(op, bl);
  decode(ino, bl);
  decode(basefrag, bl);
  decode(bits, bl);
  decode(metablob, bl);
  if (struct_v >= 5) {
    decode(orig_frags, bl);
    decode(rollback, bl);
  }
  DECODE_FINISH(bl);
}
2939
// Dump the op name, inode, base frag and bits.  The metablob dump is
// commented out below, so the metablob is not included.
void EFragment::dump(Formatter *f) const
{
  /*f->open_object_section("Metablob");
    metablob.dump(f); // sadly we don't have this; dunno if we'll get it
    f->close_section();*/
  f->dump_string("op", op_name(op));
  f->dump_stream("ino") << ino;
  f->dump_stream("base frag") << basefrag;
  f->dump_int("bits", bits);
}
2950
2951 void EFragment::generate_test_instances(std::list<EFragment*>& ls)
2952 {
2953 ls.push_back(new EFragment);
2954 ls.push_back(new EFragment);
2955 ls.back()->op = OP_PREPARE;
2956 ls.back()->ino = 1;
2957 ls.back()->bits = 5;
2958 }
2959
// Serialize the rollback record using ENCODE_START(1, 1, ...): the only
// payload is the fnode that the fnode member points to.
// NOTE(review): fnode is dereferenced without a null check here; confirm
// that callers always set it before encoding.
void dirfrag_rollback::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(*fnode, bl);
  ENCODE_FINISH(bl);
}
2966
// Decode the rollback record: a freshly allocated fnode is filled from bl
// and only then moved into the fnode member.
void dirfrag_rollback::decode(bufferlist::const_iterator &bl)
{
  DECODE_START(1, bl);
  {
    auto _fnode = CDir::allocate_fnode();
    decode(*_fnode, bl);
    fnode = std::move(_fnode);
  }
  DECODE_FINISH(bl);
}
2977
2978
2979
2980 // =========================================================================
2981
2982 // -----------------------
2983 // EExport
2984
2985 void EExport::replay(MDSRank *mds)
2986 {
2987 dout(10) << "EExport.replay " << base << dendl;
2988 auto&& segment = get_segment();
2989 metablob.replay(mds, segment);
2990
2991 CDir *dir = mds->mdcache->get_dirfrag(base);
2992 ceph_assert(dir);
2993
2994 set<CDir*> realbounds;
2995 for (set<dirfrag_t>::iterator p = bounds.begin();
2996 p != bounds.end();
2997 ++p) {
2998 CDir *bd = mds->mdcache->get_dirfrag(*p);
2999 ceph_assert(bd);
3000 realbounds.insert(bd);
3001 }
3002
3003 // adjust auth away
3004 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF);
3005
3006 mds->mdcache->try_trim_non_auth_subtree(dir);
3007 }
3008
// Serialize this export event using ENCODE_START(4, 3, ...).
// Only metablob is encoded with the caller-supplied feature bits.
// target is the field EExport::decode() reads under struct_v >= 4.
void EExport::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(4, 3, bl);
  encode(stamp, bl);
  encode(metablob, bl, features);
  encode(base, bl);
  encode(bounds, bl);
  encode(target, bl);
  ENCODE_FINISH(bl);
}
3019
3020 void EExport::decode(bufferlist::const_iterator &bl)
3021 {
3022 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3023 if (struct_v >= 2)
3024 decode(stamp, bl);
3025 decode(metablob, bl);
3026 decode(base, bl);
3027 decode(bounds, bl);
3028 if (struct_v >= 4)
3029 decode(target, bl);
3030 DECODE_FINISH(bl);
3031 }
3032
3033 void EExport::dump(Formatter *f) const
3034 {
3035 f->dump_float("stamp", (double)stamp);
3036 /*f->open_object_section("Metablob");
3037 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
3038 f->close_section();*/
3039 f->dump_stream("base dirfrag") << base;
3040 f->open_array_section("bounds dirfrags");
3041 for (set<dirfrag_t>::const_iterator i = bounds.begin();
3042 i != bounds.end(); ++i) {
3043 f->dump_stream("dirfrag") << *i;
3044 }
3045 f->close_section(); // bounds dirfrags
3046 }
3047
3048 void EExport::generate_test_instances(std::list<EExport*>& ls)
3049 {
3050 EExport *sample = new EExport();
3051 ls.push_back(sample);
3052 }
3053
3054
3055 // -----------------------
3056 // EImportStart
3057
3058 void EImportStart::update_segment()
3059 {
3060 get_segment()->sessionmapv = cmapv;
3061 }
3062
3063 void EImportStart::replay(MDSRank *mds)
3064 {
3065 dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
3066 //metablob.print(*_dout);
3067 auto&& segment = get_segment();
3068 metablob.replay(mds, segment);
3069
3070 // put in ambiguous import list
3071 mds->mdcache->add_ambiguous_import(base, bounds);
3072
3073 // set auth partially to us so we don't trim it
3074 CDir *dir = mds->mdcache->get_dirfrag(base);
3075 ceph_assert(dir);
3076
3077 set<CDir*> realbounds;
3078 for (vector<dirfrag_t>::iterator p = bounds.begin();
3079 p != bounds.end();
3080 ++p) {
3081 CDir *bd = mds->mdcache->get_dirfrag(*p);
3082 ceph_assert(bd);
3083 if (!bd->is_subtree_root())
3084 bd->state_clear(CDir::STATE_AUTH);
3085 realbounds.insert(bd);
3086 }
3087
3088 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds,
3089 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
3090
3091 // open client sessions?
3092 if (mds->sessionmap.get_version() >= cmapv) {
3093 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
3094 << " >= " << cmapv << ", noop" << dendl;
3095 } else {
3096 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
3097 << " < " << cmapv << dendl;
3098 map<client_t,entity_inst_t> cm;
3099 map<client_t,client_metadata_t> cmm;
3100 auto blp = client_map.cbegin();
3101 using ceph::decode;
3102 decode(cm, blp);
3103 if (!blp.end())
3104 decode(cmm, blp);
3105 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
3106 }
3107 update_segment();
3108 }
3109
3110 void EImportStart::encode(bufferlist &bl, uint64_t features) const {
3111 ENCODE_START(4, 3, bl);
3112 encode(stamp, bl);
3113 encode(base, bl);
3114 encode(metablob, bl, features);
3115 encode(bounds, bl);
3116 encode(cmapv, bl);
3117 encode(client_map, bl);
3118 encode(from, bl);
3119 ENCODE_FINISH(bl);
3120 }
3121
3122 void EImportStart::decode(bufferlist::const_iterator &bl) {
3123 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3124 if (struct_v >= 2)
3125 decode(stamp, bl);
3126 decode(base, bl);
3127 decode(metablob, bl);
3128 decode(bounds, bl);
3129 decode(cmapv, bl);
3130 decode(client_map, bl);
3131 if (struct_v >= 4)
3132 decode(from, bl);
3133 DECODE_FINISH(bl);
3134 }
3135
3136 void EImportStart::dump(Formatter *f) const
3137 {
3138 f->dump_stream("base dirfrag") << base;
3139 f->open_array_section("boundary dirfrags");
3140 for (vector<dirfrag_t>::const_iterator iter = bounds.begin();
3141 iter != bounds.end(); ++iter) {
3142 f->dump_stream("frag") << *iter;
3143 }
3144 f->close_section();
3145 }
3146
3147 void EImportStart::generate_test_instances(std::list<EImportStart*>& ls)
3148 {
3149 ls.push_back(new EImportStart);
3150 }
3151
3152 // -----------------------
3153 // EImportFinish
3154
3155 void EImportFinish::replay(MDSRank *mds)
3156 {
3157 if (mds->mdcache->have_ambiguous_import(base)) {
3158 dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
3159 if (success) {
3160 mds->mdcache->finish_ambiguous_import(base);
3161 } else {
3162 CDir *dir = mds->mdcache->get_dirfrag(base);
3163 ceph_assert(dir);
3164 vector<dirfrag_t> bounds;
3165 mds->mdcache->get_ambiguous_import_bounds(base, bounds);
3166 mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF);
3167 mds->mdcache->cancel_ambiguous_import(dir);
3168 mds->mdcache->try_trim_non_auth_subtree(dir);
3169 }
3170 } else {
3171 // this shouldn't happen unless this is an old journal
3172 dout(10) << "EImportFinish.replay " << base << " success=" << success
3173 << " on subtree not marked as ambiguous"
3174 << dendl;
3175 mds->clog->error() << "failure replaying journal (EImportFinish)";
3176 mds->damaged();
3177 ceph_abort(); // Should be unreachable because damaged() calls respawn()
3178 }
3179 }
3180
3181 void EImportFinish::encode(bufferlist& bl, uint64_t features) const
3182 {
3183 ENCODE_START(3, 3, bl);
3184 encode(stamp, bl);
3185 encode(base, bl);
3186 encode(success, bl);
3187 ENCODE_FINISH(bl);
3188 }
3189
3190 void EImportFinish::decode(bufferlist::const_iterator &bl)
3191 {
3192 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3193 if (struct_v >= 2)
3194 decode(stamp, bl);
3195 decode(base, bl);
3196 decode(success, bl);
3197 DECODE_FINISH(bl);
3198 }
3199
3200 void EImportFinish::dump(Formatter *f) const
3201 {
3202 f->dump_stream("base dirfrag") << base;
3203 f->dump_string("success", success ? "true" : "false");
3204 }
3205 void EImportFinish::generate_test_instances(std::list<EImportFinish*>& ls)
3206 {
3207 ls.push_back(new EImportFinish);
3208 ls.push_back(new EImportFinish);
3209 ls.back()->success = true;
3210 }
3211
3212
3213 // ------------------------
3214 // EResetJournal
3215
3216 void EResetJournal::encode(bufferlist& bl, uint64_t features) const
3217 {
3218 ENCODE_START(2, 2, bl);
3219 encode(stamp, bl);
3220 ENCODE_FINISH(bl);
3221 }
3222
3223 void EResetJournal::decode(bufferlist::const_iterator &bl)
3224 {
3225 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
3226 decode(stamp, bl);
3227 DECODE_FINISH(bl);
3228 }
3229
3230 void EResetJournal::dump(Formatter *f) const
3231 {
3232 f->dump_stream("timestamp") << stamp;
3233 }
3234
3235 void EResetJournal::generate_test_instances(std::list<EResetJournal*>& ls)
3236 {
3237 ls.push_back(new EResetJournal());
3238 }
3239
3240 void EResetJournal::replay(MDSRank *mds)
3241 {
3242 dout(1) << "EResetJournal" << dendl;
3243
3244 mds->sessionmap.wipe();
3245 mds->inotable->replay_reset();
3246
3247 if (mds->mdsmap->get_root() == mds->get_nodeid()) {
3248 CDir *rootdir = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t());
3249 mds->mdcache->adjust_subtree_auth(rootdir, mds->get_nodeid());
3250 }
3251
3252 CDir *mydir = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t());
3253 mds->mdcache->adjust_subtree_auth(mydir, mds->get_nodeid());
3254
3255 mds->mdcache->recalc_auth_bits(true);
3256
3257 mds->mdcache->show_subtrees();
3258 }
3259
3260
3261 void ENoOp::encode(bufferlist &bl, uint64_t features) const
3262 {
3263 ENCODE_START(2, 2, bl);
3264 encode(pad_size, bl);
3265 uint8_t const pad = 0xff;
3266 for (unsigned int i = 0; i < pad_size; ++i) {
3267 encode(pad, bl);
3268 }
3269 ENCODE_FINISH(bl);
3270 }
3271
3272
3273 void ENoOp::decode(bufferlist::const_iterator &bl)
3274 {
3275 DECODE_START(2, bl);
3276 decode(pad_size, bl);
3277 if (bl.get_remaining() != pad_size) {
3278 // This is spiritually an assertion, but expressing in a way that will let
3279 // journal debug tools catch it and recognise a malformed entry.
3280 throw buffer::end_of_buffer();
3281 } else {
3282 bl += pad_size;
3283 }
3284 DECODE_FINISH(bl);
3285 }
3286
3287
3288 void ENoOp::replay(MDSRank *mds)
3289 {
3290 dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
3291 }
3292
3293 /**
3294 * If re-formatting an old journal that used absolute log position
3295 * references as segment sequence numbers, use this function to update
3296 * it.
3297 *
3298 * @param mds
3299 * MDSRank instance, just used for logging
3300 * @param old_to_new
3301 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3302 *
3303 * @return
3304 * True if the event was modified.
3305 */
3306 bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
3307 std::map<LogSegment::seq_t, LogSegment::seq_t> const &old_to_new)
3308 {
3309 bool modified = false;
3310 map<inodeno_t, LogSegment::seq_t> new_trunc_finish;
3311 for (const auto& p : truncate_finish) {
3312 auto q = old_to_new.find(p.second);
3313 if (q != old_to_new.end()) {
3314 dout(20) << __func__ << " applying segment seq mapping "
3315 << p.second << " -> " << q->second << dendl;
3316 new_trunc_finish.emplace(p.first, q->second);
3317 modified = true;
3318 } else {
3319 dout(20) << __func__ << " no segment seq mapping found for "
3320 << p.second << dendl;
3321 new_trunc_finish.insert(p);
3322 }
3323 }
3324 truncate_finish.swap(new_trunc_finish);
3325
3326 return modified;
3327 }