]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/journal.cc
import ceph 16.2.6
[ceph.git] / ceph / src / mds / journal.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "common/config.h"
16#include "osdc/Journaler.h"
17#include "events/ESubtreeMap.h"
18#include "events/ESession.h"
19#include "events/ESessions.h"
20
21#include "events/EMetaBlob.h"
22#include "events/EResetJournal.h"
23#include "events/ENoOp.h"
24
25#include "events/EUpdate.h"
f67539c2 26#include "events/EPeerUpdate.h"
7c673cae
FG
27#include "events/EOpen.h"
28#include "events/ECommitted.h"
9f95a23c 29#include "events/EPurged.h"
7c673cae
FG
30
31#include "events/EExport.h"
32#include "events/EImportStart.h"
33#include "events/EImportFinish.h"
34#include "events/EFragment.h"
35
36#include "events/ETableClient.h"
37#include "events/ETableServer.h"
38
39#include "include/stringify.h"
40
41#include "LogSegment.h"
42
43#include "MDSRank.h"
44#include "MDLog.h"
45#include "MDCache.h"
46#include "Server.h"
47#include "Migrator.h"
48#include "Mutation.h"
49
50#include "InoTable.h"
51#include "MDSTableClient.h"
52#include "MDSTableServer.h"
53
54#include "Locker.h"
55
56#define dout_context g_ceph_context
57#define dout_subsys ceph_subsys_mds
58#undef dout_prefix
59#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
60
61
62// -----------------------
63// LogSegment
64
f67539c2
TL
65struct BatchStoredBacktrace : public MDSIOContext {
66 MDSContext *fin;
67 std::vector<CInodeCommitOperations> ops_vec;
68
69 BatchStoredBacktrace(MDSRank *m, MDSContext *f,
70 std::vector<CInodeCommitOperations>&& ops) :
71 MDSIOContext(m), fin(f), ops_vec(std::move(ops)) {}
72 void finish(int r) override {
73 for (auto& op : ops_vec) {
74 op.in->_stored_backtrace(r, op.version, nullptr);
75 }
76 fin->complete(r);
77 }
78 void print(ostream& out) const override {
79 out << "batch backtrace_store";
80 }
81};
82
83struct BatchCommitBacktrace : public Context {
84 MDSRank *mds;
85 MDSContext *fin;
86 std::vector<CInodeCommitOperations> ops_vec;
87
88 BatchCommitBacktrace(MDSRank *m, MDSContext *f,
89 std::vector<CInodeCommitOperations>&& ops) :
90 mds(m), fin(f), ops_vec(std::move(ops)) {}
91 void finish(int r) override {
92 C_GatherBuilder gather(g_ceph_context);
93
94 for (auto &op : ops_vec) {
95 op.in->_commit_ops(r, gather, op.ops_vec, op.bt);
96 op.ops_vec.clear();
97 op.bt.clear();
98 }
99 ceph_assert(gather.has_subs());
100 gather.set_finisher(new C_OnFinisher(
101 new BatchStoredBacktrace(mds, fin, std::move(ops_vec)),
102 mds->finisher));
103 gather.activate();
104 }
105};
106
7c673cae
FG
107void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
108{
109 set<CDir*> commit;
110
111 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl;
112
11fdf7f2 113 ceph_assert(g_conf()->mds_kill_journal_expire_at != 1);
7c673cae
FG
114
115 // commit dirs
116 for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) {
117 dout(20) << " new_dirfrag " << **p << dendl;
11fdf7f2 118 ceph_assert((*p)->is_auth());
7c673cae
FG
119 commit.insert(*p);
120 }
121 for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) {
122 dout(20) << " dirty_dirfrag " << **p << dendl;
11fdf7f2 123 ceph_assert((*p)->is_auth());
7c673cae
FG
124 commit.insert(*p);
125 }
126 for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) {
127 dout(20) << " dirty_dentry " << **p << dendl;
11fdf7f2 128 ceph_assert((*p)->is_auth());
7c673cae
FG
129 commit.insert((*p)->get_dir());
130 }
131 for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) {
132 dout(20) << " dirty_inode " << **p << dendl;
11fdf7f2 133 ceph_assert((*p)->is_auth());
7c673cae
FG
134 if ((*p)->is_base()) {
135 (*p)->store(gather_bld.new_sub());
136 } else
137 commit.insert((*p)->get_parent_dn()->get_dir());
138 }
139
140 if (!commit.empty()) {
141 for (set<CDir*>::iterator p = commit.begin();
142 p != commit.end();
143 ++p) {
144 CDir *dir = *p;
11fdf7f2 145 ceph_assert(dir->is_auth());
7c673cae
FG
146 if (dir->can_auth_pin()) {
147 dout(15) << "try_to_expire committing " << *dir << dendl;
148 dir->commit(0, gather_bld.new_sub(), false, op_prio);
149 } else {
150 dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
151 dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
152 }
153 }
154 }
155
f67539c2
TL
156 // leader ops with possibly uncommitted peers
157 for (set<metareqid_t>::iterator p = uncommitted_leaders.begin();
158 p != uncommitted_leaders.end();
7c673cae 159 ++p) {
f67539c2
TL
160 dout(10) << "try_to_expire waiting for peers to ack commit on " << *p << dendl;
161 mds->mdcache->wait_for_uncommitted_leader(*p, gather_bld.new_sub());
7c673cae
FG
162 }
163
f67539c2
TL
164 // peer ops that haven't been committed
165 for (set<metareqid_t>::iterator p = uncommitted_peers.begin();
166 p != uncommitted_peers.end();
e306af50 167 ++p) {
f67539c2
TL
168 dout(10) << "try_to_expire waiting for leader to ack OP_FINISH on " << *p << dendl;
169 mds->mdcache->wait_for_uncommitted_peer(*p, gather_bld.new_sub());
e306af50
TL
170 }
171
7c673cae
FG
172 // uncommitted fragments
173 for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin();
174 p != uncommitted_fragments.end();
175 ++p) {
176 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl;
177 mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub());
178 }
179
180 // nudge scatterlocks
181 for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
182 CInode *in = *p;
183 dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
184 mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub());
185 }
186 for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) {
187 CInode *in = *p;
188 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl;
189 mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub());
190 }
191 for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) {
192 CInode *in = *p;
193 dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl;
194 mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub());
195 }
196
11fdf7f2 197 ceph_assert(g_conf()->mds_kill_journal_expire_at != 2);
7c673cae
FG
198
199 // open files and snap inodes
200 if (!open_files.empty()) {
11fdf7f2 201 ceph_assert(!mds->mdlog->is_capped()); // hmm FIXME
7c673cae
FG
202 EOpen *le = 0;
203 LogSegment *ls = mds->mdlog->get_current_segment();
11fdf7f2 204 ceph_assert(ls != this);
7c673cae
FG
205 elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file));
206 while (!p.end()) {
207 CInode *in = *p;
208 ++p;
11fdf7f2 209 if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) {
7c673cae
FG
210 // journal snap inodes that need flush. This simplify the mds failover hanlding
211 dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
212 if (!le) {
213 le = new EOpen(mds->mdlog);
214 mds->mdlog->start_entry(le);
215 }
216 le->add_clean_inode(in);
217 ls->open_files.push_back(&in->item_open_file);
218 } else {
11fdf7f2 219 // open files are tracked by open file table, no need to journal them again
7c673cae
FG
220 in->item_open_file.remove_myself();
221 }
222 }
223 if (le) {
224 mds->mdlog->submit_entry(le);
225 mds->mdlog->wait_for_safe(gather_bld.new_sub());
226 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
227 }
228 }
229
11fdf7f2 230 ceph_assert(g_conf()->mds_kill_journal_expire_at != 3);
7c673cae 231
f67539c2
TL
232 size_t count = 0;
233 for (elist<CInode*>::iterator it = dirty_parent_inodes.begin(); !it.end(); ++it)
234 count++;
235
236 std::vector<CInodeCommitOperations> ops_vec;
237 ops_vec.reserve(count);
7c673cae
FG
238 // backtraces to be stored/updated
239 for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
240 CInode *in = *p;
11fdf7f2 241 ceph_assert(in->is_auth());
7c673cae
FG
242 if (in->can_auth_pin()) {
243 dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
f67539c2
TL
244 ops_vec.resize(ops_vec.size() + 1);
245 in->store_backtrace(ops_vec.back(), op_prio);
7c673cae
FG
246 } else {
247 dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
248 in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
249 }
250 }
f67539c2
TL
251 if (!ops_vec.empty())
252 mds->finisher->queue(new BatchCommitBacktrace(mds, gather_bld.new_sub(), std::move(ops_vec)));
7c673cae 253
11fdf7f2 254 ceph_assert(g_conf()->mds_kill_journal_expire_at != 4);
7c673cae 255
7c673cae
FG
256 // idalloc
257 if (inotablev > mds->inotable->get_committed_version()) {
258 dout(10) << "try_to_expire saving inotable table, need " << inotablev
259 << ", committed is " << mds->inotable->get_committed_version()
260 << " (" << mds->inotable->get_committing_version() << ")"
261 << dendl;
262 mds->inotable->save(gather_bld.new_sub(), inotablev);
263 }
264
265 // sessionmap
266 if (sessionmapv > mds->sessionmap.get_committed()) {
267 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
268 << ", committed is " << mds->sessionmap.get_committed()
269 << " (" << mds->sessionmap.get_committing() << ")"
270 << dendl;
271 mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
272 }
273
274 // updates to sessions for completed_requests
275 mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
276 touched_sessions.clear();
277
278 // pending commit atids
279 for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
280 p != pending_commit_tids.end();
281 ++p) {
282 MDSTableClient *client = mds->get_table_client(p->first);
11fdf7f2 283 ceph_assert(client);
7c673cae
FG
284 for (ceph::unordered_set<version_t>::iterator q = p->second.begin();
285 q != p->second.end();
286 ++q) {
287 dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q
288 << " pending commit (not yet acked), waiting" << dendl;
11fdf7f2 289 ceph_assert(!client->has_committed(*q));
7c673cae
FG
290 client->wait_for_ack(*q, gather_bld.new_sub());
291 }
292 }
293
294 // table servers
295 for (map<int, version_t>::iterator p = tablev.begin();
296 p != tablev.end();
297 ++p) {
298 MDSTableServer *server = mds->get_table_server(p->first);
11fdf7f2 299 ceph_assert(server);
7c673cae
FG
300 if (p->second > server->get_committed_version()) {
301 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first)
302 << " to save, need " << p->second << dendl;
303 server->save(gather_bld.new_sub());
304 }
305 }
306
307 // truncating
308 for (set<CInode*>::iterator p = truncating_inodes.begin();
309 p != truncating_inodes.end();
310 ++p) {
311 dout(10) << "try_to_expire waiting for truncate of " << **p << dendl;
312 (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub());
313 }
9f95a23c 314 // purge inodes
f67539c2
TL
315 dout(10) << "try_to_expire waiting for purge of " << purging_inodes << dendl;
316 if (purging_inodes.size())
9f95a23c 317 set_purged_cb(gather_bld.new_sub());
7c673cae
FG
318
319 if (gather_bld.has_subs()) {
320 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl;
321 mds->mdlog->flush();
322 } else {
11fdf7f2 323 ceph_assert(g_conf()->mds_kill_journal_expire_at != 5);
7c673cae
FG
324 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl;
325 }
326}
327
7c673cae
FG
328// -----------------------
329// EMetaBlob
330
7c673cae
FG
331void EMetaBlob::add_dir_context(CDir *dir, int mode)
332{
f67539c2 333 MDSRank *mds = dir->mdcache->mds;
7c673cae
FG
334
335 list<CDentry*> parents;
336
337 // it may be okay not to include the maybe items, if
338 // - we journaled the maybe child inode in this segment
339 // - that subtree turns out to be unambiguously auth
340 list<CDentry*> maybe;
341 bool maybenot = false;
342
343 while (true) {
344 // already have this dir? (we must always add in order)
345 if (lump_map.count(dir->dirfrag())) {
346 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl;
347 break;
348 }
349
350 // stop at root/stray
351 CInode *diri = dir->get_inode();
352 CDentry *parent = diri->get_projected_parent_dn();
353
354 if (mode == TO_AUTH_SUBTREE_ROOT) {
355 // subtree root?
31f18b77
FG
356 if (dir->is_subtree_root()) {
357 // match logic in MDCache::create_subtree_map()
358 if (dir->get_dir_auth().first == mds->get_nodeid()) {
359 mds_authority_t parent_auth = parent ? parent->authority() : CDIR_AUTH_UNDEF;
360 if (parent_auth.first == dir->get_dir_auth().first) {
361 if (parent_auth.second == CDIR_AUTH_UNKNOWN &&
362 !dir->is_ambiguous_dir_auth() &&
363 !dir->state_test(CDir::STATE_EXPORTBOUND) &&
364 !dir->state_test(CDir::STATE_AUXSUBTREE) &&
365 !diri->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
366 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir << dendl;
11fdf7f2 367 ceph_abort();
31f18b77
FG
368 }
369 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") ambiguous or transient subtree " << dendl;
7c673cae
FG
370 } else {
371 // it's an auth subtree, we don't need maybe (if any), and we're done.
372 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
373 << " at " << *dir << dendl;
374 maybe.clear();
375 break;
376 }
377 } else {
378 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
379 << " at " << *dir << dendl;
380 // we need the maybe list after all!
381 parents.splice(parents.begin(), maybe);
382 maybenot = false;
383 }
384 }
31f18b77 385
7c673cae
FG
386 // was the inode journaled in this blob?
387 if (event_seq && diri->last_journaled == event_seq) {
388 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl;
389 break;
390 }
391
392 // have we journaled this inode since the last subtree map?
393 if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) {
394 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment ("
395 << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag "
396 << *diri << dendl;
397 maybenot = true;
398 }
399 }
400
401 if (!parent)
402 break;
403
404 if (maybenot) {
405 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl;
406 maybe.push_front(parent);
407 } else {
408 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl;
409 parents.push_front(parent);
410 }
411
412 dir = parent->get_dir();
413 }
414
415 parents.splice(parents.begin(), maybe);
416
417 dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
9f95a23c
TL
418 for (const auto& dentry : parents) {
419 ceph_assert(dentry->get_projected_linkage()->is_primary());
420 add_dentry(dentry, false);
7c673cae
FG
421 }
422}
423
424void EMetaBlob::update_segment(LogSegment *ls)
425{
426 // dirty inode mtimes
427 // -> handled directly by Server.cc, replay()
428
429 // alloc table update?
430 if (inotablev)
431 ls->inotablev = inotablev;
432 if (sessionmapv)
433 ls->sessionmapv = sessionmapv;
434
435 // truncated inodes
436 // -> handled directly by Server.cc
437
438 // client requests
439 // note the newest request per client
440 //if (!client_reqs.empty())
441 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
442}
443
444// EMetaBlob::fullbit
445
446void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const {
f67539c2 447 ENCODE_START(9, 5, bl);
11fdf7f2
TL
448 encode(dn, bl);
449 encode(dnfirst, bl);
450 encode(dnlast, bl);
451 encode(dnv, bl);
f67539c2
TL
452 encode(*inode, bl, features);
453 if (xattrs)
454 encode(*xattrs, bl);
455 else
456 encode((__u32)0, bl);
457
458 if (inode->is_symlink())
11fdf7f2 459 encode(symlink, bl);
f67539c2 460 if (inode->is_dir()) {
11fdf7f2
TL
461 encode(dirfragtree, bl);
462 encode(snapbl, bl);
7c673cae 463 }
11fdf7f2 464 encode(state, bl);
f67539c2 465 if (!old_inodes || old_inodes->empty()) {
11fdf7f2 466 encode(false, bl);
7c673cae 467 } else {
11fdf7f2 468 encode(true, bl);
f67539c2 469 encode(*old_inodes, bl, features);
7c673cae 470 }
f67539c2 471 if (!inode->is_dir())
11fdf7f2
TL
472 encode(snapbl, bl);
473 encode(oldest_snap, bl);
f67539c2 474 encode(alternate_name, bl);
7c673cae
FG
475 ENCODE_FINISH(bl);
476}
477
11fdf7f2 478void EMetaBlob::fullbit::decode(bufferlist::const_iterator &bl) {
f67539c2 479 DECODE_START(9, bl);
11fdf7f2
TL
480 decode(dn, bl);
481 decode(dnfirst, bl);
482 decode(dnlast, bl);
483 decode(dnv, bl);
f67539c2
TL
484 {
485 auto _inode = CInode::allocate_inode();
486 decode(*_inode, bl);
487 inode = std::move(_inode);
488 }
489 {
490 CInode::mempool_xattr_map tmp;
491 decode_noshare(tmp, bl);
492 if (!tmp.empty())
493 xattrs = CInode::allocate_xattr_map(std::move(tmp));
494 }
495 if (inode->is_symlink())
11fdf7f2 496 decode(symlink, bl);
f67539c2 497 if (inode->is_dir()) {
11fdf7f2
TL
498 decode(dirfragtree, bl);
499 decode(snapbl, bl);
7c673cae 500 }
f67539c2
TL
501 decode(state, bl);
502 bool old_inodes_present;
503 decode(old_inodes_present, bl);
504 if (old_inodes_present) {
505 auto _old_inodes = CInode::allocate_old_inode_map();
506 decode(*_old_inodes, bl);
507 old_inodes = std::move(_old_inodes);
508 }
509 if (!inode->is_dir()) {
510 decode(snapbl, bl);
7c673cae 511 }
f67539c2
TL
512 decode(oldest_snap, bl);
513 if (struct_v >= 9) {
514 decode(alternate_name, bl);
7c673cae 515 }
7c673cae
FG
516 DECODE_FINISH(bl);
517}
518
519void EMetaBlob::fullbit::dump(Formatter *f) const
520{
521 f->dump_string("dentry", dn);
522 f->dump_stream("snapid.first") << dnfirst;
523 f->dump_stream("snapid.last") << dnlast;
524 f->dump_int("dentry version", dnv);
525 f->open_object_section("inode");
f67539c2 526 inode->dump(f);
7c673cae
FG
527 f->close_section(); // inode
528 f->open_object_section("xattrs");
f67539c2
TL
529 if (xattrs) {
530 for (const auto &p : *xattrs) {
531 std::string s(p.second.c_str(), p.second.length());
532 f->dump_string(p.first.c_str(), s);
533 }
7c673cae
FG
534 }
535 f->close_section(); // xattrs
f67539c2 536 if (inode->is_symlink()) {
7c673cae
FG
537 f->dump_string("symlink", symlink);
538 }
f67539c2 539 if (inode->is_dir()) {
7c673cae
FG
540 f->dump_stream("frag tree") << dirfragtree;
541 f->dump_string("has_snapbl", snapbl.length() ? "true" : "false");
f67539c2 542 if (inode->has_layout()) {
7c673cae
FG
543 f->open_object_section("file layout policy");
544 // FIXME
545 f->dump_string("layout", "the layout exists");
546 f->close_section(); // file layout policy
547 }
548 }
549 f->dump_string("state", state_string());
f67539c2 550 if (old_inodes && !old_inodes->empty()) {
7c673cae 551 f->open_array_section("old inodes");
f67539c2 552 for (const auto &p : *old_inodes) {
7c673cae 553 f->open_object_section("inode");
94b18763
FG
554 f->dump_int("snapid", p.first);
555 p.second.dump(f);
7c673cae
FG
556 f->close_section(); // inode
557 }
558 f->close_section(); // old inodes
559 }
f67539c2 560 f->dump_string("alternate_name", alternate_name);
7c673cae
FG
561}
562
9f95a23c 563void EMetaBlob::fullbit::generate_test_instances(std::list<EMetaBlob::fullbit*>& ls)
7c673cae 564{
f67539c2 565 auto _inode = CInode::allocate_inode();
7c673cae 566 fragtree_t fragtree;
f67539c2 567 auto _xattrs = CInode::allocate_xattr_map();
7c673cae 568 bufferlist empty_snapbl;
f67539c2
TL
569 fullbit *sample = new fullbit("/testdn", "", 0, 0, 0,
570 _inode, fragtree, _xattrs, "", 0, empty_snapbl,
7c673cae
FG
571 false, NULL);
572 ls.push_back(sample);
573}
574
575void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
576{
f67539c2
TL
577 in->reset_inode(std::move(inode));
578 in->reset_xattrs(std::move(xattrs));
579 if (in->is_dir()) {
f6b5b4d7
TL
580 if (is_export_ephemeral_random()) {
581 dout(15) << "random ephemeral pin on " << *in << dendl;
f67539c2 582 in->set_ephemeral_pin(false, true);
f6b5b4d7 583 }
f6b5b4d7 584 in->maybe_export_pin();
7c673cae
FG
585 if (!(in->dirfragtree == dirfragtree)) {
586 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
587 << dirfragtree << " on " << *in << dendl;
f67539c2 588 in->dirfragtree = std::move(dirfragtree);
7c673cae 589 in->force_dirfrags();
9f95a23c
TL
590 if (in->get_num_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) {
591 auto&& ls = in->get_nested_dirfrags();
592 for (const auto& dir : ls) {
7c673cae
FG
593 if (dir->get_num_any() == 0 &&
594 mds->mdcache->can_trim_non_auth_dirfrag(dir)) {
595 dout(10) << " closing empty non-auth dirfrag " << *dir << dendl;
596 in->close_dirfrag(dir->get_frag());
597 }
598 }
599 }
600 }
f67539c2 601 } else if (in->is_symlink()) {
11fdf7f2 602 in->symlink = symlink;
7c673cae 603 }
f67539c2
TL
604 in->reset_old_inodes(std::move(old_inodes));
605 if (in->is_any_old_inodes()) {
606 snapid_t min_first = in->get_old_inodes()->rbegin()->first + 1;
7c673cae
FG
607 if (min_first > in->first)
608 in->first = min_first;
609 }
610
611 /*
612 * we can do this before linking hte inode bc the split_at would
613 * be a no-op.. we have no children (namely open snaprealms) to
614 * divy up
615 */
616 in->oldest_snap = oldest_snap;
617 in->decode_snap_blob(snapbl);
618
619 /*
620 * In case there was anything malformed in the journal that we are
621 * replaying, do sanity checks on the inodes we're replaying and
622 * go damaged instead of letting any trash into a live cache
623 */
624 if (in->is_file()) {
625 // Files must have valid layouts with a pool set
f67539c2
TL
626 if (in->get_inode()->layout.pool_id == -1 ||
627 !in->get_inode()->layout.is_valid()) {
7c673cae 628 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
f67539c2
TL
629 << ": " << in->get_inode()->layout << dendl;
630 CachedStackStringStream css;
631 *css << "Invalid layout for inode " << in->ino() << " in journal";
632 mds->clog->error() << css->strv();
7c673cae
FG
633 mds->damaged();
634 ceph_abort(); // Should be unreachable because damaged() calls respawn()
635 }
636 }
637}
638
639// EMetaBlob::remotebit
640
641void EMetaBlob::remotebit::encode(bufferlist& bl) const
642{
f67539c2 643 ENCODE_START(3, 2, bl);
11fdf7f2
TL
644 encode(dn, bl);
645 encode(dnfirst, bl);
646 encode(dnlast, bl);
647 encode(dnv, bl);
648 encode(ino, bl);
649 encode(d_type, bl);
650 encode(dirty, bl);
f67539c2 651 encode(alternate_name, bl);
7c673cae
FG
652 ENCODE_FINISH(bl);
653}
654
11fdf7f2 655void EMetaBlob::remotebit::decode(bufferlist::const_iterator &bl)
7c673cae 656{
f67539c2 657 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
11fdf7f2
TL
658 decode(dn, bl);
659 decode(dnfirst, bl);
660 decode(dnlast, bl);
661 decode(dnv, bl);
662 decode(ino, bl);
663 decode(d_type, bl);
664 decode(dirty, bl);
f67539c2
TL
665 if (struct_v >= 3)
666 decode(alternate_name, bl);
7c673cae
FG
667 DECODE_FINISH(bl);
668}
669
670void EMetaBlob::remotebit::dump(Formatter *f) const
671{
672 f->dump_string("dentry", dn);
673 f->dump_int("snapid.first", dnfirst);
674 f->dump_int("snapid.last", dnlast);
675 f->dump_int("dentry version", dnv);
676 f->dump_int("inodeno", ino);
677 uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries
678 string type_string;
679 switch(type) {
680 case S_IFREG:
681 type_string = "file"; break;
682 case S_IFLNK:
683 type_string = "symlink"; break;
684 case S_IFDIR:
685 type_string = "directory"; break;
686 case S_IFIFO:
687 type_string = "fifo"; break;
688 case S_IFCHR:
689 type_string = "chr"; break;
690 case S_IFBLK:
691 type_string = "blk"; break;
692 case S_IFSOCK:
693 type_string = "sock"; break;
694 default:
695 assert (0 == "unknown d_type!");
696 }
697 f->dump_string("d_type", type_string);
698 f->dump_string("dirty", dirty ? "true" : "false");
f67539c2 699 f->dump_string("alternate_name", alternate_name);
7c673cae
FG
700}
701
702void EMetaBlob::remotebit::
9f95a23c 703generate_test_instances(std::list<EMetaBlob::remotebit*>& ls)
7c673cae 704{
f67539c2
TL
705 remotebit *remote = new remotebit("/test/dn", "", 0, 10, 15, 1, IFTODT(S_IFREG), false);
706 ls.push_back(remote);
707 remote = new remotebit("/test/dn2", "foo", 0, 10, 15, 1, IFTODT(S_IFREG), false);
7c673cae
FG
708 ls.push_back(remote);
709}
710
711// EMetaBlob::nullbit
712
713void EMetaBlob::nullbit::encode(bufferlist& bl) const
714{
715 ENCODE_START(2, 2, bl);
11fdf7f2
TL
716 encode(dn, bl);
717 encode(dnfirst, bl);
718 encode(dnlast, bl);
719 encode(dnv, bl);
720 encode(dirty, bl);
7c673cae
FG
721 ENCODE_FINISH(bl);
722}
723
11fdf7f2 724void EMetaBlob::nullbit::decode(bufferlist::const_iterator &bl)
7c673cae
FG
725{
726 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2
TL
727 decode(dn, bl);
728 decode(dnfirst, bl);
729 decode(dnlast, bl);
730 decode(dnv, bl);
731 decode(dirty, bl);
7c673cae
FG
732 DECODE_FINISH(bl);
733}
734
735void EMetaBlob::nullbit::dump(Formatter *f) const
736{
737 f->dump_string("dentry", dn);
738 f->dump_int("snapid.first", dnfirst);
739 f->dump_int("snapid.last", dnlast);
740 f->dump_int("dentry version", dnv);
741 f->dump_string("dirty", dirty ? "true" : "false");
742}
743
9f95a23c 744void EMetaBlob::nullbit::generate_test_instances(std::list<nullbit*>& ls)
7c673cae
FG
745{
746 nullbit *sample = new nullbit("/test/dentry", 0, 10, 15, false);
747 nullbit *sample2 = new nullbit("/test/dirty", 10, 20, 25, true);
748 ls.push_back(sample);
749 ls.push_back(sample2);
750}
751
752// EMetaBlob::dirlump
753
754void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const
755{
756 ENCODE_START(2, 2, bl);
f67539c2 757 encode(*fnode, bl);
11fdf7f2
TL
758 encode(state, bl);
759 encode(nfull, bl);
760 encode(nremote, bl);
761 encode(nnull, bl);
7c673cae 762 _encode_bits(features);
11fdf7f2 763 encode(dnbl, bl);
7c673cae
FG
764 ENCODE_FINISH(bl);
765}
766
11fdf7f2 767void EMetaBlob::dirlump::decode(bufferlist::const_iterator &bl)
7c673cae
FG
768{
769 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
f67539c2
TL
770 {
771 auto _fnode = CDir::allocate_fnode();
772 decode(*_fnode, bl);
773 fnode = std::move(_fnode);
774 }
11fdf7f2
TL
775 decode(state, bl);
776 decode(nfull, bl);
777 decode(nremote, bl);
778 decode(nnull, bl);
779 decode(dnbl, bl);
7c673cae
FG
780 dn_decoded = false; // don't decode bits unless we need them.
781 DECODE_FINISH(bl);
782}
783
784void EMetaBlob::dirlump::dump(Formatter *f) const
785{
786 if (!dn_decoded) {
787 dirlump *me = const_cast<dirlump*>(this);
788 me->_decode_bits();
789 }
790 f->open_object_section("fnode");
f67539c2 791 fnode->dump(f);
7c673cae
FG
792 f->close_section(); // fnode
793 f->dump_string("state", state_string());
794 f->dump_int("nfull", nfull);
795 f->dump_int("nremote", nremote);
796 f->dump_int("nnull", nnull);
797
798 f->open_array_section("full bits");
11fdf7f2 799 for (const auto& iter : dfull) {
7c673cae 800 f->open_object_section("fullbit");
11fdf7f2 801 iter.dump(f);
7c673cae
FG
802 f->close_section(); // fullbit
803 }
804 f->close_section(); // full bits
805 f->open_array_section("remote bits");
11fdf7f2 806 for (const auto& iter : dremote) {
7c673cae 807 f->open_object_section("remotebit");
11fdf7f2 808 iter.dump(f);
7c673cae
FG
809 f->close_section(); // remotebit
810 }
811 f->close_section(); // remote bits
812 f->open_array_section("null bits");
11fdf7f2 813 for (const auto& iter : dnull) {
7c673cae 814 f->open_object_section("null bit");
11fdf7f2 815 iter.dump(f);
7c673cae
FG
816 f->close_section(); // null bit
817 }
818 f->close_section(); // null bits
819}
820
9f95a23c 821void EMetaBlob::dirlump::generate_test_instances(std::list<dirlump*>& ls)
7c673cae 822{
f67539c2
TL
823 auto dl = new dirlump();
824 dl->fnode = CDir::allocate_fnode();
825 ls.push_back(dl);
7c673cae
FG
826}
827
828/**
829 * EMetaBlob proper
830 */
831void EMetaBlob::encode(bufferlist& bl, uint64_t features) const
832{
833 ENCODE_START(8, 5, bl);
11fdf7f2
TL
834 encode(lump_order, bl);
835 encode(lump_map, bl, features);
836 encode(roots, bl, features);
837 encode(table_tids, bl);
838 encode(opened_ino, bl);
839 encode(allocated_ino, bl);
840 encode(used_preallocated_ino, bl);
841 encode(preallocated_inos, bl);
842 encode(client_name, bl);
843 encode(inotablev, bl);
844 encode(sessionmapv, bl);
845 encode(truncate_start, bl);
846 encode(truncate_finish, bl);
847 encode(destroyed_inodes, bl);
848 encode(client_reqs, bl);
849 encode(renamed_dirino, bl);
850 encode(renamed_dir_frags, bl);
7c673cae
FG
851 {
852 // make MDSRank use v6 format happy
853 int64_t i = -1;
854 bool b = false;
11fdf7f2
TL
855 encode(i, bl);
856 encode(b, bl);
7c673cae 857 }
11fdf7f2 858 encode(client_flushes, bl);
7c673cae
FG
859 ENCODE_FINISH(bl);
860}
11fdf7f2 861void EMetaBlob::decode(bufferlist::const_iterator &bl)
7c673cae 862{
9f95a23c 863 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
11fdf7f2
TL
864 decode(lump_order, bl);
865 decode(lump_map, bl);
7c673cae 866 if (struct_v >= 4) {
11fdf7f2 867 decode(roots, bl);
7c673cae
FG
868 } else {
869 bufferlist rootbl;
11fdf7f2 870 decode(rootbl, bl);
7c673cae 871 if (rootbl.length()) {
11fdf7f2
TL
872 auto p = rootbl.cbegin();
873 roots.emplace_back(p);
7c673cae
FG
874 }
875 }
11fdf7f2
TL
876 decode(table_tids, bl);
877 decode(opened_ino, bl);
878 decode(allocated_ino, bl);
879 decode(used_preallocated_ino, bl);
880 decode(preallocated_inos, bl);
881 decode(client_name, bl);
882 decode(inotablev, bl);
883 decode(sessionmapv, bl);
884 decode(truncate_start, bl);
885 decode(truncate_finish, bl);
886 decode(destroyed_inodes, bl);
7c673cae 887 if (struct_v >= 2) {
11fdf7f2 888 decode(client_reqs, bl);
7c673cae
FG
889 } else {
890 list<metareqid_t> r;
11fdf7f2 891 decode(r, bl);
7c673cae
FG
892 while (!r.empty()) {
893 client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0));
894 r.pop_front();
895 }
896 }
897 if (struct_v >= 3) {
11fdf7f2
TL
898 decode(renamed_dirino, bl);
899 decode(renamed_dir_frags, bl);
7c673cae
FG
900 }
901 if (struct_v >= 6) {
902 // ignore
903 int64_t i;
904 bool b;
11fdf7f2
TL
905 decode(i, bl);
906 decode(b, bl);
7c673cae
FG
907 }
908 if (struct_v >= 8) {
11fdf7f2 909 decode(client_flushes, bl);
7c673cae
FG
910 }
911 DECODE_FINISH(bl);
912}
913
914
915/**
916 * Get all inodes touched by this metablob. Includes the 'bits' within
917 * dirlumps, and the inodes of the dirs themselves.
918 */
919void EMetaBlob::get_inodes(
920 std::set<inodeno_t> &inodes) const
921{
922 // For all dirlumps in this metablob
923 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
924 // Record inode of dirlump
925 inodeno_t const dir_ino = i->first.ino;
926 inodes.insert(dir_ino);
927
928 // Decode dirlump bits
929 dirlump const &dl = i->second;
930 dl._decode_bits();
931
932 // Record inodes of fullbits
11fdf7f2 933 for (const auto& iter : dl.get_dfull()) {
f67539c2 934 inodes.insert(iter.inode->ino);
7c673cae
FG
935 }
936
937 // Record inodes of remotebits
11fdf7f2
TL
938 for (const auto& iter : dl.get_dremote()) {
939 inodes.insert(iter.ino);
7c673cae
FG
940 }
941 }
942}
943
944
945/**
946 * Get a map of dirfrag to set of dentries in that dirfrag which are
947 * touched in this operation.
948 */
949void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const
950{
951 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
952 dirlump const &dl = i->second;
953 dirfrag_t const &df = i->first;
954
955 // Get all bits
956 dl._decode_bits();
7c673cae
FG
957
958 // For all bits, store dentry
11fdf7f2
TL
959 for (const auto& iter : dl.get_dfull()) {
960 dentries[df].insert(iter.dn);
7c673cae 961 }
11fdf7f2
TL
962 for (const auto& iter : dl.get_dremote()) {
963 dentries[df].insert(iter.dn);
7c673cae 964 }
11fdf7f2
TL
965 for (const auto& iter : dl.get_dnull()) {
966 dentries[df].insert(iter.dn);
7c673cae
FG
967 }
968 }
969}
970
971
972
973/**
974 * Calculate all paths that we can infer are touched by this metablob. Only uses
975 * information local to this metablob so it may only be the path within the
976 * subtree.
977 */
978void EMetaBlob::get_paths(
979 std::vector<std::string> &paths) const
980{
981 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
982 typedef std::pair<inodeno_t, std::string> Location;
983
984 // Whenever we see a dentry within a dirlump, we remember it as a child of
985 // the dirlump's inode
9f95a23c 986 std::map<inodeno_t, std::vector<std::string> > children;
7c673cae
FG
987
988 // Whenever we see a location for an inode, remember it: this allows us to
989 // build a path given an inode
990 std::map<inodeno_t, Location> ino_locations;
991
992 // Special case: operations on root inode populate roots but not dirlumps
993 if (lump_map.empty() && !roots.empty()) {
994 paths.push_back("/");
995 return;
996 }
997
998 // First pass
999 // ==========
1000 // Build a tiny local metadata cache for the path structure in this metablob
1001 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
1002 inodeno_t const dir_ino = i->first.ino;
1003 dirlump const &dl = i->second;
1004 dl._decode_bits();
1005
11fdf7f2
TL
1006 for (const auto& iter : dl.get_dfull()) {
1007 std::string_view dentry = iter.dn;
94b18763 1008 children[dir_ino].emplace_back(dentry);
f67539c2 1009 ino_locations[iter.inode->ino] = Location(dir_ino, dentry);
7c673cae
FG
1010 }
1011
11fdf7f2
TL
1012 for (const auto& iter : dl.get_dremote()) {
1013 std::string_view dentry = iter.dn;
94b18763 1014 children[dir_ino].emplace_back(dentry);
7c673cae
FG
1015 }
1016
11fdf7f2
TL
1017 for (const auto& iter : dl.get_dnull()) {
1018 std::string_view dentry = iter.dn;
94b18763 1019 children[dir_ino].emplace_back(dentry);
7c673cae
FG
1020 }
1021 }
1022
1023 std::vector<Location> leaf_locations;
1024
1025 // Second pass
1026 // ===========
1027 // Output paths for all childless nodes in the metablob
1028 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
1029 inodeno_t const dir_ino = i->first.ino;
1030 dirlump const &dl = i->second;
1031 dl._decode_bits();
1032
11fdf7f2
TL
1033 for (const auto& iter : dl.get_dfull()) {
1034 std::string_view dentry = iter.dn;
f67539c2 1035 if (children.find(iter.inode->ino) == children.end()) {
11fdf7f2 1036 leaf_locations.push_back(Location(dir_ino, dentry));
7c673cae
FG
1037 }
1038 }
1039
11fdf7f2
TL
1040 for (const auto& iter : dl.get_dremote()) {
1041 std::string_view dentry = iter.dn;
1042 leaf_locations.push_back(Location(dir_ino, dentry));
7c673cae
FG
1043 }
1044
11fdf7f2
TL
1045 for (const auto& iter : dl.get_dnull()) {
1046 std::string_view dentry = iter.dn;
1047 leaf_locations.push_back(Location(dir_ino, dentry));
7c673cae
FG
1048 }
1049 }
1050
1051 // For all the leaf locations identified, generate paths
1052 for (std::vector<Location>::iterator i = leaf_locations.begin(); i != leaf_locations.end(); ++i) {
1053 Location const &loc = *i;
1054 std::string path = loc.second;
1055 inodeno_t ino = loc.first;
11fdf7f2
TL
1056 std::map<inodeno_t, Location>::iterator iter = ino_locations.find(ino);
1057 while(iter != ino_locations.end()) {
1058 Location const &loc = iter->second;
7c673cae
FG
1059 if (!path.empty()) {
1060 path = loc.second + "/" + path;
1061 } else {
1062 path = loc.second + path;
1063 }
11fdf7f2 1064 iter = ino_locations.find(loc.first);
7c673cae
FG
1065 }
1066
1067 paths.push_back(path);
1068 }
1069}
1070
1071
1072void EMetaBlob::dump(Formatter *f) const
1073{
1074 f->open_array_section("lumps");
11fdf7f2 1075 for (const auto& d : lump_order) {
7c673cae
FG
1076 f->open_object_section("lump");
1077 f->open_object_section("dirfrag");
11fdf7f2 1078 f->dump_stream("dirfrag") << d;
7c673cae
FG
1079 f->close_section(); // dirfrag
1080 f->open_object_section("dirlump");
11fdf7f2 1081 lump_map.at(d).dump(f);
7c673cae
FG
1082 f->close_section(); // dirlump
1083 f->close_section(); // lump
1084 }
1085 f->close_section(); // lumps
1086
1087 f->open_array_section("roots");
11fdf7f2 1088 for (const auto& iter : roots) {
7c673cae 1089 f->open_object_section("root");
11fdf7f2 1090 iter.dump(f);
7c673cae
FG
1091 f->close_section(); // root
1092 }
1093 f->close_section(); // roots
1094
1095 f->open_array_section("tableclient tranactions");
11fdf7f2 1096 for (const auto& p : table_tids) {
7c673cae 1097 f->open_object_section("transaction");
11fdf7f2
TL
1098 f->dump_int("tid", p.first);
1099 f->dump_int("version", p.second);
7c673cae
FG
1100 f->close_section(); // transaction
1101 }
1102 f->close_section(); // tableclient transactions
1103
1104 f->dump_int("renamed directory inodeno", renamed_dirino);
1105
1106 f->open_array_section("renamed directory fragments");
11fdf7f2
TL
1107 for (const auto& p : renamed_dir_frags) {
1108 f->dump_int("frag", p);
7c673cae
FG
1109 }
1110 f->close_section(); // renamed directory fragments
1111
1112 f->dump_int("inotable version", inotablev);
1113 f->dump_int("SessionMap version", sessionmapv);
1114 f->dump_int("allocated ino", allocated_ino);
1115
1116 f->dump_stream("preallocated inos") << preallocated_inos;
1117 f->dump_int("used preallocated ino", used_preallocated_ino);
1118
1119 f->open_object_section("client name");
1120 client_name.dump(f);
1121 f->close_section(); // client name
1122
1123 f->open_array_section("inodes starting a truncate");
11fdf7f2
TL
1124 for(const auto& ino : truncate_start) {
1125 f->dump_int("inodeno", ino);
7c673cae
FG
1126 }
1127 f->close_section(); // truncate inodes
1128 f->open_array_section("inodes finishing a truncated");
11fdf7f2 1129 for(const auto& p : truncate_finish) {
7c673cae 1130 f->open_object_section("inode+segment");
11fdf7f2
TL
1131 f->dump_int("inodeno", p.first);
1132 f->dump_int("truncate starting segment", p.second);
7c673cae
FG
1133 f->close_section(); // truncated inode
1134 }
1135 f->close_section(); // truncate finish inodes
1136
1137 f->open_array_section("destroyed inodes");
1138 for(vector<inodeno_t>::const_iterator i = destroyed_inodes.begin();
1139 i != destroyed_inodes.end(); ++i) {
1140 f->dump_int("inodeno", *i);
1141 }
1142 f->close_section(); // destroyed inodes
1143
1144 f->open_array_section("client requests");
11fdf7f2 1145 for(const auto& p : client_reqs) {
7c673cae 1146 f->open_object_section("Client request");
11fdf7f2
TL
1147 f->dump_stream("request ID") << p.first;
1148 f->dump_int("oldest request on client", p.second);
7c673cae
FG
1149 f->close_section(); // request
1150 }
1151 f->close_section(); // client requests
1152}
1153
9f95a23c 1154void EMetaBlob::generate_test_instances(std::list<EMetaBlob*>& ls)
7c673cae
FG
1155{
1156 ls.push_back(new EMetaBlob());
1157}
1158
f67539c2 1159void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDPeerUpdate *peerup)
7c673cae
FG
1160{
1161 dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
1162
11fdf7f2 1163 ceph_assert(logseg);
7c673cae 1164
11fdf7f2 1165 ceph_assert(g_conf()->mds_kill_journal_replay_at != 1);
7c673cae 1166
11fdf7f2 1167 for (auto& p : roots) {
f67539c2 1168 CInode *in = mds->mdcache->get_inode(p.inode->ino);
7c673cae
FG
1169 bool isnew = in ? false:true;
1170 if (!in)
11fdf7f2
TL
1171 in = new CInode(mds->mdcache, false, 2, CEPH_NOSNAP);
1172 p.update_inode(mds, in);
7c673cae
FG
1173
1174 if (isnew)
1175 mds->mdcache->add_inode(in);
11fdf7f2 1176 if (p.is_dirty()) in->_mark_dirty(logseg);
7c673cae
FG
1177 dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
1178 }
1179
1180 CInode *renamed_diri = 0;
1181 CDir *olddir = 0;
1182 if (renamed_dirino) {
1183 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1184 if (renamed_diri)
1185 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl;
1186 else
1187 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl;
1188
1189 int nnull = 0;
11fdf7f2
TL
1190 for (const auto& lp : lump_order) {
1191 dirlump &lump = lump_map[lp];
7c673cae 1192 if (lump.nnull) {
11fdf7f2 1193 dout(10) << "EMetaBlob.replay found null dentry in dir " << lp << dendl;
7c673cae
FG
1194 nnull += lump.nnull;
1195 }
1196 }
11fdf7f2 1197 ceph_assert(nnull <= 1);
7c673cae
FG
1198 }
1199
1200 // keep track of any inodes we unlink and don't relink elsewhere
1201 map<CInode*, CDir*> unlinked;
1202 set<CInode*> linked;
1203
1204 // walk through my dirs (in order!)
f6b5b4d7 1205 int count = 0;
11fdf7f2
TL
1206 for (const auto& lp : lump_order) {
1207 dout(10) << "EMetaBlob.replay dir " << lp << dendl;
1208 dirlump &lump = lump_map[lp];
7c673cae
FG
1209
1210 // the dir
11fdf7f2 1211 CDir *dir = mds->mdcache->get_force_dirfrag(lp, true);
7c673cae
FG
1212 if (!dir) {
1213 // hmm. do i have the inode?
11fdf7f2 1214 CInode *diri = mds->mdcache->get_inode((lp).ino);
7c673cae 1215 if (!diri) {
11fdf7f2
TL
1216 if (MDS_INO_IS_MDSDIR(lp.ino)) {
1217 ceph_assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp.ino);
1218 diri = mds->mdcache->create_system_inode(lp.ino, S_IFDIR|0755);
7c673cae
FG
1219 diri->state_clear(CInode::STATE_AUTH);
1220 dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
1221 } else {
11fdf7f2 1222 dout(0) << "EMetaBlob.replay missing dir ino " << lp.ino << dendl;
7c673cae
FG
1223 mds->clog->error() << "failure replaying journal (EMetaBlob)";
1224 mds->damaged();
1225 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1226 }
1227 }
1228
1229 // create the dirfrag
11fdf7f2 1230 dir = diri->get_or_open_dirfrag(mds->mdcache, lp.frag);
7c673cae 1231
11fdf7f2 1232 if (MDS_INO_IS_BASE(lp.ino))
7c673cae
FG
1233 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
1234
1235 dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;
1236 }
f67539c2
TL
1237 dir->reset_fnode(std::move(lump.fnode));
1238 dir->update_projected_version();
7c673cae
FG
1239
1240 if (lump.is_importing()) {
1241 dir->state_set(CDir::STATE_AUTH);
1242 dir->state_clear(CDir::STATE_COMPLETE);
1243 }
1244 if (lump.is_dirty()) {
1245 dir->_mark_dirty(logseg);
1246
f67539c2 1247 if (!(dir->get_fnode()->rstat == dir->get_fnode()->accounted_rstat)) {
7c673cae
FG
1248 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir << dendl;
1249 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
1250 logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
1251 } else {
1252 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir << dendl;
1253 }
f67539c2 1254 if (!(dir->get_fnode()->fragstat == dir->get_fnode()->accounted_fragstat)) {
7c673cae
FG
1255 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir << dendl;
1256 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
1257 logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
1258 } else {
1259 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir << dendl;
1260 }
1261 }
1262 if (lump.is_dirty_dft()) {
1263 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir << dendl;
1264 dir->state_set(CDir::STATE_DIRTYDFT);
1265 mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock);
1266 logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree);
1267 }
1268 if (lump.is_new())
1269 dir->mark_new(logseg);
1270 if (lump.is_complete())
1271 dir->mark_complete();
1272
1273 dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;
1274
1275 // decode bits
1276 lump._decode_bits();
1277
1278 // full dentry+inode pairs
11fdf7f2
TL
1279 for (auto& fb : lump._get_dfull()) {
1280 CDentry *dn = dir->lookup_exact_snap(fb.dn, fb.dnlast);
7c673cae 1281 if (!dn) {
11fdf7f2
TL
1282 dn = dir->add_null_dentry(fb.dn, fb.dnfirst, fb.dnlast);
1283 dn->set_version(fb.dnv);
1284 if (fb.is_dirty()) dn->_mark_dirty(logseg);
7c673cae
FG
1285 dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl;
1286 } else {
11fdf7f2
TL
1287 dn->set_version(fb.dnv);
1288 if (fb.is_dirty()) dn->_mark_dirty(logseg);
1289 dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *dn << dendl;
1290 dn->first = fb.dnfirst;
1291 ceph_assert(dn->last == fb.dnlast);
7c673cae
FG
1292 }
1293 if (lump.is_importing())
1294 dn->state_set(CDentry::STATE_AUTH);
1295
f67539c2 1296 CInode *in = mds->mdcache->get_inode(fb.inode->ino, fb.dnlast);
7c673cae 1297 if (!in) {
11fdf7f2
TL
1298 in = new CInode(mds->mdcache, dn->is_auth(), fb.dnfirst, fb.dnlast);
1299 fb.update_inode(mds, in);
7c673cae
FG
1300 mds->mdcache->add_inode(in);
1301 if (!dn->get_linkage()->is_null()) {
1302 if (dn->get_linkage()->is_primary()) {
1303 unlinked[dn->get_linkage()->get_inode()] = dir;
f67539c2
TL
1304 CachedStackStringStream css;
1305 *css << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1306 << " " << *dn->get_linkage()->get_inode() << " should be " << in->ino();
1307 dout(0) << css->strv() << dendl;
1308 mds->clog->warn() << css->strv();
7c673cae 1309 }
31f18b77 1310 dir->unlink_inode(dn, false);
7c673cae
FG
1311 }
1312 if (unlinked.count(in))
1313 linked.insert(in);
1314 dir->link_primary_inode(dn, in);
1315 dout(10) << "EMetaBlob.replay added " << *in << dendl;
1316 } else {
11fdf7f2
TL
1317 in->first = fb.dnfirst;
1318 fb.update_inode(mds, in);
7c673cae
FG
1319 if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
1320 dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
1321 unlinked[in] = in->get_parent_dir();
7c673cae 1322 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
7c673cae
FG
1323 }
1324 if (dn->get_linkage()->get_inode() != in) {
1325 if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1326 if (dn->get_linkage()->is_primary()) {
1327 unlinked[dn->get_linkage()->get_inode()] = dir;
f67539c2
TL
1328 CachedStackStringStream css;
1329 *css << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1330 << " " << *dn->get_linkage()->get_inode() << " should be " << in->ino();
1331 dout(0) << css->strv() << dendl;
1332 mds->clog->warn() << css->strv();
7c673cae 1333 }
31f18b77 1334 dir->unlink_inode(dn, false);
7c673cae
FG
1335 }
1336 if (unlinked.count(in))
1337 linked.insert(in);
1338 dir->link_primary_inode(dn, in);
1339 dout(10) << "EMetaBlob.replay linked " << *in << dendl;
1340 } else {
11fdf7f2 1341 dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *in << dendl;
7c673cae 1342 }
11fdf7f2
TL
1343 ceph_assert(in->first == fb.dnfirst ||
1344 (in->is_multiversion() && in->first > fb.dnfirst));
7c673cae 1345 }
11fdf7f2 1346 if (fb.is_dirty())
7c673cae 1347 in->_mark_dirty(logseg);
11fdf7f2
TL
1348 if (fb.is_dirty_parent())
1349 in->mark_dirty_parent(logseg, fb.is_dirty_pool());
1350 if (fb.need_snapflush())
7c673cae
FG
1351 logseg->open_files.push_back(&in->item_open_file);
1352 if (dn->is_auth())
1353 in->state_set(CInode::STATE_AUTH);
1354 else
1355 in->state_clear(CInode::STATE_AUTH);
11fdf7f2 1356 ceph_assert(g_conf()->mds_kill_journal_replay_at != 2);
f6b5b4d7
TL
1357
1358 if (!(++count % 1000))
1359 mds->heartbeat_reset();
7c673cae
FG
1360 }
1361
1362 // remote dentries
11fdf7f2
TL
1363 for (const auto& rb : lump.get_dremote()) {
1364 CDentry *dn = dir->lookup_exact_snap(rb.dn, rb.dnlast);
7c673cae 1365 if (!dn) {
f67539c2 1366 dn = dir->add_remote_dentry(rb.dn, rb.ino, rb.d_type, mempool::mds_co::string(rb.alternate_name), rb.dnfirst, rb.dnlast);
11fdf7f2
TL
1367 dn->set_version(rb.dnv);
1368 if (rb.dirty) dn->_mark_dirty(logseg);
7c673cae
FG
1369 dout(10) << "EMetaBlob.replay added " << *dn << dendl;
1370 } else {
1371 if (!dn->get_linkage()->is_null()) {
1372 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1373 if (dn->get_linkage()->is_primary()) {
1374 unlinked[dn->get_linkage()->get_inode()] = dir;
f67539c2
TL
1375 CachedStackStringStream css;
1376 *css << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
11fdf7f2 1377 << " " << *dn->get_linkage()->get_inode() << " should be remote " << rb.ino;
f67539c2 1378 dout(0) << css->strv() << dendl;
7c673cae 1379 }
31f18b77 1380 dir->unlink_inode(dn, false);
7c673cae 1381 }
f67539c2 1382 dn->set_alternate_name(mempool::mds_co::string(rb.alternate_name));
11fdf7f2
TL
1383 dir->link_remote_inode(dn, rb.ino, rb.d_type);
1384 dn->set_version(rb.dnv);
1385 if (rb.dirty) dn->_mark_dirty(logseg);
1386 dout(10) << "EMetaBlob.replay for [" << rb.dnfirst << "," << rb.dnlast << "] had " << *dn << dendl;
1387 dn->first = rb.dnfirst;
1388 ceph_assert(dn->last == rb.dnlast);
7c673cae
FG
1389 }
1390 if (lump.is_importing())
1391 dn->state_set(CDentry::STATE_AUTH);
f6b5b4d7
TL
1392
1393 if (!(++count % 1000))
1394 mds->heartbeat_reset();
7c673cae
FG
1395 }
1396
1397 // null dentries
11fdf7f2
TL
1398 for (const auto& nb : lump.get_dnull()) {
1399 CDentry *dn = dir->lookup_exact_snap(nb.dn, nb.dnlast);
7c673cae 1400 if (!dn) {
11fdf7f2
TL
1401 dn = dir->add_null_dentry(nb.dn, nb.dnfirst, nb.dnlast);
1402 dn->set_version(nb.dnv);
1403 if (nb.dirty) dn->_mark_dirty(logseg);
7c673cae
FG
1404 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl;
1405 } else {
11fdf7f2 1406 dn->first = nb.dnfirst;
7c673cae
FG
1407 if (!dn->get_linkage()->is_null()) {
1408 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1409 CInode *in = dn->get_linkage()->get_inode();
1410 // For renamed inode, We may call CInode::force_dirfrag() later.
1411 // CInode::force_dirfrag() doesn't work well when inode is detached
1412 // from the hierarchy.
1413 if (!renamed_diri || renamed_diri != in) {
1414 if (dn->get_linkage()->is_primary())
1415 unlinked[in] = dir;
1416 dir->unlink_inode(dn);
7c673cae
FG
1417 }
1418 }
11fdf7f2
TL
1419 dn->set_version(nb.dnv);
1420 if (nb.dirty) dn->_mark_dirty(logseg);
7c673cae 1421 dout(10) << "EMetaBlob.replay had " << *dn << dendl;
11fdf7f2 1422 ceph_assert(dn->last == nb.dnlast);
7c673cae
FG
1423 }
1424 olddir = dir;
1425 if (lump.is_importing())
1426 dn->state_set(CDentry::STATE_AUTH);
1427
1428 // Make null dentries the first things we trim
1429 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl;
f6b5b4d7
TL
1430
1431 if (!(++count % 1000))
1432 mds->heartbeat_reset();
7c673cae
FG
1433 }
1434 }
1435
11fdf7f2 1436 ceph_assert(g_conf()->mds_kill_journal_replay_at != 3);
7c673cae
FG
1437
1438 if (renamed_dirino) {
1439 if (renamed_diri) {
11fdf7f2
TL
1440 ceph_assert(unlinked.count(renamed_diri));
1441 ceph_assert(linked.count(renamed_diri));
7c673cae
FG
1442 olddir = unlinked[renamed_diri];
1443 } else {
1444 // we imported a diri we haven't seen before
1445 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
11fdf7f2 1446 ceph_assert(renamed_diri); // it was in the metablob
7c673cae
FG
1447 }
1448
1449 if (olddir) {
1450 if (olddir->authority() != CDIR_AUTH_UNDEF &&
1451 renamed_diri->authority() == CDIR_AUTH_UNDEF) {
f67539c2 1452 ceph_assert(peerup); // auth to non-auth, must be peer prepare
11fdf7f2 1453 frag_vec_t leaves;
7c673cae 1454 renamed_diri->dirfragtree.get_leaves(leaves);
11fdf7f2
TL
1455 for (const auto& leaf : leaves) {
1456 CDir *dir = renamed_diri->get_dirfrag(leaf);
1457 ceph_assert(dir);
7c673cae 1458 if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
f67539c2
TL
1459 // preserve subtree bound until peer commit
1460 peerup->olddirs.insert(dir->inode);
7c673cae
FG
1461 else
1462 dir->state_set(CDir::STATE_AUTH);
f6b5b4d7
TL
1463
1464 if (!(++count % 1000))
1465 mds->heartbeat_reset();
7c673cae
FG
1466 }
1467 }
1468
1469 mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
1470
1471 // see if we can discard the subtree we renamed out of
1472 CDir *root = mds->mdcache->get_subtree_root(olddir);
1473 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
f67539c2
TL
1474 if (peerup) // preserve the old dir until peer commit
1475 peerup->olddirs.insert(olddir->inode);
7c673cae
FG
1476 else
1477 mds->mdcache->try_trim_non_auth_subtree(root);
1478 }
1479 }
1480
1481 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1482 if (renamed_diri->authority() != CDIR_AUTH_UNDEF) {
11fdf7f2
TL
1483 for (const auto& p : renamed_dir_frags) {
1484 CDir *dir = renamed_diri->get_dirfrag(p);
7c673cae
FG
1485 if (dir) {
1486 // we already had the inode before, and we already adjusted this subtree accordingly.
1487 dout(10) << " already had+adjusted rename import bound " << *dir << dendl;
11fdf7f2 1488 ceph_assert(olddir);
7c673cae
FG
1489 continue;
1490 }
11fdf7f2 1491 dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, p);
7c673cae
FG
1492 dout(10) << " creating new rename import bound " << *dir << dendl;
1493 dir->state_clear(CDir::STATE_AUTH);
224ce89b 1494 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
f6b5b4d7
TL
1495
1496 if (!(++count % 1000))
1497 mds->heartbeat_reset();
7c673cae
FG
1498 }
1499 }
1500
1501 // rename may overwrite an empty directory and move it into stray dir.
1502 unlinked.erase(renamed_diri);
1503 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1504 if (!linked.count(p->first))
1505 continue;
11fdf7f2 1506 ceph_assert(p->first->is_dir());
7c673cae 1507 mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);
f6b5b4d7
TL
1508
1509 if (!(++count % 1000))
1510 mds->heartbeat_reset();
7c673cae
FG
1511 }
1512 }
1513
1514 if (!unlinked.empty()) {
1515 for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p)
1516 unlinked.erase(*p);
1517 dout(10) << " unlinked set contains " << unlinked << dendl;
1518 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
11fdf7f2 1519 CInode *in = p->first;
f67539c2
TL
1520 if (peerup) { // preserve unlinked inodes until peer commit
1521 peerup->unlinked.insert(in);
11fdf7f2
TL
1522 if (in->snaprealm)
1523 in->snaprealm->adjust_parent();
1524 } else
1525 mds->mdcache->remove_inode_recursive(in);
f6b5b4d7
TL
1526
1527 if (!(++count % 1000))
1528 mds->heartbeat_reset();
7c673cae
FG
1529 }
1530 }
1531
1532 // table client transactions
11fdf7f2
TL
1533 for (const auto& p : table_tids) {
1534 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p.first)
1535 << " transaction " << p.second << dendl;
1536 MDSTableClient *client = mds->get_table_client(p.first);
7c673cae 1537 if (client)
11fdf7f2 1538 client->got_journaled_agree(p.second, logseg);
f6b5b4d7
TL
1539
1540 if (!(++count % 1000))
1541 mds->heartbeat_reset();
7c673cae
FG
1542 }
1543
1544 // opened ino?
1545 if (opened_ino) {
1546 CInode *in = mds->mdcache->get_inode(opened_ino);
11fdf7f2 1547 ceph_assert(in);
7c673cae
FG
1548 dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl;
1549 logseg->open_files.push_back(&in->item_open_file);
1550 }
1551
1552 // allocated_inos
1553 if (inotablev) {
1554 if (mds->inotable->get_version() >= inotablev) {
1555 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1556 << " <= table " << mds->inotable->get_version() << dendl;
1557 } else {
1558 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1559 << " - 1 == table " << mds->inotable->get_version()
1560 << " allocated+used " << allocated_ino
1561 << " prealloc " << preallocated_inos
1562 << dendl;
1563 if (allocated_ino)
1564 mds->inotable->replay_alloc_id(allocated_ino);
1565 if (preallocated_inos.size())
1566 mds->inotable->replay_alloc_ids(preallocated_inos);
1567
1568 // [repair bad inotable updates]
1569 if (inotablev > mds->inotable->get_version()) {
1570 mds->clog->error() << "journal replay inotablev mismatch "
1571 << mds->inotable->get_version() << " -> " << inotablev;
1572 mds->inotable->force_replay_version(inotablev);
1573 }
1574
11fdf7f2 1575 ceph_assert(inotablev == mds->inotable->get_version());
7c673cae
FG
1576 }
1577 }
1578 if (sessionmapv) {
81eedcae 1579 unsigned diff = (used_preallocated_ino && !preallocated_inos.empty()) ? 2 : 1;
7c673cae
FG
1580 if (mds->sessionmap.get_version() >= sessionmapv) {
1581 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1582 << " <= table " << mds->sessionmap.get_version() << dendl;
81eedcae 1583 } else if (mds->sessionmap.get_version() + diff == sessionmapv) {
7c673cae 1584 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
81eedcae 1585 << " - " << diff << " == table " << mds->sessionmap.get_version()
7c673cae
FG
1586 << " prealloc " << preallocated_inos
1587 << " used " << used_preallocated_ino
1588 << dendl;
1589 Session *session = mds->sessionmap.get_session(client_name);
1590 if (session) {
1591 dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl;
1592 if (used_preallocated_ino) {
1593 if (!session->info.prealloc_inos.empty()) {
f67539c2
TL
1594 inodeno_t ino = session->take_ino(used_preallocated_ino);
1595 session->info.prealloc_inos.erase(ino);
1596 ceph_assert(ino == used_preallocated_ino);
7c673cae
FG
1597 }
1598 mds->sessionmap.replay_dirty_session(session);
1599 }
1600 if (!preallocated_inos.empty()) {
f67539c2 1601 session->free_prealloc_inos.insert(preallocated_inos);
7c673cae
FG
1602 session->info.prealloc_inos.insert(preallocated_inos);
1603 mds->sessionmap.replay_dirty_session(session);
1604 }
1605
1606 } else {
1607 dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
81eedcae 1608 if (used_preallocated_ino)
7c673cae 1609 mds->sessionmap.replay_advance_version();
81eedcae 1610
7c673cae
FG
1611 if (!preallocated_inos.empty())
1612 mds->sessionmap.replay_advance_version();
1613 }
11fdf7f2 1614 ceph_assert(sessionmapv == mds->sessionmap.get_version());
7c673cae 1615 } else {
81eedcae
TL
1616 mds->clog->error() << "EMetaBlob.replay sessionmap v " << sessionmapv
1617 << " - " << diff << " > table " << mds->sessionmap.get_version();
11fdf7f2 1618 ceph_assert(g_conf()->mds_wipe_sessions);
7c673cae
FG
1619 mds->sessionmap.wipe();
1620 mds->sessionmap.set_version(sessionmapv);
1621 }
1622 }
1623
1624 // truncating inodes
11fdf7f2
TL
1625 for (const auto& ino : truncate_start) {
1626 CInode *in = mds->mdcache->get_inode(ino);
1627 ceph_assert(in);
7c673cae 1628 mds->mdcache->add_recovered_truncate(in, logseg);
f6b5b4d7
TL
1629
1630 if (!(++count % 1000))
1631 mds->heartbeat_reset();
7c673cae 1632 }
11fdf7f2
TL
1633 for (const auto& p : truncate_finish) {
1634 LogSegment *ls = mds->mdlog->get_segment(p.second);
7c673cae 1635 if (ls) {
11fdf7f2
TL
1636 CInode *in = mds->mdcache->get_inode(p.first);
1637 ceph_assert(in);
7c673cae
FG
1638 mds->mdcache->remove_recovered_truncate(in, ls);
1639 }
f6b5b4d7
TL
1640
1641 if (!(++count % 1000))
1642 mds->heartbeat_reset();
7c673cae
FG
1643 }
1644
1645 // destroyed inodes
11fdf7f2
TL
1646 if (!destroyed_inodes.empty()) {
1647 for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
1648 p != destroyed_inodes.end();
1649 ++p) {
1650 CInode *in = mds->mdcache->get_inode(*p);
1651 if (in) {
1652 dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
1653 CDentry *parent = in->get_parent_dn();
1654 mds->mdcache->remove_inode(in);
1655 if (parent) {
1656 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
1657 ceph_assert(parent->get_linkage()->is_null());
1658 }
1659 } else {
1660 dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
7c673cae 1661 }
f6b5b4d7
TL
1662
1663 if (!(++count % 1000))
1664 mds->heartbeat_reset();
7c673cae 1665 }
11fdf7f2 1666 mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes);
7c673cae
FG
1667 }
1668
1669 // client requests
11fdf7f2
TL
1670 for (const auto& p : client_reqs) {
1671 if (p.first.name.is_client()) {
1672 dout(10) << "EMetaBlob.replay request " << p.first << " trim_to " << p.second << dendl;
7c673cae
FG
1673 inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino;
1674 // if we allocated an inode, there should be exactly one client request id.
11fdf7f2 1675 ceph_assert(created == inodeno_t() || client_reqs.size() == 1);
7c673cae 1676
11fdf7f2 1677 Session *session = mds->sessionmap.get_session(p.first.name);
7c673cae 1678 if (session) {
11fdf7f2
TL
1679 session->add_completed_request(p.first.tid, created);
1680 if (p.second)
1681 session->trim_completed_requests(p.second);
7c673cae
FG
1682 }
1683 }
f6b5b4d7
TL
1684
1685 if (!(++count % 1000))
1686 mds->heartbeat_reset();
7c673cae
FG
1687 }
1688
1689 // client flushes
11fdf7f2
TL
1690 for (const auto& p : client_flushes) {
1691 if (p.first.name.is_client()) {
1692 dout(10) << "EMetaBlob.replay flush " << p.first << " trim_to " << p.second << dendl;
1693 Session *session = mds->sessionmap.get_session(p.first.name);
7c673cae 1694 if (session) {
11fdf7f2
TL
1695 session->add_completed_flush(p.first.tid);
1696 if (p.second)
1697 session->trim_completed_flushes(p.second);
7c673cae
FG
1698 }
1699 }
f6b5b4d7
TL
1700
1701 if (!(++count % 1000))
1702 mds->heartbeat_reset();
7c673cae
FG
1703 }
1704
1705 // update segment
1706 update_segment(logseg);
1707
11fdf7f2 1708 ceph_assert(g_conf()->mds_kill_journal_replay_at != 4);
7c673cae
FG
1709}
1710
9f95a23c
TL
1711// -----------------------
1712// EPurged
1713void EPurged::update_segment()
1714{
1715 if (inos.size() && inotablev)
1716 get_segment()->inotablev = inotablev;
1717 return;
1718}
1719
1720void EPurged::replay(MDSRank *mds)
1721{
1722 if (inos.size()) {
1723 LogSegment *ls = mds->mdlog->get_segment(seq);
f67539c2
TL
1724 if (ls)
1725 ls->purging_inodes.subtract(inos);
1726
9f95a23c
TL
1727 if (mds->inotable->get_version() >= inotablev) {
1728 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1729 << " >= " << inotablev << ", noop" << dendl;
1730 } else {
1731 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1732 << " < " << inotablev << " " << dendl;
1733 mds->inotable->replay_release_ids(inos);
1734 assert(mds->inotable->get_version() == inotablev);
1735 }
1736 }
1737 update_segment();
1738}
1739
1740void EPurged::encode(bufferlist& bl, uint64_t features) const
1741{
1742 ENCODE_START(1, 1, bl);
1743 encode(inos, bl);
1744 encode(inotablev, bl);
1745 encode(seq, bl);
1746 ENCODE_FINISH(bl);
1747}
1748
1749void EPurged::decode(bufferlist::const_iterator& bl)
1750{
1751 DECODE_START(1, bl);
1752 decode(inos, bl);
1753 decode(inotablev, bl);
1754 decode(seq, bl);
1755 DECODE_FINISH(bl);
1756}
1757
1758void EPurged::dump(Formatter *f) const
1759{
1760 f->dump_stream("inos") << inos;
1761 f->dump_int("inotable version", inotablev);
1762 f->dump_int("segment seq", seq);
1763}
1764
7c673cae
FG
1765// -----------------------
1766// ESession
1767
1768void ESession::update_segment()
1769{
11fdf7f2 1770 get_segment()->sessionmapv = cmapv;
f67539c2 1771 if (inos_to_free.size() && inotablev)
11fdf7f2 1772 get_segment()->inotablev = inotablev;
7c673cae
FG
1773}
1774
1775void ESession::replay(MDSRank *mds)
1776{
f67539c2
TL
1777 if (inos_to_purge.size())
1778 get_segment()->purging_inodes.insert(inos_to_purge);
9f95a23c 1779
7c673cae
FG
1780 if (mds->sessionmap.get_version() >= cmapv) {
1781 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1782 << " >= " << cmapv << ", noop" << dendl;
81eedcae 1783 } else if (mds->sessionmap.get_version() + 1 == cmapv) {
7c673cae
FG
1784 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1785 << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
1786 Session *session;
1787 if (open) {
1788 session = mds->sessionmap.get_or_add_session(client_inst);
1789 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1790 session->set_client_metadata(client_metadata);
1791 dout(10) << " opened session " << session->info.inst << dendl;
1792 } else {
1793 session = mds->sessionmap.get_session(client_inst.name);
1794 if (session) { // there always should be a session, but there's a bug
11fdf7f2 1795 if (session->get_connection() == NULL) {
7c673cae
FG
1796 dout(10) << " removed session " << session->info.inst << dendl;
1797 mds->sessionmap.remove_session(session);
1798 session = NULL;
1799 } else {
1800 session->clear(); // the client has reconnected; keep the Session, but reset
1801 dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
1802 }
1803 } else {
1804 mds->clog->error() << "replayed stray Session close event for " << client_inst
1805 << " from time " << stamp << ", ignoring";
1806 }
1807 }
1808 if (session) {
1809 mds->sessionmap.replay_dirty_session(session);
1810 } else {
1811 mds->sessionmap.replay_advance_version();
1812 }
11fdf7f2 1813 ceph_assert(mds->sessionmap.get_version() == cmapv);
81eedcae
TL
1814 } else {
1815 mds->clog->error() << "ESession.replay sessionmap v " << cmapv
1816 << " - 1 > table " << mds->sessionmap.get_version();
1817 ceph_assert(g_conf()->mds_wipe_sessions);
1818 mds->sessionmap.wipe();
1819 mds->sessionmap.set_version(cmapv);
7c673cae
FG
1820 }
1821
f67539c2 1822 if (inos_to_free.size() && inotablev) {
7c673cae
FG
1823 if (mds->inotable->get_version() >= inotablev) {
1824 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1825 << " >= " << inotablev << ", noop" << dendl;
1826 } else {
1827 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1828 << " < " << inotablev << " " << (open ? "add":"remove") << dendl;
11fdf7f2 1829 ceph_assert(!open); // for now
f67539c2 1830 mds->inotable->replay_release_ids(inos_to_free);
11fdf7f2 1831 ceph_assert(mds->inotable->get_version() == inotablev);
7c673cae
FG
1832 }
1833 }
1834
1835 update_segment();
1836}
1837
1838void ESession::encode(bufferlist &bl, uint64_t features) const
1839{
9f95a23c 1840 ENCODE_START(6, 5, bl);
11fdf7f2
TL
1841 encode(stamp, bl);
1842 encode(client_inst, bl, features);
1843 encode(open, bl);
1844 encode(cmapv, bl);
f67539c2 1845 encode(inos_to_free, bl);
11fdf7f2
TL
1846 encode(inotablev, bl);
1847 encode(client_metadata, bl);
f67539c2 1848 encode(inos_to_purge, bl);
7c673cae
FG
1849 ENCODE_FINISH(bl);
1850}
1851
11fdf7f2 1852void ESession::decode(bufferlist::const_iterator &bl)
7c673cae 1853{
9f95a23c 1854 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
7c673cae 1855 if (struct_v >= 2)
11fdf7f2
TL
1856 decode(stamp, bl);
1857 decode(client_inst, bl);
1858 decode(open, bl);
1859 decode(cmapv, bl);
f67539c2 1860 decode(inos_to_free, bl);
11fdf7f2
TL
1861 decode(inotablev, bl);
1862 if (struct_v == 4) {
1863 decode(client_metadata.kv_map, bl);
1864 } else if (struct_v >= 5) {
1865 decode(client_metadata, bl);
7c673cae 1866 }
9f95a23c 1867 if (struct_v >= 6){
f67539c2 1868 decode(inos_to_purge, bl);
9f95a23c
TL
1869 }
1870
7c673cae
FG
1871 DECODE_FINISH(bl);
1872}
1873
1874void ESession::dump(Formatter *f) const
1875{
1876 f->dump_stream("client instance") << client_inst;
1877 f->dump_string("open", open ? "true" : "false");
1878 f->dump_int("client map version", cmapv);
f67539c2 1879 f->dump_stream("inos_to_free") << inos_to_free;
7c673cae
FG
1880 f->dump_int("inotable version", inotablev);
1881 f->open_object_section("client_metadata");
f67539c2 1882 f->dump_stream("inos_to_purge") << inos_to_purge;
11fdf7f2 1883 client_metadata.dump(f);
7c673cae
FG
1884 f->close_section(); // client_metadata
1885}
1886
9f95a23c 1887void ESession::generate_test_instances(std::list<ESession*>& ls)
7c673cae
FG
1888{
1889 ls.push_back(new ESession);
1890}
1891
1892// -----------------------
1893// ESessions
1894
1895void ESessions::encode(bufferlist &bl, uint64_t features) const
1896{
11fdf7f2
TL
1897 ENCODE_START(2, 1, bl);
1898 encode(client_map, bl, features);
1899 encode(cmapv, bl);
1900 encode(stamp, bl);
1901 encode(client_metadata_map, bl);
7c673cae
FG
1902 ENCODE_FINISH(bl);
1903}
1904
11fdf7f2 1905void ESessions::decode_old(bufferlist::const_iterator &bl)
7c673cae 1906{
11fdf7f2
TL
1907 using ceph::decode;
1908 decode(client_map, bl);
1909 decode(cmapv, bl);
7c673cae 1910 if (!bl.end())
11fdf7f2 1911 decode(stamp, bl);
7c673cae
FG
1912}
1913
11fdf7f2 1914void ESessions::decode_new(bufferlist::const_iterator &bl)
7c673cae 1915{
11fdf7f2
TL
1916 DECODE_START(2, bl);
1917 decode(client_map, bl);
1918 decode(cmapv, bl);
1919 decode(stamp, bl);
1920 if (struct_v >= 2)
1921 decode(client_metadata_map, bl);
7c673cae
FG
1922 DECODE_FINISH(bl);
1923}
1924
1925void ESessions::dump(Formatter *f) const
1926{
1927 f->dump_int("client map version", cmapv);
1928
1929 f->open_array_section("client map");
1930 for (map<client_t,entity_inst_t>::const_iterator i = client_map.begin();
1931 i != client_map.end(); ++i) {
1932 f->open_object_section("client");
1933 f->dump_int("client id", i->first.v);
1934 f->dump_stream("client entity") << i->second;
1935 f->close_section(); // client
1936 }
1937 f->close_section(); // client map
1938}
1939
9f95a23c 1940void ESessions::generate_test_instances(std::list<ESessions*>& ls)
7c673cae
FG
1941{
1942 ls.push_back(new ESessions());
1943}
1944
1945void ESessions::update_segment()
1946{
11fdf7f2 1947 get_segment()->sessionmapv = cmapv;
7c673cae
FG
1948}
1949
1950void ESessions::replay(MDSRank *mds)
1951{
1952 if (mds->sessionmap.get_version() >= cmapv) {
1953 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1954 << " >= " << cmapv << ", noop" << dendl;
1955 } else {
1956 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1957 << " < " << cmapv << dendl;
81eedcae 1958 mds->sessionmap.replay_open_sessions(cmapv, client_map, client_metadata_map);
7c673cae
FG
1959 }
1960 update_segment();
1961}
1962
1963
1964// -----------------------
1965// ETableServer
1966
1967void ETableServer::encode(bufferlist& bl, uint64_t features) const
1968{
1969 ENCODE_START(3, 3, bl);
11fdf7f2
TL
1970 encode(stamp, bl);
1971 encode(table, bl);
1972 encode(op, bl);
1973 encode(reqid, bl);
1974 encode(bymds, bl);
1975 encode(mutation, bl);
1976 encode(tid, bl);
1977 encode(version, bl);
7c673cae
FG
1978 ENCODE_FINISH(bl);
1979}
1980
11fdf7f2 1981void ETableServer::decode(bufferlist::const_iterator &bl)
7c673cae
FG
1982{
1983 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
1984 if (struct_v >= 2)
11fdf7f2
TL
1985 decode(stamp, bl);
1986 decode(table, bl);
1987 decode(op, bl);
1988 decode(reqid, bl);
1989 decode(bymds, bl);
1990 decode(mutation, bl);
1991 decode(tid, bl);
1992 decode(version, bl);
7c673cae
FG
1993 DECODE_FINISH(bl);
1994}
1995
1996void ETableServer::dump(Formatter *f) const
1997{
1998 f->dump_int("table id", table);
1999 f->dump_int("op", op);
2000 f->dump_int("request id", reqid);
2001 f->dump_int("by mds", bymds);
2002 f->dump_int("tid", tid);
2003 f->dump_int("version", version);
2004}
2005
9f95a23c 2006void ETableServer::generate_test_instances(std::list<ETableServer*>& ls)
7c673cae
FG
2007{
2008 ls.push_back(new ETableServer());
2009}
2010
2011
2012void ETableServer::update_segment()
2013{
11fdf7f2 2014 get_segment()->tablev[table] = version;
7c673cae
FG
2015}
2016
2017void ETableServer::replay(MDSRank *mds)
2018{
2019 MDSTableServer *server = mds->get_table_server(table);
2020 if (!server)
2021 return;
2022
2023 if (server->get_version() >= version) {
2024 dout(10) << "ETableServer.replay " << get_mdstable_name(table)
2025 << " " << get_mdstableserver_opname(op)
2026 << " event " << version
2027 << " <= table " << server->get_version() << dendl;
2028 return;
2029 }
2030
2031 dout(10) << " ETableServer.replay " << get_mdstable_name(table)
2032 << " " << get_mdstableserver_opname(op)
2033 << " event " << version << " - 1 == table " << server->get_version() << dendl;
11fdf7f2 2034 ceph_assert(version-1 == server->get_version());
7c673cae
FG
2035
2036 switch (op) {
11fdf7f2
TL
2037 case TABLESERVER_OP_PREPARE: {
2038 server->_note_prepare(bymds, reqid, true);
2039 bufferlist out;
2040 server->_prepare(mutation, reqid, bymds, out);
2041 mutation = std::move(out);
7c673cae 2042 break;
11fdf7f2 2043 }
7c673cae 2044 case TABLESERVER_OP_COMMIT:
9f95a23c 2045 server->_commit(tid, ref_t<MMDSTableRequest>());
11fdf7f2 2046 server->_note_commit(tid, true);
7c673cae
FG
2047 break;
2048 case TABLESERVER_OP_ROLLBACK:
2049 server->_rollback(tid);
11fdf7f2 2050 server->_note_rollback(tid, true);
7c673cae
FG
2051 break;
2052 case TABLESERVER_OP_SERVER_UPDATE:
2053 server->_server_update(mutation);
11fdf7f2 2054 server->_note_server_update(mutation, true);
7c673cae
FG
2055 break;
2056 default:
2057 mds->clog->error() << "invalid tableserver op in ETableServer";
2058 mds->damaged();
2059 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2060 }
2061
11fdf7f2 2062 ceph_assert(version == server->get_version());
7c673cae
FG
2063 update_segment();
2064}
2065
2066
2067// ---------------------
2068// ETableClient
2069
2070void ETableClient::encode(bufferlist& bl, uint64_t features) const
2071{
2072 ENCODE_START(3, 3, bl);
11fdf7f2
TL
2073 encode(stamp, bl);
2074 encode(table, bl);
2075 encode(op, bl);
2076 encode(tid, bl);
7c673cae
FG
2077 ENCODE_FINISH(bl);
2078}
2079
11fdf7f2 2080void ETableClient::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2081{
2082 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2083 if (struct_v >= 2)
11fdf7f2
TL
2084 decode(stamp, bl);
2085 decode(table, bl);
2086 decode(op, bl);
2087 decode(tid, bl);
7c673cae
FG
2088 DECODE_FINISH(bl);
2089}
2090
2091void ETableClient::dump(Formatter *f) const
2092{
2093 f->dump_int("table", table);
2094 f->dump_int("op", op);
2095 f->dump_int("tid", tid);
2096}
2097
9f95a23c 2098void ETableClient::generate_test_instances(std::list<ETableClient*>& ls)
7c673cae
FG
2099{
2100 ls.push_back(new ETableClient());
2101}
2102
2103void ETableClient::replay(MDSRank *mds)
2104{
2105 dout(10) << " ETableClient.replay " << get_mdstable_name(table)
2106 << " op " << get_mdstableserver_opname(op)
2107 << " tid " << tid << dendl;
2108
2109 MDSTableClient *client = mds->get_table_client(table);
2110 if (!client)
2111 return;
2112
11fdf7f2 2113 ceph_assert(op == TABLESERVER_OP_ACK);
7c673cae
FG
2114 client->got_journaled_ack(tid);
2115}
2116
2117
2118// -----------------------
2119// ESnap
2120/*
2121void ESnap::update_segment()
2122{
11fdf7f2 2123 get_segment()->tablev[TABLE_SNAP] = version;
7c673cae
FG
2124}
2125
2126void ESnap::replay(MDSRank *mds)
2127{
2128 if (mds->snaptable->get_version() >= version) {
2129 dout(10) << "ESnap.replay event " << version
2130 << " <= table " << mds->snaptable->get_version() << dendl;
2131 return;
2132 }
2133
2134 dout(10) << " ESnap.replay event " << version
2135 << " - 1 == table " << mds->snaptable->get_version() << dendl;
11fdf7f2 2136 ceph_assert(version-1 == mds->snaptable->get_version());
7c673cae
FG
2137
2138 if (create) {
2139 version_t v;
2140 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
11fdf7f2 2141 ceph_assert(s == snap.snapid);
7c673cae
FG
2142 } else {
2143 mds->snaptable->remove(snap.snapid);
2144 }
2145
11fdf7f2 2146 ceph_assert(version == mds->snaptable->get_version());
7c673cae
FG
2147}
2148*/
2149
2150
2151
2152// -----------------------
2153// EUpdate
2154
2155void EUpdate::encode(bufferlist &bl, uint64_t features) const
2156{
2157 ENCODE_START(4, 4, bl);
11fdf7f2
TL
2158 encode(stamp, bl);
2159 encode(type, bl);
2160 encode(metablob, bl, features);
2161 encode(client_map, bl);
2162 encode(cmapv, bl);
2163 encode(reqid, bl);
f67539c2 2164 encode(had_peers, bl);
7c673cae
FG
2165 ENCODE_FINISH(bl);
2166}
2167
11fdf7f2 2168void EUpdate::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2169{
2170 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
2171 if (struct_v >= 2)
11fdf7f2
TL
2172 decode(stamp, bl);
2173 decode(type, bl);
2174 decode(metablob, bl);
2175 decode(client_map, bl);
7c673cae 2176 if (struct_v >= 3)
11fdf7f2
TL
2177 decode(cmapv, bl);
2178 decode(reqid, bl);
f67539c2 2179 decode(had_peers, bl);
7c673cae
FG
2180 DECODE_FINISH(bl);
2181}
2182
2183void EUpdate::dump(Formatter *f) const
2184{
2185 f->open_object_section("metablob");
2186 metablob.dump(f);
2187 f->close_section(); // metablob
2188
2189 f->dump_string("type", type);
2190 f->dump_int("client map length", client_map.length());
2191 f->dump_int("client map version", cmapv);
2192 f->dump_stream("reqid") << reqid;
f67539c2 2193 f->dump_string("had peers", had_peers ? "true" : "false");
7c673cae
FG
2194}
2195
9f95a23c 2196void EUpdate::generate_test_instances(std::list<EUpdate*>& ls)
7c673cae
FG
2197{
2198 ls.push_back(new EUpdate());
2199}
2200
2201
2202void EUpdate::update_segment()
2203{
11fdf7f2
TL
2204 auto&& segment = get_segment();
2205 metablob.update_segment(segment);
7c673cae
FG
2206
2207 if (client_map.length())
11fdf7f2 2208 segment->sessionmapv = cmapv;
7c673cae 2209
f67539c2
TL
2210 if (had_peers)
2211 segment->uncommitted_leaders.insert(reqid);
7c673cae
FG
2212}
2213
2214void EUpdate::replay(MDSRank *mds)
2215{
11fdf7f2
TL
2216 auto&& segment = get_segment();
2217 metablob.replay(mds, segment);
7c673cae 2218
f67539c2
TL
2219 if (had_peers) {
2220 dout(10) << "EUpdate.replay " << reqid << " had peers, expecting a matching ECommitted" << dendl;
2221 segment->uncommitted_leaders.insert(reqid);
2222 set<mds_rank_t> peers;
2223 mds->mdcache->add_uncommitted_leader(reqid, segment, peers, true);
7c673cae
FG
2224 }
2225
2226 if (client_map.length()) {
2227 if (mds->sessionmap.get_version() >= cmapv) {
2228 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2229 << " <= table " << mds->sessionmap.get_version() << dendl;
2230 } else {
2231 dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version()
2232 << " < " << cmapv << dendl;
2233 // open client sessions?
2234 map<client_t,entity_inst_t> cm;
11fdf7f2
TL
2235 map<client_t,client_metadata_t> cmm;
2236 auto blp = client_map.cbegin();
2237 using ceph::decode;
2238 decode(cm, blp);
2239 if (!blp.end())
2240 decode(cmm, blp);
81eedcae 2241 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
7c673cae
FG
2242 }
2243 }
2244 update_segment();
2245}
2246
2247
2248// ------------------------
2249// EOpen
2250
2251void EOpen::encode(bufferlist &bl, uint64_t features) const {
2252 ENCODE_START(4, 3, bl);
11fdf7f2
TL
2253 encode(stamp, bl);
2254 encode(metablob, bl, features);
2255 encode(inos, bl);
2256 encode(snap_inos, bl);
7c673cae
FG
2257 ENCODE_FINISH(bl);
2258}
2259
11fdf7f2 2260void EOpen::decode(bufferlist::const_iterator &bl) {
7c673cae
FG
2261 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2262 if (struct_v >= 2)
11fdf7f2
TL
2263 decode(stamp, bl);
2264 decode(metablob, bl);
2265 decode(inos, bl);
7c673cae 2266 if (struct_v >= 4)
11fdf7f2 2267 decode(snap_inos, bl);
7c673cae
FG
2268 DECODE_FINISH(bl);
2269}
2270
2271void EOpen::dump(Formatter *f) const
2272{
2273 f->open_object_section("metablob");
2274 metablob.dump(f);
2275 f->close_section(); // metablob
2276 f->open_array_section("inos involved");
2277 for (vector<inodeno_t>::const_iterator i = inos.begin();
2278 i != inos.end(); ++i) {
2279 f->dump_int("ino", *i);
2280 }
2281 f->close_section(); // inos
2282}
2283
9f95a23c 2284void EOpen::generate_test_instances(std::list<EOpen*>& ls)
7c673cae
FG
2285{
2286 ls.push_back(new EOpen());
2287 ls.push_back(new EOpen());
2288 ls.back()->add_ino(0);
2289}
2290
2291void EOpen::update_segment()
2292{
2293 // ??
2294}
2295
2296void EOpen::replay(MDSRank *mds)
2297{
2298 dout(10) << "EOpen.replay " << dendl;
11fdf7f2
TL
2299 auto&& segment = get_segment();
2300 metablob.replay(mds, segment);
7c673cae
FG
2301
2302 // note which segments inodes belong to, so we don't have to start rejournaling them
2303 for (const auto &ino : inos) {
2304 CInode *in = mds->mdcache->get_inode(ino);
2305 if (!in) {
2306 dout(0) << "EOpen.replay ino " << ino << " not in metablob" << dendl;
11fdf7f2 2307 ceph_assert(in);
7c673cae 2308 }
11fdf7f2 2309 segment->open_files.push_back(&in->item_open_file);
7c673cae
FG
2310 }
2311 for (const auto &vino : snap_inos) {
2312 CInode *in = mds->mdcache->get_inode(vino);
2313 if (!in) {
2314 dout(0) << "EOpen.replay ino " << vino << " not in metablob" << dendl;
11fdf7f2 2315 ceph_assert(in);
7c673cae 2316 }
11fdf7f2 2317 segment->open_files.push_back(&in->item_open_file);
7c673cae
FG
2318 }
2319}
2320
2321
2322// -----------------------
2323// ECommitted
2324
2325void ECommitted::replay(MDSRank *mds)
2326{
f67539c2 2327 if (mds->mdcache->uncommitted_leaders.count(reqid)) {
7c673cae 2328 dout(10) << "ECommitted.replay " << reqid << dendl;
f67539c2
TL
2329 mds->mdcache->uncommitted_leaders[reqid].ls->uncommitted_leaders.erase(reqid);
2330 mds->mdcache->uncommitted_leaders.erase(reqid);
7c673cae
FG
2331 } else {
2332 dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl;
2333 }
2334}
2335
2336void ECommitted::encode(bufferlist& bl, uint64_t features) const
2337{
2338 ENCODE_START(3, 3, bl);
11fdf7f2
TL
2339 encode(stamp, bl);
2340 encode(reqid, bl);
7c673cae
FG
2341 ENCODE_FINISH(bl);
2342}
2343
11fdf7f2 2344void ECommitted::decode(bufferlist::const_iterator& bl)
7c673cae
FG
2345{
2346 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2347 if (struct_v >= 2)
11fdf7f2
TL
2348 decode(stamp, bl);
2349 decode(reqid, bl);
7c673cae
FG
2350 DECODE_FINISH(bl);
2351}
2352
2353void ECommitted::dump(Formatter *f) const {
2354 f->dump_stream("stamp") << stamp;
2355 f->dump_stream("reqid") << reqid;
2356}
2357
9f95a23c 2358void ECommitted::generate_test_instances(std::list<ECommitted*>& ls)
7c673cae
FG
2359{
2360 ls.push_back(new ECommitted);
2361 ls.push_back(new ECommitted);
2362 ls.back()->stamp = utime_t(1, 2);
2363 ls.back()->reqid = metareqid_t(entity_name_t::CLIENT(123), 456);
2364}
2365
2366// -----------------------
f67539c2 2367// EPeerUpdate
7c673cae
FG
2368
2369void link_rollback::encode(bufferlist &bl) const
2370{
11fdf7f2
TL
2371 ENCODE_START(3, 2, bl);
2372 encode(reqid, bl);
2373 encode(ino, bl);
2374 encode(was_inc, bl);
2375 encode(old_ctime, bl);
2376 encode(old_dir_mtime, bl);
2377 encode(old_dir_rctime, bl);
2378 encode(snapbl, bl);
7c673cae
FG
2379 ENCODE_FINISH(bl);
2380}
2381
11fdf7f2 2382void link_rollback::decode(bufferlist::const_iterator &bl)
7c673cae 2383{
11fdf7f2
TL
2384 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
2385 decode(reqid, bl);
2386 decode(ino, bl);
2387 decode(was_inc, bl);
2388 decode(old_ctime, bl);
2389 decode(old_dir_mtime, bl);
2390 decode(old_dir_rctime, bl);
2391 if (struct_v >= 3)
2392 decode(snapbl, bl);
7c673cae
FG
2393 DECODE_FINISH(bl);
2394}
2395
2396void link_rollback::dump(Formatter *f) const
2397{
2398 f->dump_stream("metareqid") << reqid;
2399 f->dump_int("ino", ino);
2400 f->dump_string("was incremented", was_inc ? "true" : "false");
2401 f->dump_stream("old_ctime") << old_ctime;
2402 f->dump_stream("old_dir_mtime") << old_dir_mtime;
2403 f->dump_stream("old_dir_rctime") << old_dir_rctime;
2404}
2405
9f95a23c 2406void link_rollback::generate_test_instances(std::list<link_rollback*>& ls)
7c673cae
FG
2407{
2408 ls.push_back(new link_rollback());
2409}
2410
2411void rmdir_rollback::encode(bufferlist& bl) const
2412{
11fdf7f2
TL
2413 ENCODE_START(3, 2, bl);
2414 encode(reqid, bl);
2415 encode(src_dir, bl);
2416 encode(src_dname, bl);
2417 encode(dest_dir, bl);
2418 encode(dest_dname, bl);
2419 encode(snapbl, bl);
7c673cae
FG
2420 ENCODE_FINISH(bl);
2421}
2422
11fdf7f2 2423void rmdir_rollback::decode(bufferlist::const_iterator& bl)
7c673cae 2424{
11fdf7f2
TL
2425 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
2426 decode(reqid, bl);
2427 decode(src_dir, bl);
2428 decode(src_dname, bl);
2429 decode(dest_dir, bl);
2430 decode(dest_dname, bl);
2431 if (struct_v >= 3)
2432 decode(snapbl, bl);
7c673cae
FG
2433 DECODE_FINISH(bl);
2434}
2435
2436void rmdir_rollback::dump(Formatter *f) const
2437{
2438 f->dump_stream("metareqid") << reqid;
2439 f->dump_stream("source directory") << src_dir;
2440 f->dump_string("source dname", src_dname);
2441 f->dump_stream("destination directory") << dest_dir;
2442 f->dump_string("destination dname", dest_dname);
2443}
2444
9f95a23c 2445void rmdir_rollback::generate_test_instances(std::list<rmdir_rollback*>& ls)
7c673cae
FG
2446{
2447 ls.push_back(new rmdir_rollback());
2448}
2449
2450void rename_rollback::drec::encode(bufferlist &bl) const
2451{
2452 ENCODE_START(2, 2, bl);
11fdf7f2
TL
2453 encode(dirfrag, bl);
2454 encode(dirfrag_old_mtime, bl);
2455 encode(dirfrag_old_rctime, bl);
2456 encode(ino, bl);
2457 encode(remote_ino, bl);
2458 encode(dname, bl);
2459 encode(remote_d_type, bl);
2460 encode(old_ctime, bl);
7c673cae
FG
2461 ENCODE_FINISH(bl);
2462}
2463
11fdf7f2 2464void rename_rollback::drec::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2465{
2466 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2
TL
2467 decode(dirfrag, bl);
2468 decode(dirfrag_old_mtime, bl);
2469 decode(dirfrag_old_rctime, bl);
2470 decode(ino, bl);
2471 decode(remote_ino, bl);
2472 decode(dname, bl);
2473 decode(remote_d_type, bl);
2474 decode(old_ctime, bl);
7c673cae
FG
2475 DECODE_FINISH(bl);
2476}
2477
2478void rename_rollback::drec::dump(Formatter *f) const
2479{
2480 f->dump_stream("directory fragment") << dirfrag;
2481 f->dump_stream("directory old mtime") << dirfrag_old_mtime;
2482 f->dump_stream("directory old rctime") << dirfrag_old_rctime;
2483 f->dump_int("ino", ino);
2484 f->dump_int("remote ino", remote_ino);
2485 f->dump_string("dname", dname);
2486 uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries
2487 string type_string;
2488 switch(type) {
2489 case S_IFREG:
2490 type_string = "file"; break;
2491 case S_IFLNK:
2492 type_string = "symlink"; break;
2493 case S_IFDIR:
2494 type_string = "directory"; break;
2495 default:
2496 type_string = "UNKNOWN-" + stringify((int)type); break;
2497 }
2498 f->dump_string("remote dtype", type_string);
2499 f->dump_stream("old ctime") << old_ctime;
2500}
2501
9f95a23c 2502void rename_rollback::drec::generate_test_instances(std::list<drec*>& ls)
7c673cae
FG
2503{
2504 ls.push_back(new drec());
2505 ls.back()->remote_d_type = IFTODT(S_IFREG);
2506}
2507
2508void rename_rollback::encode(bufferlist &bl) const
2509{
11fdf7f2
TL
2510 ENCODE_START(3, 2, bl);
2511 encode(reqid, bl);
7c673cae
FG
2512 encode(orig_src, bl);
2513 encode(orig_dest, bl);
2514 encode(stray, bl);
11fdf7f2
TL
2515 encode(ctime, bl);
2516 encode(srci_snapbl, bl);
2517 encode(desti_snapbl, bl);
7c673cae
FG
2518 ENCODE_FINISH(bl);
2519}
2520
11fdf7f2 2521void rename_rollback::decode(bufferlist::const_iterator &bl)
7c673cae 2522{
11fdf7f2
TL
2523 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
2524 decode(reqid, bl);
7c673cae
FG
2525 decode(orig_src, bl);
2526 decode(orig_dest, bl);
2527 decode(stray, bl);
11fdf7f2
TL
2528 decode(ctime, bl);
2529 if (struct_v >= 3) {
2530 decode(srci_snapbl, bl);
2531 decode(desti_snapbl, bl);
2532 }
7c673cae
FG
2533 DECODE_FINISH(bl);
2534}
2535
2536void rename_rollback::dump(Formatter *f) const
2537{
2538 f->dump_stream("request id") << reqid;
2539 f->open_object_section("original src drec");
2540 orig_src.dump(f);
2541 f->close_section(); // original src drec
2542 f->open_object_section("original dest drec");
2543 orig_dest.dump(f);
2544 f->close_section(); // original dest drec
2545 f->open_object_section("stray drec");
2546 stray.dump(f);
2547 f->close_section(); // stray drec
2548 f->dump_stream("ctime") << ctime;
2549}
2550
9f95a23c 2551void rename_rollback::generate_test_instances(std::list<rename_rollback*>& ls)
7c673cae
FG
2552{
2553 ls.push_back(new rename_rollback());
2554 ls.back()->orig_src.remote_d_type = IFTODT(S_IFREG);
2555 ls.back()->orig_dest.remote_d_type = IFTODT(S_IFREG);
2556 ls.back()->stray.remote_d_type = IFTODT(S_IFREG);
2557}
2558
f67539c2 2559void EPeerUpdate::encode(bufferlist &bl, uint64_t features) const
7c673cae
FG
2560{
2561 ENCODE_START(3, 3, bl);
11fdf7f2
TL
2562 encode(stamp, bl);
2563 encode(type, bl);
2564 encode(reqid, bl);
f67539c2 2565 encode(leader, bl);
11fdf7f2
TL
2566 encode(op, bl);
2567 encode(origop, bl);
2568 encode(commit, bl, features);
2569 encode(rollback, bl);
7c673cae
FG
2570 ENCODE_FINISH(bl);
2571}
2572
f67539c2 2573void EPeerUpdate::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2574{
2575 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2576 if (struct_v >= 2)
11fdf7f2
TL
2577 decode(stamp, bl);
2578 decode(type, bl);
2579 decode(reqid, bl);
f67539c2 2580 decode(leader, bl);
11fdf7f2
TL
2581 decode(op, bl);
2582 decode(origop, bl);
2583 decode(commit, bl);
2584 decode(rollback, bl);
7c673cae
FG
2585 DECODE_FINISH(bl);
2586}
2587
f67539c2 2588void EPeerUpdate::dump(Formatter *f) const
7c673cae
FG
2589{
2590 f->open_object_section("metablob");
2591 commit.dump(f);
2592 f->close_section(); // metablob
2593
2594 f->dump_int("rollback length", rollback.length());
2595 f->dump_string("type", type);
2596 f->dump_stream("metareqid") << reqid;
f67539c2 2597 f->dump_int("leader", leader);
7c673cae
FG
2598 f->dump_int("op", op);
2599 f->dump_int("original op", origop);
2600}
2601
f67539c2 2602void EPeerUpdate::generate_test_instances(std::list<EPeerUpdate*>& ls)
7c673cae 2603{
f67539c2 2604 ls.push_back(new EPeerUpdate());
7c673cae
FG
2605}
2606
f67539c2 2607void EPeerUpdate::replay(MDSRank *mds)
7c673cae 2608{
f67539c2 2609 MDPeerUpdate *su;
11fdf7f2 2610 auto&& segment = get_segment();
7c673cae 2611 switch (op) {
f67539c2
TL
2612 case EPeerUpdate::OP_PREPARE:
2613 dout(10) << "EPeerUpdate.replay prepare " << reqid << " for mds." << leader
7c673cae 2614 << ": applying commit, saving rollback info" << dendl;
f67539c2 2615 su = new MDPeerUpdate(origop, rollback);
11fdf7f2 2616 commit.replay(mds, segment, su);
f67539c2 2617 mds->mdcache->add_uncommitted_peer(reqid, segment, leader, su);
7c673cae
FG
2618 break;
2619
f67539c2
TL
2620 case EPeerUpdate::OP_COMMIT:
2621 dout(10) << "EPeerUpdate.replay commit " << reqid << " for mds." << leader << dendl;
2622 mds->mdcache->finish_uncommitted_peer(reqid, false);
7c673cae
FG
2623 break;
2624
f67539c2
TL
2625 case EPeerUpdate::OP_ROLLBACK:
2626 dout(10) << "EPeerUpdate.replay abort " << reqid << " for mds." << leader
7c673cae 2627 << ": applying rollback commit blob" << dendl;
11fdf7f2 2628 commit.replay(mds, segment);
f67539c2 2629 mds->mdcache->finish_uncommitted_peer(reqid, false);
7c673cae
FG
2630 break;
2631
2632 default:
f67539c2 2633 mds->clog->error() << "invalid op in EPeerUpdate";
7c673cae
FG
2634 mds->damaged();
2635 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2636 }
2637}
2638
2639
2640// -----------------------
2641// ESubtreeMap
2642
2643void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const
2644{
2645 ENCODE_START(6, 5, bl);
11fdf7f2
TL
2646 encode(stamp, bl);
2647 encode(metablob, bl, features);
2648 encode(subtrees, bl);
2649 encode(ambiguous_subtrees, bl);
2650 encode(expire_pos, bl);
2651 encode(event_seq, bl);
7c673cae
FG
2652 ENCODE_FINISH(bl);
2653}
2654
11fdf7f2 2655void ESubtreeMap::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2656{
2657 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2658 if (struct_v >= 2)
11fdf7f2
TL
2659 decode(stamp, bl);
2660 decode(metablob, bl);
2661 decode(subtrees, bl);
7c673cae 2662 if (struct_v >= 4)
11fdf7f2 2663 decode(ambiguous_subtrees, bl);
7c673cae 2664 if (struct_v >= 3)
11fdf7f2 2665 decode(expire_pos, bl);
7c673cae 2666 if (struct_v >= 6)
11fdf7f2 2667 decode(event_seq, bl);
7c673cae
FG
2668 DECODE_FINISH(bl);
2669}
2670
2671void ESubtreeMap::dump(Formatter *f) const
2672{
2673 f->open_object_section("metablob");
2674 metablob.dump(f);
2675 f->close_section(); // metablob
2676
2677 f->open_array_section("subtrees");
2678 for(map<dirfrag_t,vector<dirfrag_t> >::const_iterator i = subtrees.begin();
2679 i != subtrees.end(); ++i) {
2680 f->open_object_section("tree");
2681 f->dump_stream("root dirfrag") << i->first;
2682 for (vector<dirfrag_t>::const_iterator j = i->second.begin();
2683 j != i->second.end(); ++j) {
2684 f->dump_stream("bound dirfrag") << *j;
2685 }
2686 f->close_section(); // tree
2687 }
2688 f->close_section(); // subtrees
2689
2690 f->open_array_section("ambiguous subtrees");
2691 for(set<dirfrag_t>::const_iterator i = ambiguous_subtrees.begin();
2692 i != ambiguous_subtrees.end(); ++i) {
2693 f->dump_stream("dirfrag") << *i;
2694 }
2695 f->close_section(); // ambiguous subtrees
2696
2697 f->dump_int("expire position", expire_pos);
2698}
2699
9f95a23c 2700void ESubtreeMap::generate_test_instances(std::list<ESubtreeMap*>& ls)
7c673cae
FG
2701{
2702 ls.push_back(new ESubtreeMap());
2703}
2704
2705void ESubtreeMap::replay(MDSRank *mds)
2706{
2707 if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos())
2708 mds->mdlog->journaler->set_expire_pos(expire_pos);
2709
2710 // suck up the subtree map?
2711 if (mds->mdcache->is_subtrees()) {
2712 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
2713 int errors = 0;
2714
2715 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2716 p != subtrees.end();
2717 ++p) {
2718 CDir *dir = mds->mdcache->get_dirfrag(p->first);
2719 if (!dir) {
2720 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2721 << " subtree root " << p->first << " not in cache";
2722 ++errors;
2723 continue;
2724 }
2725
2726 if (!mds->mdcache->is_subtree(dir)) {
2727 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2728 << " subtree root " << p->first << " not a subtree in cache";
2729 ++errors;
2730 continue;
2731 }
2732 if (dir->get_dir_auth().first != mds->get_nodeid()) {
2733 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2734 << " subtree root " << p->first
2735 << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
2736 ++errors;
2737 continue;
2738 }
2739
2740 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
2741 mds->mdcache->get_force_dirfrag(*q, true);
2742
2743 set<CDir*> bounds;
2744 mds->mdcache->get_subtree_bounds(dir, bounds);
2745 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2746 CDir *b = mds->mdcache->get_dirfrag(*q);
2747 if (!b) {
2748 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2749 << " subtree " << p->first << " bound " << *q << " not in cache";
2750 ++errors;
2751 continue;
2752 }
2753 if (bounds.count(b) == 0) {
2754 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2755 << " subtree " << p->first << " bound " << *q << " not a bound in cache";
2756 ++errors;
2757 continue;
2758 }
2759 bounds.erase(b);
2760 }
2761 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) {
2762 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2763 << " subtree " << p->first << " has extra bound in cache " << (*q)->dirfrag();
2764 ++errors;
2765 }
2766
2767 if (ambiguous_subtrees.count(p->first)) {
2768 if (!mds->mdcache->have_ambiguous_import(p->first)) {
2769 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2770 << " subtree " << p->first << " is ambiguous but is not in our cache";
2771 ++errors;
2772 }
2773 } else {
2774 if (mds->mdcache->have_ambiguous_import(p->first)) {
2775 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2776 << " subtree " << p->first << " is not ambiguous but is in our cache";
2777 ++errors;
2778 }
2779 }
2780 }
2781
11fdf7f2
TL
2782 std::vector<CDir*> dirs;
2783 mds->mdcache->get_subtrees(dirs);
2784 for (const auto& dir : dirs) {
7c673cae
FG
2785 if (dir->get_dir_auth().first != mds->get_nodeid())
2786 continue;
2787 if (subtrees.count(dir->dirfrag()) == 0) {
2788 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2789 << " does not include cache subtree " << dir->dirfrag();
2790 ++errors;
2791 }
2792 }
2793
2794 if (errors) {
2795 dout(0) << "journal subtrees: " << subtrees << dendl;
2796 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
2797 mds->mdcache->show_subtrees();
11fdf7f2 2798 ceph_assert(!g_conf()->mds_debug_subtrees || errors == 0);
7c673cae
FG
2799 }
2800 return;
2801 }
2802
2803 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;
2804
2805 // first, stick the spanning tree in my cache
2806 //metablob.print(*_dout);
11fdf7f2 2807 metablob.replay(mds, get_segment());
7c673cae
FG
2808
2809 // restore import/export maps
2810 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2811 p != subtrees.end();
2812 ++p) {
2813 CDir *dir = mds->mdcache->get_dirfrag(p->first);
11fdf7f2 2814 ceph_assert(dir);
7c673cae
FG
2815 if (ambiguous_subtrees.count(p->first)) {
2816 // ambiguous!
2817 mds->mdcache->add_ambiguous_import(p->first, p->second);
2818 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second,
2819 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2820 } else {
2821 // not ambiguous
2822 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
2823 }
2824 }
2825
2826 mds->mdcache->recalc_auth_bits(true);
2827
2828 mds->mdcache->show_subtrees();
2829}
2830
2831
2832
2833// -----------------------
2834// EFragment
2835
2836void EFragment::replay(MDSRank *mds)
2837{
2838 dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;
2839
9f95a23c 2840 std::vector<CDir*> resultfrags;
11fdf7f2 2841 MDSContext::vec waiters;
7c673cae
FG
2842
2843 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2844 // it will be once we replay the metablob , but first we need to
2845 // refragment anything we already have in the cache.
2846 CInode *in = mds->mdcache->get_inode(ino);
2847
11fdf7f2 2848 auto&& segment = get_segment();
7c673cae
FG
2849 switch (op) {
2850 case OP_PREPARE:
11fdf7f2 2851 mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, segment, &rollback);
7c673cae
FG
2852
2853 if (in)
9f95a23c 2854 mds->mdcache->adjust_dir_fragments(in, basefrag, bits, &resultfrags, waiters, true);
7c673cae
FG
2855 break;
2856
11fdf7f2
TL
2857 case OP_ROLLBACK: {
2858 frag_vec_t old_frags;
7c673cae
FG
2859 if (in) {
2860 in->dirfragtree.get_leaves_under(basefrag, old_frags);
2861 if (orig_frags.empty()) {
2862 // old format EFragment
9f95a23c 2863 mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, &resultfrags, waiters, true);
7c673cae 2864 } else {
11fdf7f2
TL
2865 for (const auto& fg : orig_frags)
2866 mds->mdcache->force_dir_fragment(in, fg);
7c673cae
FG
2867 }
2868 }
11fdf7f2 2869 mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), std::move(old_frags));
7c673cae 2870 break;
11fdf7f2 2871 }
7c673cae
FG
2872
2873 case OP_COMMIT:
2874 case OP_FINISH:
2875 mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
2876 break;
2877
2878 default:
2879 ceph_abort();
2880 }
2881
11fdf7f2
TL
2882 metablob.replay(mds, segment);
2883 if (in && g_conf()->mds_debug_frag)
7c673cae
FG
2884 in->verify_dirfrags();
2885}
2886
2887void EFragment::encode(bufferlist &bl, uint64_t features) const {
2888 ENCODE_START(5, 4, bl);
11fdf7f2
TL
2889 encode(stamp, bl);
2890 encode(op, bl);
2891 encode(ino, bl);
2892 encode(basefrag, bl);
2893 encode(bits, bl);
2894 encode(metablob, bl, features);
2895 encode(orig_frags, bl);
2896 encode(rollback, bl);
7c673cae
FG
2897 ENCODE_FINISH(bl);
2898}
2899
11fdf7f2 2900void EFragment::decode(bufferlist::const_iterator &bl) {
7c673cae
FG
2901 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
2902 if (struct_v >= 2)
11fdf7f2 2903 decode(stamp, bl);
7c673cae 2904 if (struct_v >= 3)
11fdf7f2
TL
2905 decode(op, bl);
2906 decode(ino, bl);
2907 decode(basefrag, bl);
2908 decode(bits, bl);
2909 decode(metablob, bl);
7c673cae 2910 if (struct_v >= 5) {
11fdf7f2
TL
2911 decode(orig_frags, bl);
2912 decode(rollback, bl);
7c673cae
FG
2913 }
2914 DECODE_FINISH(bl);
2915}
2916
2917void EFragment::dump(Formatter *f) const
2918{
2919 /*f->open_object_section("Metablob");
2920 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2921 f->close_section();*/
2922 f->dump_string("op", op_name(op));
2923 f->dump_stream("ino") << ino;
2924 f->dump_stream("base frag") << basefrag;
2925 f->dump_int("bits", bits);
2926}
2927
9f95a23c 2928void EFragment::generate_test_instances(std::list<EFragment*>& ls)
7c673cae
FG
2929{
2930 ls.push_back(new EFragment);
2931 ls.push_back(new EFragment);
2932 ls.back()->op = OP_PREPARE;
2933 ls.back()->ino = 1;
2934 ls.back()->bits = 5;
2935}
2936
2937void dirfrag_rollback::encode(bufferlist &bl) const
2938{
2939 ENCODE_START(1, 1, bl);
f67539c2 2940 encode(*fnode, bl);
7c673cae
FG
2941 ENCODE_FINISH(bl);
2942}
2943
11fdf7f2 2944void dirfrag_rollback::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2945{
2946 DECODE_START(1, bl);
f67539c2
TL
2947 {
2948 auto _fnode = CDir::allocate_fnode();
2949 decode(*_fnode, bl);
2950 fnode = std::move(_fnode);
2951 }
7c673cae
FG
2952 DECODE_FINISH(bl);
2953}
2954
2955
2956
2957// =========================================================================
2958
2959// -----------------------
2960// EExport
2961
2962void EExport::replay(MDSRank *mds)
2963{
2964 dout(10) << "EExport.replay " << base << dendl;
11fdf7f2
TL
2965 auto&& segment = get_segment();
2966 metablob.replay(mds, segment);
7c673cae
FG
2967
2968 CDir *dir = mds->mdcache->get_dirfrag(base);
11fdf7f2 2969 ceph_assert(dir);
7c673cae
FG
2970
2971 set<CDir*> realbounds;
2972 for (set<dirfrag_t>::iterator p = bounds.begin();
2973 p != bounds.end();
2974 ++p) {
2975 CDir *bd = mds->mdcache->get_dirfrag(*p);
11fdf7f2 2976 ceph_assert(bd);
7c673cae
FG
2977 realbounds.insert(bd);
2978 }
2979
2980 // adjust auth away
2981 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF);
2982
2983 mds->mdcache->try_trim_non_auth_subtree(dir);
2984}
2985
2986void EExport::encode(bufferlist& bl, uint64_t features) const
2987{
31f18b77 2988 ENCODE_START(4, 3, bl);
11fdf7f2
TL
2989 encode(stamp, bl);
2990 encode(metablob, bl, features);
2991 encode(base, bl);
2992 encode(bounds, bl);
2993 encode(target, bl);
7c673cae
FG
2994 ENCODE_FINISH(bl);
2995}
2996
11fdf7f2 2997void EExport::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2998{
2999 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3000 if (struct_v >= 2)
11fdf7f2
TL
3001 decode(stamp, bl);
3002 decode(metablob, bl);
3003 decode(base, bl);
3004 decode(bounds, bl);
31f18b77 3005 if (struct_v >= 4)
11fdf7f2 3006 decode(target, bl);
7c673cae
FG
3007 DECODE_FINISH(bl);
3008}
3009
3010void EExport::dump(Formatter *f) const
3011{
3012 f->dump_float("stamp", (double)stamp);
3013 /*f->open_object_section("Metablob");
3014 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
3015 f->close_section();*/
3016 f->dump_stream("base dirfrag") << base;
3017 f->open_array_section("bounds dirfrags");
3018 for (set<dirfrag_t>::const_iterator i = bounds.begin();
3019 i != bounds.end(); ++i) {
3020 f->dump_stream("dirfrag") << *i;
3021 }
3022 f->close_section(); // bounds dirfrags
3023}
3024
9f95a23c 3025void EExport::generate_test_instances(std::list<EExport*>& ls)
7c673cae
FG
3026{
3027 EExport *sample = new EExport();
3028 ls.push_back(sample);
3029}
3030
3031
3032// -----------------------
3033// EImportStart
3034
3035void EImportStart::update_segment()
3036{
11fdf7f2 3037 get_segment()->sessionmapv = cmapv;
7c673cae
FG
3038}
3039
3040void EImportStart::replay(MDSRank *mds)
3041{
3042 dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
3043 //metablob.print(*_dout);
11fdf7f2
TL
3044 auto&& segment = get_segment();
3045 metablob.replay(mds, segment);
7c673cae
FG
3046
3047 // put in ambiguous import list
3048 mds->mdcache->add_ambiguous_import(base, bounds);
3049
3050 // set auth partially to us so we don't trim it
3051 CDir *dir = mds->mdcache->get_dirfrag(base);
11fdf7f2 3052 ceph_assert(dir);
7c673cae
FG
3053
3054 set<CDir*> realbounds;
3055 for (vector<dirfrag_t>::iterator p = bounds.begin();
3056 p != bounds.end();
3057 ++p) {
3058 CDir *bd = mds->mdcache->get_dirfrag(*p);
11fdf7f2 3059 ceph_assert(bd);
7c673cae
FG
3060 if (!bd->is_subtree_root())
3061 bd->state_clear(CDir::STATE_AUTH);
3062 realbounds.insert(bd);
3063 }
3064
3065 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds,
3066 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
3067
3068 // open client sessions?
3069 if (mds->sessionmap.get_version() >= cmapv) {
3070 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
3071 << " >= " << cmapv << ", noop" << dendl;
3072 } else {
3073 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
3074 << " < " << cmapv << dendl;
3075 map<client_t,entity_inst_t> cm;
11fdf7f2
TL
3076 map<client_t,client_metadata_t> cmm;
3077 auto blp = client_map.cbegin();
3078 using ceph::decode;
3079 decode(cm, blp);
3080 if (!blp.end())
3081 decode(cmm, blp);
81eedcae 3082 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
7c673cae
FG
3083 }
3084 update_segment();
3085}
3086
3087void EImportStart::encode(bufferlist &bl, uint64_t features) const {
31f18b77 3088 ENCODE_START(4, 3, bl);
11fdf7f2
TL
3089 encode(stamp, bl);
3090 encode(base, bl);
3091 encode(metablob, bl, features);
3092 encode(bounds, bl);
3093 encode(cmapv, bl);
3094 encode(client_map, bl);
3095 encode(from, bl);
7c673cae
FG
3096 ENCODE_FINISH(bl);
3097}
3098
11fdf7f2 3099void EImportStart::decode(bufferlist::const_iterator &bl) {
7c673cae
FG
3100 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3101 if (struct_v >= 2)
11fdf7f2
TL
3102 decode(stamp, bl);
3103 decode(base, bl);
3104 decode(metablob, bl);
3105 decode(bounds, bl);
3106 decode(cmapv, bl);
3107 decode(client_map, bl);
31f18b77 3108 if (struct_v >= 4)
11fdf7f2 3109 decode(from, bl);
7c673cae
FG
3110 DECODE_FINISH(bl);
3111}
3112
3113void EImportStart::dump(Formatter *f) const
3114{
3115 f->dump_stream("base dirfrag") << base;
3116 f->open_array_section("boundary dirfrags");
3117 for (vector<dirfrag_t>::const_iterator iter = bounds.begin();
3118 iter != bounds.end(); ++iter) {
3119 f->dump_stream("frag") << *iter;
3120 }
3121 f->close_section();
3122}
3123
9f95a23c 3124void EImportStart::generate_test_instances(std::list<EImportStart*>& ls)
7c673cae
FG
3125{
3126 ls.push_back(new EImportStart);
3127}
3128
3129// -----------------------
3130// EImportFinish
3131
3132void EImportFinish::replay(MDSRank *mds)
3133{
3134 if (mds->mdcache->have_ambiguous_import(base)) {
3135 dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
3136 if (success) {
3137 mds->mdcache->finish_ambiguous_import(base);
3138 } else {
3139 CDir *dir = mds->mdcache->get_dirfrag(base);
11fdf7f2 3140 ceph_assert(dir);
7c673cae
FG
3141 vector<dirfrag_t> bounds;
3142 mds->mdcache->get_ambiguous_import_bounds(base, bounds);
3143 mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF);
3144 mds->mdcache->cancel_ambiguous_import(dir);
3145 mds->mdcache->try_trim_non_auth_subtree(dir);
3146 }
3147 } else {
3148 // this shouldn't happen unless this is an old journal
3149 dout(10) << "EImportFinish.replay " << base << " success=" << success
3150 << " on subtree not marked as ambiguous"
3151 << dendl;
3152 mds->clog->error() << "failure replaying journal (EImportFinish)";
3153 mds->damaged();
3154 ceph_abort(); // Should be unreachable because damaged() calls respawn()
3155 }
3156}
3157
3158void EImportFinish::encode(bufferlist& bl, uint64_t features) const
3159{
3160 ENCODE_START(3, 3, bl);
11fdf7f2
TL
3161 encode(stamp, bl);
3162 encode(base, bl);
3163 encode(success, bl);
7c673cae
FG
3164 ENCODE_FINISH(bl);
3165}
3166
11fdf7f2 3167void EImportFinish::decode(bufferlist::const_iterator &bl)
7c673cae
FG
3168{
3169 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3170 if (struct_v >= 2)
11fdf7f2
TL
3171 decode(stamp, bl);
3172 decode(base, bl);
3173 decode(success, bl);
7c673cae
FG
3174 DECODE_FINISH(bl);
3175}
3176
3177void EImportFinish::dump(Formatter *f) const
3178{
3179 f->dump_stream("base dirfrag") << base;
3180 f->dump_string("success", success ? "true" : "false");
3181}
9f95a23c 3182void EImportFinish::generate_test_instances(std::list<EImportFinish*>& ls)
7c673cae
FG
3183{
3184 ls.push_back(new EImportFinish);
3185 ls.push_back(new EImportFinish);
3186 ls.back()->success = true;
3187}
3188
3189
3190// ------------------------
3191// EResetJournal
3192
3193void EResetJournal::encode(bufferlist& bl, uint64_t features) const
3194{
3195 ENCODE_START(2, 2, bl);
11fdf7f2 3196 encode(stamp, bl);
7c673cae
FG
3197 ENCODE_FINISH(bl);
3198}
3199
11fdf7f2 3200void EResetJournal::decode(bufferlist::const_iterator &bl)
7c673cae
FG
3201{
3202 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2 3203 decode(stamp, bl);
7c673cae
FG
3204 DECODE_FINISH(bl);
3205}
3206
3207void EResetJournal::dump(Formatter *f) const
3208{
3209 f->dump_stream("timestamp") << stamp;
3210}
3211
9f95a23c 3212void EResetJournal::generate_test_instances(std::list<EResetJournal*>& ls)
7c673cae
FG
3213{
3214 ls.push_back(new EResetJournal());
3215}
3216
3217void EResetJournal::replay(MDSRank *mds)
3218{
3219 dout(1) << "EResetJournal" << dendl;
3220
3221 mds->sessionmap.wipe();
3222 mds->inotable->replay_reset();
3223
3224 if (mds->mdsmap->get_root() == mds->get_nodeid()) {
3225 CDir *rootdir = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t());
3226 mds->mdcache->adjust_subtree_auth(rootdir, mds->get_nodeid());
3227 }
3228
3229 CDir *mydir = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t());
3230 mds->mdcache->adjust_subtree_auth(mydir, mds->get_nodeid());
3231
3232 mds->mdcache->recalc_auth_bits(true);
3233
3234 mds->mdcache->show_subtrees();
3235}
3236
3237
3238void ENoOp::encode(bufferlist &bl, uint64_t features) const
3239{
3240 ENCODE_START(2, 2, bl);
11fdf7f2 3241 encode(pad_size, bl);
7c673cae
FG
3242 uint8_t const pad = 0xff;
3243 for (unsigned int i = 0; i < pad_size; ++i) {
11fdf7f2 3244 encode(pad, bl);
7c673cae
FG
3245 }
3246 ENCODE_FINISH(bl);
3247}
3248
3249
11fdf7f2 3250void ENoOp::decode(bufferlist::const_iterator &bl)
7c673cae
FG
3251{
3252 DECODE_START(2, bl);
11fdf7f2 3253 decode(pad_size, bl);
7c673cae
FG
3254 if (bl.get_remaining() != pad_size) {
3255 // This is spiritually an assertion, but expressing in a way that will let
3256 // journal debug tools catch it and recognise a malformed entry.
3257 throw buffer::end_of_buffer();
3258 } else {
9f95a23c 3259 bl += pad_size;
7c673cae
FG
3260 }
3261 DECODE_FINISH(bl);
3262}
3263
3264
3265void ENoOp::replay(MDSRank *mds)
3266{
3267 dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
3268}
3269
3270/**
3271 * If re-formatting an old journal that used absolute log position
3272 * references as segment sequence numbers, use this function to update
3273 * it.
3274 *
3275 * @param mds
3276 * MDSRank instance, just used for logging
3277 * @param old_to_new
3278 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3279 *
3280 * @return
3281 * True if the event was modified.
3282 */
3283bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
9f95a23c 3284 std::map<LogSegment::seq_t, LogSegment::seq_t> const &old_to_new)
7c673cae
FG
3285{
3286 bool modified = false;
9f95a23c 3287 map<inodeno_t, LogSegment::seq_t> new_trunc_finish;
11fdf7f2
TL
3288 for (const auto& p : truncate_finish) {
3289 auto q = old_to_new.find(p.second);
3290 if (q != old_to_new.end()) {
7c673cae 3291 dout(20) << __func__ << " applying segment seq mapping "
11fdf7f2
TL
3292 << p.second << " -> " << q->second << dendl;
3293 new_trunc_finish.emplace(p.first, q->second);
7c673cae
FG
3294 modified = true;
3295 } else {
3296 dout(20) << __func__ << " no segment seq mapping found for "
11fdf7f2
TL
3297 << p.second << dendl;
3298 new_trunc_finish.insert(p);
7c673cae
FG
3299 }
3300 }
11fdf7f2 3301 truncate_finish.swap(new_trunc_finish);
7c673cae
FG
3302
3303 return modified;
3304}