]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/journal.cc
import 15.2.5
[ceph.git] / ceph / src / mds / journal.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "common/config.h"
16#include "osdc/Journaler.h"
17#include "events/ESubtreeMap.h"
18#include "events/ESession.h"
19#include "events/ESessions.h"
20
21#include "events/EMetaBlob.h"
22#include "events/EResetJournal.h"
23#include "events/ENoOp.h"
24
25#include "events/EUpdate.h"
26#include "events/ESlaveUpdate.h"
27#include "events/EOpen.h"
28#include "events/ECommitted.h"
9f95a23c 29#include "events/EPurged.h"
7c673cae
FG
30
31#include "events/EExport.h"
32#include "events/EImportStart.h"
33#include "events/EImportFinish.h"
34#include "events/EFragment.h"
35
36#include "events/ETableClient.h"
37#include "events/ETableServer.h"
38
39#include "include/stringify.h"
40
41#include "LogSegment.h"
42
43#include "MDSRank.h"
44#include "MDLog.h"
45#include "MDCache.h"
46#include "Server.h"
47#include "Migrator.h"
48#include "Mutation.h"
49
50#include "InoTable.h"
51#include "MDSTableClient.h"
52#include "MDSTableServer.h"
53
54#include "Locker.h"
55
56#define dout_context g_ceph_context
57#define dout_subsys ceph_subsys_mds
58#undef dout_prefix
59#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
60
61
62// -----------------------
63// LogSegment
64
65void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
66{
67 set<CDir*> commit;
68
69 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl;
70
11fdf7f2 71 ceph_assert(g_conf()->mds_kill_journal_expire_at != 1);
7c673cae
FG
72
73 // commit dirs
74 for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) {
75 dout(20) << " new_dirfrag " << **p << dendl;
11fdf7f2 76 ceph_assert((*p)->is_auth());
7c673cae
FG
77 commit.insert(*p);
78 }
79 for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) {
80 dout(20) << " dirty_dirfrag " << **p << dendl;
11fdf7f2 81 ceph_assert((*p)->is_auth());
7c673cae
FG
82 commit.insert(*p);
83 }
84 for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) {
85 dout(20) << " dirty_dentry " << **p << dendl;
11fdf7f2 86 ceph_assert((*p)->is_auth());
7c673cae
FG
87 commit.insert((*p)->get_dir());
88 }
89 for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) {
90 dout(20) << " dirty_inode " << **p << dendl;
11fdf7f2 91 ceph_assert((*p)->is_auth());
7c673cae
FG
92 if ((*p)->is_base()) {
93 (*p)->store(gather_bld.new_sub());
94 } else
95 commit.insert((*p)->get_parent_dn()->get_dir());
96 }
97
98 if (!commit.empty()) {
99 for (set<CDir*>::iterator p = commit.begin();
100 p != commit.end();
101 ++p) {
102 CDir *dir = *p;
11fdf7f2 103 ceph_assert(dir->is_auth());
7c673cae
FG
104 if (dir->can_auth_pin()) {
105 dout(15) << "try_to_expire committing " << *dir << dendl;
106 dir->commit(0, gather_bld.new_sub(), false, op_prio);
107 } else {
108 dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
109 dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
110 }
111 }
112 }
113
114 // master ops with possibly uncommitted slaves
115 for (set<metareqid_t>::iterator p = uncommitted_masters.begin();
116 p != uncommitted_masters.end();
117 ++p) {
118 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p << dendl;
119 mds->mdcache->wait_for_uncommitted_master(*p, gather_bld.new_sub());
120 }
121
e306af50
TL
122 // slave ops that haven't been committed
123 for (set<metareqid_t>::iterator p = uncommitted_slaves.begin();
124 p != uncommitted_slaves.end();
125 ++p) {
126 dout(10) << "try_to_expire waiting for master to ack OP_FINISH on " << *p << dendl;
127 mds->mdcache->wait_for_uncommitted_slave(*p, gather_bld.new_sub());
128 }
129
7c673cae
FG
130 // uncommitted fragments
131 for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin();
132 p != uncommitted_fragments.end();
133 ++p) {
134 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl;
135 mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub());
136 }
137
138 // nudge scatterlocks
139 for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
140 CInode *in = *p;
141 dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
142 mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub());
143 }
144 for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) {
145 CInode *in = *p;
146 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl;
147 mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub());
148 }
149 for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) {
150 CInode *in = *p;
151 dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl;
152 mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub());
153 }
154
11fdf7f2 155 ceph_assert(g_conf()->mds_kill_journal_expire_at != 2);
7c673cae
FG
156
157 // open files and snap inodes
158 if (!open_files.empty()) {
11fdf7f2 159 ceph_assert(!mds->mdlog->is_capped()); // hmm FIXME
7c673cae
FG
160 EOpen *le = 0;
161 LogSegment *ls = mds->mdlog->get_current_segment();
11fdf7f2 162 ceph_assert(ls != this);
7c673cae
FG
163 elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file));
164 while (!p.end()) {
165 CInode *in = *p;
166 ++p;
11fdf7f2 167 if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) {
7c673cae
FG
168 // journal snap inodes that need flush. This simplify the mds failover hanlding
169 dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
170 if (!le) {
171 le = new EOpen(mds->mdlog);
172 mds->mdlog->start_entry(le);
173 }
174 le->add_clean_inode(in);
175 ls->open_files.push_back(&in->item_open_file);
176 } else {
11fdf7f2 177 // open files are tracked by open file table, no need to journal them again
7c673cae
FG
178 in->item_open_file.remove_myself();
179 }
180 }
181 if (le) {
182 mds->mdlog->submit_entry(le);
183 mds->mdlog->wait_for_safe(gather_bld.new_sub());
184 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
185 }
186 }
187
11fdf7f2 188 ceph_assert(g_conf()->mds_kill_journal_expire_at != 3);
7c673cae
FG
189
190 // backtraces to be stored/updated
191 for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
192 CInode *in = *p;
11fdf7f2 193 ceph_assert(in->is_auth());
7c673cae
FG
194 if (in->can_auth_pin()) {
195 dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
196 in->store_backtrace(gather_bld.new_sub(), op_prio);
197 } else {
198 dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
199 in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
200 }
201 }
202
11fdf7f2 203 ceph_assert(g_conf()->mds_kill_journal_expire_at != 4);
7c673cae 204
7c673cae
FG
205 // idalloc
206 if (inotablev > mds->inotable->get_committed_version()) {
207 dout(10) << "try_to_expire saving inotable table, need " << inotablev
208 << ", committed is " << mds->inotable->get_committed_version()
209 << " (" << mds->inotable->get_committing_version() << ")"
210 << dendl;
211 mds->inotable->save(gather_bld.new_sub(), inotablev);
212 }
213
214 // sessionmap
215 if (sessionmapv > mds->sessionmap.get_committed()) {
216 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
217 << ", committed is " << mds->sessionmap.get_committed()
218 << " (" << mds->sessionmap.get_committing() << ")"
219 << dendl;
220 mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
221 }
222
223 // updates to sessions for completed_requests
224 mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
225 touched_sessions.clear();
226
227 // pending commit atids
228 for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
229 p != pending_commit_tids.end();
230 ++p) {
231 MDSTableClient *client = mds->get_table_client(p->first);
11fdf7f2 232 ceph_assert(client);
7c673cae
FG
233 for (ceph::unordered_set<version_t>::iterator q = p->second.begin();
234 q != p->second.end();
235 ++q) {
236 dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q
237 << " pending commit (not yet acked), waiting" << dendl;
11fdf7f2 238 ceph_assert(!client->has_committed(*q));
7c673cae
FG
239 client->wait_for_ack(*q, gather_bld.new_sub());
240 }
241 }
242
243 // table servers
244 for (map<int, version_t>::iterator p = tablev.begin();
245 p != tablev.end();
246 ++p) {
247 MDSTableServer *server = mds->get_table_server(p->first);
11fdf7f2 248 ceph_assert(server);
7c673cae
FG
249 if (p->second > server->get_committed_version()) {
250 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first)
251 << " to save, need " << p->second << dendl;
252 server->save(gather_bld.new_sub());
253 }
254 }
255
256 // truncating
257 for (set<CInode*>::iterator p = truncating_inodes.begin();
258 p != truncating_inodes.end();
259 ++p) {
260 dout(10) << "try_to_expire waiting for truncate of " << **p << dendl;
261 (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub());
262 }
9f95a23c
TL
263 // purge inodes
264 dout(10) << "try_to_expire waiting for purge of " << purge_inodes << dendl;
265 if (purge_inodes.size())
266 set_purged_cb(gather_bld.new_sub());
7c673cae
FG
267
268 if (gather_bld.has_subs()) {
269 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl;
270 mds->mdlog->flush();
271 } else {
11fdf7f2 272 ceph_assert(g_conf()->mds_kill_journal_expire_at != 5);
7c673cae
FG
273 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl;
274 }
275}
276
7c673cae
FG
277// -----------------------
278// EMetaBlob
279
7c673cae
FG
280void EMetaBlob::add_dir_context(CDir *dir, int mode)
281{
282 MDSRank *mds = dir->cache->mds;
283
284 list<CDentry*> parents;
285
286 // it may be okay not to include the maybe items, if
287 // - we journaled the maybe child inode in this segment
288 // - that subtree turns out to be unambiguously auth
289 list<CDentry*> maybe;
290 bool maybenot = false;
291
292 while (true) {
293 // already have this dir? (we must always add in order)
294 if (lump_map.count(dir->dirfrag())) {
295 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl;
296 break;
297 }
298
299 // stop at root/stray
300 CInode *diri = dir->get_inode();
301 CDentry *parent = diri->get_projected_parent_dn();
302
303 if (mode == TO_AUTH_SUBTREE_ROOT) {
304 // subtree root?
31f18b77
FG
305 if (dir->is_subtree_root()) {
306 // match logic in MDCache::create_subtree_map()
307 if (dir->get_dir_auth().first == mds->get_nodeid()) {
308 mds_authority_t parent_auth = parent ? parent->authority() : CDIR_AUTH_UNDEF;
309 if (parent_auth.first == dir->get_dir_auth().first) {
310 if (parent_auth.second == CDIR_AUTH_UNKNOWN &&
311 !dir->is_ambiguous_dir_auth() &&
312 !dir->state_test(CDir::STATE_EXPORTBOUND) &&
313 !dir->state_test(CDir::STATE_AUXSUBTREE) &&
314 !diri->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
315 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir << dendl;
11fdf7f2 316 ceph_abort();
31f18b77
FG
317 }
318 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") ambiguous or transient subtree " << dendl;
7c673cae
FG
319 } else {
320 // it's an auth subtree, we don't need maybe (if any), and we're done.
321 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
322 << " at " << *dir << dendl;
323 maybe.clear();
324 break;
325 }
326 } else {
327 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
328 << " at " << *dir << dendl;
329 // we need the maybe list after all!
330 parents.splice(parents.begin(), maybe);
331 maybenot = false;
332 }
333 }
31f18b77 334
7c673cae
FG
335 // was the inode journaled in this blob?
336 if (event_seq && diri->last_journaled == event_seq) {
337 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl;
338 break;
339 }
340
341 // have we journaled this inode since the last subtree map?
342 if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) {
343 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment ("
344 << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag "
345 << *diri << dendl;
346 maybenot = true;
347 }
348 }
349
350 if (!parent)
351 break;
352
353 if (maybenot) {
354 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl;
355 maybe.push_front(parent);
356 } else {
357 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl;
358 parents.push_front(parent);
359 }
360
361 dir = parent->get_dir();
362 }
363
364 parents.splice(parents.begin(), maybe);
365
366 dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
9f95a23c
TL
367 for (const auto& dentry : parents) {
368 ceph_assert(dentry->get_projected_linkage()->is_primary());
369 add_dentry(dentry, false);
7c673cae
FG
370 }
371}
372
373void EMetaBlob::update_segment(LogSegment *ls)
374{
375 // dirty inode mtimes
376 // -> handled directly by Server.cc, replay()
377
378 // alloc table update?
379 if (inotablev)
380 ls->inotablev = inotablev;
381 if (sessionmapv)
382 ls->sessionmapv = sessionmapv;
383
384 // truncated inodes
385 // -> handled directly by Server.cc
386
387 // client requests
388 // note the newest request per client
389 //if (!client_reqs.empty())
390 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
391}
392
393// EMetaBlob::fullbit
394
395void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const {
396 ENCODE_START(8, 5, bl);
11fdf7f2
TL
397 encode(dn, bl);
398 encode(dnfirst, bl);
399 encode(dnlast, bl);
400 encode(dnv, bl);
401 encode(inode, bl, features);
402 encode(xattrs, bl);
7c673cae 403 if (inode.is_symlink())
11fdf7f2 404 encode(symlink, bl);
7c673cae 405 if (inode.is_dir()) {
11fdf7f2
TL
406 encode(dirfragtree, bl);
407 encode(snapbl, bl);
7c673cae 408 }
11fdf7f2 409 encode(state, bl);
7c673cae 410 if (old_inodes.empty()) {
11fdf7f2 411 encode(false, bl);
7c673cae 412 } else {
11fdf7f2
TL
413 encode(true, bl);
414 encode(old_inodes, bl, features);
7c673cae
FG
415 }
416 if (!inode.is_dir())
11fdf7f2
TL
417 encode(snapbl, bl);
418 encode(oldest_snap, bl);
7c673cae
FG
419 ENCODE_FINISH(bl);
420}
421
11fdf7f2 422void EMetaBlob::fullbit::decode(bufferlist::const_iterator &bl) {
7c673cae 423 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
11fdf7f2
TL
424 decode(dn, bl);
425 decode(dnfirst, bl);
426 decode(dnlast, bl);
427 decode(dnv, bl);
428 decode(inode, bl);
e306af50 429 decode_noshare(xattrs, bl);
7c673cae 430 if (inode.is_symlink())
11fdf7f2 431 decode(symlink, bl);
7c673cae 432 if (inode.is_dir()) {
11fdf7f2
TL
433 decode(dirfragtree, bl);
434 decode(snapbl, bl);
7c673cae
FG
435 if ((struct_v == 2) || (struct_v == 3)) {
436 bool dir_layout_exists;
11fdf7f2 437 decode(dir_layout_exists, bl);
7c673cae
FG
438 if (dir_layout_exists) {
439 __u8 dir_struct_v;
11fdf7f2
TL
440 decode(dir_struct_v, bl); // default_file_layout version
441 decode(inode.layout, bl); // and actual layout, that we care about
7c673cae
FG
442 }
443 }
444 }
445 if (struct_v >= 6) {
11fdf7f2 446 decode(state, bl);
7c673cae
FG
447 } else {
448 bool dirty;
11fdf7f2 449 decode(dirty, bl);
7c673cae
FG
450 state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0;
451 }
452
453 if (struct_v >= 3) {
454 bool old_inodes_present;
11fdf7f2 455 decode(old_inodes_present, bl);
7c673cae 456 if (old_inodes_present) {
11fdf7f2 457 decode(old_inodes, bl);
7c673cae
FG
458 }
459 }
460 if (!inode.is_dir()) {
461 if (struct_v >= 7)
11fdf7f2 462 decode(snapbl, bl);
7c673cae
FG
463 }
464 if (struct_v >= 8)
11fdf7f2 465 decode(oldest_snap, bl);
7c673cae
FG
466 else
467 oldest_snap = CEPH_NOSNAP;
468
469 DECODE_FINISH(bl);
470}
471
472void EMetaBlob::fullbit::dump(Formatter *f) const
473{
474 f->dump_string("dentry", dn);
475 f->dump_stream("snapid.first") << dnfirst;
476 f->dump_stream("snapid.last") << dnlast;
477 f->dump_int("dentry version", dnv);
478 f->open_object_section("inode");
479 inode.dump(f);
480 f->close_section(); // inode
481 f->open_object_section("xattrs");
94b18763
FG
482 for (const auto &p : xattrs) {
483 std::string s(p.second.c_str(), p.second.length());
484 f->dump_string(p.first.c_str(), s);
7c673cae
FG
485 }
486 f->close_section(); // xattrs
487 if (inode.is_symlink()) {
488 f->dump_string("symlink", symlink);
489 }
490 if (inode.is_dir()) {
491 f->dump_stream("frag tree") << dirfragtree;
492 f->dump_string("has_snapbl", snapbl.length() ? "true" : "false");
493 if (inode.has_layout()) {
494 f->open_object_section("file layout policy");
495 // FIXME
496 f->dump_string("layout", "the layout exists");
497 f->close_section(); // file layout policy
498 }
499 }
500 f->dump_string("state", state_string());
501 if (!old_inodes.empty()) {
502 f->open_array_section("old inodes");
94b18763 503 for (const auto &p : old_inodes) {
7c673cae 504 f->open_object_section("inode");
94b18763
FG
505 f->dump_int("snapid", p.first);
506 p.second.dump(f);
7c673cae
FG
507 f->close_section(); // inode
508 }
509 f->close_section(); // old inodes
510 }
511}
512
9f95a23c 513void EMetaBlob::fullbit::generate_test_instances(std::list<EMetaBlob::fullbit*>& ls)
7c673cae 514{
94b18763 515 CInode::mempool_inode inode;
7c673cae 516 fragtree_t fragtree;
94b18763 517 CInode::mempool_xattr_map empty_xattrs;
7c673cae
FG
518 bufferlist empty_snapbl;
519 fullbit *sample = new fullbit("/testdn", 0, 0, 0,
520 inode, fragtree, empty_xattrs, "", 0, empty_snapbl,
521 false, NULL);
522 ls.push_back(sample);
523}
524
525void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
526{
527 in->inode = inode;
528 in->xattrs = xattrs;
529 if (in->inode.is_dir()) {
f6b5b4d7
TL
530 if (is_export_ephemeral_random()) {
531 dout(15) << "random ephemeral pin on " << *in << dendl;
532 in->set_ephemeral_rand(true);
533 in->maybe_ephemeral_rand(true);
534 }
535 in->maybe_ephemeral_dist();
536 in->maybe_export_pin();
7c673cae
FG
537 if (!(in->dirfragtree == dirfragtree)) {
538 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
539 << dirfragtree << " on " << *in << dendl;
540 in->dirfragtree = dirfragtree;
541 in->force_dirfrags();
9f95a23c
TL
542 if (in->get_num_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) {
543 auto&& ls = in->get_nested_dirfrags();
544 for (const auto& dir : ls) {
7c673cae
FG
545 if (dir->get_num_any() == 0 &&
546 mds->mdcache->can_trim_non_auth_dirfrag(dir)) {
547 dout(10) << " closing empty non-auth dirfrag " << *dir << dendl;
548 in->close_dirfrag(dir->get_frag());
549 }
550 }
551 }
552 }
553 } else if (in->inode.is_symlink()) {
11fdf7f2 554 in->symlink = symlink;
7c673cae
FG
555 }
556 in->old_inodes = old_inodes;
557 if (!in->old_inodes.empty()) {
558 snapid_t min_first = in->old_inodes.rbegin()->first + 1;
559 if (min_first > in->first)
560 in->first = min_first;
561 }
562
563 /*
564 * we can do this before linking hte inode bc the split_at would
565 * be a no-op.. we have no children (namely open snaprealms) to
566 * divy up
567 */
568 in->oldest_snap = oldest_snap;
569 in->decode_snap_blob(snapbl);
570
571 /*
572 * In case there was anything malformed in the journal that we are
573 * replaying, do sanity checks on the inodes we're replaying and
574 * go damaged instead of letting any trash into a live cache
575 */
576 if (in->is_file()) {
577 // Files must have valid layouts with a pool set
578 if (in->inode.layout.pool_id == -1 || !in->inode.layout.is_valid()) {
579 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
580 << ": " << in->inode.layout << dendl;
581 std::ostringstream oss;
11fdf7f2 582 oss << "Invalid layout for inode " << in->ino() << " in journal";
7c673cae
FG
583 mds->clog->error() << oss.str();
584 mds->damaged();
585 ceph_abort(); // Should be unreachable because damaged() calls respawn()
586 }
587 }
588}
589
590// EMetaBlob::remotebit
591
592void EMetaBlob::remotebit::encode(bufferlist& bl) const
593{
594 ENCODE_START(2, 2, bl);
11fdf7f2
TL
595 encode(dn, bl);
596 encode(dnfirst, bl);
597 encode(dnlast, bl);
598 encode(dnv, bl);
599 encode(ino, bl);
600 encode(d_type, bl);
601 encode(dirty, bl);
7c673cae
FG
602 ENCODE_FINISH(bl);
603}
604
11fdf7f2 605void EMetaBlob::remotebit::decode(bufferlist::const_iterator &bl)
7c673cae
FG
606{
607 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2
TL
608 decode(dn, bl);
609 decode(dnfirst, bl);
610 decode(dnlast, bl);
611 decode(dnv, bl);
612 decode(ino, bl);
613 decode(d_type, bl);
614 decode(dirty, bl);
7c673cae
FG
615 DECODE_FINISH(bl);
616}
617
618void EMetaBlob::remotebit::dump(Formatter *f) const
619{
620 f->dump_string("dentry", dn);
621 f->dump_int("snapid.first", dnfirst);
622 f->dump_int("snapid.last", dnlast);
623 f->dump_int("dentry version", dnv);
624 f->dump_int("inodeno", ino);
625 uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries
626 string type_string;
627 switch(type) {
628 case S_IFREG:
629 type_string = "file"; break;
630 case S_IFLNK:
631 type_string = "symlink"; break;
632 case S_IFDIR:
633 type_string = "directory"; break;
634 case S_IFIFO:
635 type_string = "fifo"; break;
636 case S_IFCHR:
637 type_string = "chr"; break;
638 case S_IFBLK:
639 type_string = "blk"; break;
640 case S_IFSOCK:
641 type_string = "sock"; break;
642 default:
643 assert (0 == "unknown d_type!");
644 }
645 f->dump_string("d_type", type_string);
646 f->dump_string("dirty", dirty ? "true" : "false");
647}
648
649void EMetaBlob::remotebit::
9f95a23c 650generate_test_instances(std::list<EMetaBlob::remotebit*>& ls)
7c673cae
FG
651{
652 remotebit *remote = new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG), false);
653 ls.push_back(remote);
654}
655
656// EMetaBlob::nullbit
657
658void EMetaBlob::nullbit::encode(bufferlist& bl) const
659{
660 ENCODE_START(2, 2, bl);
11fdf7f2
TL
661 encode(dn, bl);
662 encode(dnfirst, bl);
663 encode(dnlast, bl);
664 encode(dnv, bl);
665 encode(dirty, bl);
7c673cae
FG
666 ENCODE_FINISH(bl);
667}
668
11fdf7f2 669void EMetaBlob::nullbit::decode(bufferlist::const_iterator &bl)
7c673cae
FG
670{
671 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2
TL
672 decode(dn, bl);
673 decode(dnfirst, bl);
674 decode(dnlast, bl);
675 decode(dnv, bl);
676 decode(dirty, bl);
7c673cae
FG
677 DECODE_FINISH(bl);
678}
679
680void EMetaBlob::nullbit::dump(Formatter *f) const
681{
682 f->dump_string("dentry", dn);
683 f->dump_int("snapid.first", dnfirst);
684 f->dump_int("snapid.last", dnlast);
685 f->dump_int("dentry version", dnv);
686 f->dump_string("dirty", dirty ? "true" : "false");
687}
688
9f95a23c 689void EMetaBlob::nullbit::generate_test_instances(std::list<nullbit*>& ls)
7c673cae
FG
690{
691 nullbit *sample = new nullbit("/test/dentry", 0, 10, 15, false);
692 nullbit *sample2 = new nullbit("/test/dirty", 10, 20, 25, true);
693 ls.push_back(sample);
694 ls.push_back(sample2);
695}
696
697// EMetaBlob::dirlump
698
699void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const
700{
701 ENCODE_START(2, 2, bl);
11fdf7f2
TL
702 encode(fnode, bl);
703 encode(state, bl);
704 encode(nfull, bl);
705 encode(nremote, bl);
706 encode(nnull, bl);
7c673cae 707 _encode_bits(features);
11fdf7f2 708 encode(dnbl, bl);
7c673cae
FG
709 ENCODE_FINISH(bl);
710}
711
11fdf7f2 712void EMetaBlob::dirlump::decode(bufferlist::const_iterator &bl)
7c673cae
FG
713{
714 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
11fdf7f2
TL
715 decode(fnode, bl);
716 decode(state, bl);
717 decode(nfull, bl);
718 decode(nremote, bl);
719 decode(nnull, bl);
720 decode(dnbl, bl);
7c673cae
FG
721 dn_decoded = false; // don't decode bits unless we need them.
722 DECODE_FINISH(bl);
723}
724
725void EMetaBlob::dirlump::dump(Formatter *f) const
726{
727 if (!dn_decoded) {
728 dirlump *me = const_cast<dirlump*>(this);
729 me->_decode_bits();
730 }
731 f->open_object_section("fnode");
732 fnode.dump(f);
733 f->close_section(); // fnode
734 f->dump_string("state", state_string());
735 f->dump_int("nfull", nfull);
736 f->dump_int("nremote", nremote);
737 f->dump_int("nnull", nnull);
738
739 f->open_array_section("full bits");
11fdf7f2 740 for (const auto& iter : dfull) {
7c673cae 741 f->open_object_section("fullbit");
11fdf7f2 742 iter.dump(f);
7c673cae
FG
743 f->close_section(); // fullbit
744 }
745 f->close_section(); // full bits
746 f->open_array_section("remote bits");
11fdf7f2 747 for (const auto& iter : dremote) {
7c673cae 748 f->open_object_section("remotebit");
11fdf7f2 749 iter.dump(f);
7c673cae
FG
750 f->close_section(); // remotebit
751 }
752 f->close_section(); // remote bits
753 f->open_array_section("null bits");
11fdf7f2 754 for (const auto& iter : dnull) {
7c673cae 755 f->open_object_section("null bit");
11fdf7f2 756 iter.dump(f);
7c673cae
FG
757 f->close_section(); // null bit
758 }
759 f->close_section(); // null bits
760}
761
9f95a23c 762void EMetaBlob::dirlump::generate_test_instances(std::list<dirlump*>& ls)
7c673cae
FG
763{
764 ls.push_back(new dirlump());
765}
766
767/**
768 * EMetaBlob proper
769 */
770void EMetaBlob::encode(bufferlist& bl, uint64_t features) const
771{
772 ENCODE_START(8, 5, bl);
11fdf7f2
TL
773 encode(lump_order, bl);
774 encode(lump_map, bl, features);
775 encode(roots, bl, features);
776 encode(table_tids, bl);
777 encode(opened_ino, bl);
778 encode(allocated_ino, bl);
779 encode(used_preallocated_ino, bl);
780 encode(preallocated_inos, bl);
781 encode(client_name, bl);
782 encode(inotablev, bl);
783 encode(sessionmapv, bl);
784 encode(truncate_start, bl);
785 encode(truncate_finish, bl);
786 encode(destroyed_inodes, bl);
787 encode(client_reqs, bl);
788 encode(renamed_dirino, bl);
789 encode(renamed_dir_frags, bl);
7c673cae
FG
790 {
791 // make MDSRank use v6 format happy
792 int64_t i = -1;
793 bool b = false;
11fdf7f2
TL
794 encode(i, bl);
795 encode(b, bl);
7c673cae 796 }
11fdf7f2 797 encode(client_flushes, bl);
7c673cae
FG
798 ENCODE_FINISH(bl);
799}
11fdf7f2 800void EMetaBlob::decode(bufferlist::const_iterator &bl)
7c673cae 801{
9f95a23c 802 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
11fdf7f2
TL
803 decode(lump_order, bl);
804 decode(lump_map, bl);
7c673cae 805 if (struct_v >= 4) {
11fdf7f2 806 decode(roots, bl);
7c673cae
FG
807 } else {
808 bufferlist rootbl;
11fdf7f2 809 decode(rootbl, bl);
7c673cae 810 if (rootbl.length()) {
11fdf7f2
TL
811 auto p = rootbl.cbegin();
812 roots.emplace_back(p);
7c673cae
FG
813 }
814 }
11fdf7f2
TL
815 decode(table_tids, bl);
816 decode(opened_ino, bl);
817 decode(allocated_ino, bl);
818 decode(used_preallocated_ino, bl);
819 decode(preallocated_inos, bl);
820 decode(client_name, bl);
821 decode(inotablev, bl);
822 decode(sessionmapv, bl);
823 decode(truncate_start, bl);
824 decode(truncate_finish, bl);
825 decode(destroyed_inodes, bl);
7c673cae 826 if (struct_v >= 2) {
11fdf7f2 827 decode(client_reqs, bl);
7c673cae
FG
828 } else {
829 list<metareqid_t> r;
11fdf7f2 830 decode(r, bl);
7c673cae
FG
831 while (!r.empty()) {
832 client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0));
833 r.pop_front();
834 }
835 }
836 if (struct_v >= 3) {
11fdf7f2
TL
837 decode(renamed_dirino, bl);
838 decode(renamed_dir_frags, bl);
7c673cae
FG
839 }
840 if (struct_v >= 6) {
841 // ignore
842 int64_t i;
843 bool b;
11fdf7f2
TL
844 decode(i, bl);
845 decode(b, bl);
7c673cae
FG
846 }
847 if (struct_v >= 8) {
11fdf7f2 848 decode(client_flushes, bl);
7c673cae
FG
849 }
850 DECODE_FINISH(bl);
851}
852
853
854/**
855 * Get all inodes touched by this metablob. Includes the 'bits' within
856 * dirlumps, and the inodes of the dirs themselves.
857 */
858void EMetaBlob::get_inodes(
859 std::set<inodeno_t> &inodes) const
860{
861 // For all dirlumps in this metablob
862 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
863 // Record inode of dirlump
864 inodeno_t const dir_ino = i->first.ino;
865 inodes.insert(dir_ino);
866
867 // Decode dirlump bits
868 dirlump const &dl = i->second;
869 dl._decode_bits();
870
871 // Record inodes of fullbits
11fdf7f2
TL
872 for (const auto& iter : dl.get_dfull()) {
873 inodes.insert(iter.inode.ino);
7c673cae
FG
874 }
875
876 // Record inodes of remotebits
11fdf7f2
TL
877 for (const auto& iter : dl.get_dremote()) {
878 inodes.insert(iter.ino);
7c673cae
FG
879 }
880 }
881}
882
883
884/**
885 * Get a map of dirfrag to set of dentries in that dirfrag which are
886 * touched in this operation.
887 */
888void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const
889{
890 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
891 dirlump const &dl = i->second;
892 dirfrag_t const &df = i->first;
893
894 // Get all bits
895 dl._decode_bits();
7c673cae
FG
896
897 // For all bits, store dentry
11fdf7f2
TL
898 for (const auto& iter : dl.get_dfull()) {
899 dentries[df].insert(iter.dn);
7c673cae 900 }
11fdf7f2
TL
901 for (const auto& iter : dl.get_dremote()) {
902 dentries[df].insert(iter.dn);
7c673cae 903 }
11fdf7f2
TL
904 for (const auto& iter : dl.get_dnull()) {
905 dentries[df].insert(iter.dn);
7c673cae
FG
906 }
907 }
908}
909
910
911
912/**
913 * Calculate all paths that we can infer are touched by this metablob. Only uses
914 * information local to this metablob so it may only be the path within the
915 * subtree.
916 */
917void EMetaBlob::get_paths(
918 std::vector<std::string> &paths) const
919{
920 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
921 typedef std::pair<inodeno_t, std::string> Location;
922
923 // Whenever we see a dentry within a dirlump, we remember it as a child of
924 // the dirlump's inode
9f95a23c 925 std::map<inodeno_t, std::vector<std::string> > children;
7c673cae
FG
926
927 // Whenever we see a location for an inode, remember it: this allows us to
928 // build a path given an inode
929 std::map<inodeno_t, Location> ino_locations;
930
931 // Special case: operations on root inode populate roots but not dirlumps
932 if (lump_map.empty() && !roots.empty()) {
933 paths.push_back("/");
934 return;
935 }
936
937 // First pass
938 // ==========
939 // Build a tiny local metadata cache for the path structure in this metablob
940 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
941 inodeno_t const dir_ino = i->first.ino;
942 dirlump const &dl = i->second;
943 dl._decode_bits();
944
11fdf7f2
TL
945 for (const auto& iter : dl.get_dfull()) {
946 std::string_view dentry = iter.dn;
94b18763 947 children[dir_ino].emplace_back(dentry);
11fdf7f2 948 ino_locations[iter.inode.ino] = Location(dir_ino, dentry);
7c673cae
FG
949 }
950
11fdf7f2
TL
951 for (const auto& iter : dl.get_dremote()) {
952 std::string_view dentry = iter.dn;
94b18763 953 children[dir_ino].emplace_back(dentry);
7c673cae
FG
954 }
955
11fdf7f2
TL
956 for (const auto& iter : dl.get_dnull()) {
957 std::string_view dentry = iter.dn;
94b18763 958 children[dir_ino].emplace_back(dentry);
7c673cae
FG
959 }
960 }
961
962 std::vector<Location> leaf_locations;
963
964 // Second pass
965 // ===========
966 // Output paths for all childless nodes in the metablob
967 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
968 inodeno_t const dir_ino = i->first.ino;
969 dirlump const &dl = i->second;
970 dl._decode_bits();
971
11fdf7f2
TL
972 for (const auto& iter : dl.get_dfull()) {
973 std::string_view dentry = iter.dn;
974 if (children.find(iter.inode.ino) == children.end()) {
975 leaf_locations.push_back(Location(dir_ino, dentry));
7c673cae
FG
976 }
977 }
978
11fdf7f2
TL
979 for (const auto& iter : dl.get_dremote()) {
980 std::string_view dentry = iter.dn;
981 leaf_locations.push_back(Location(dir_ino, dentry));
7c673cae
FG
982 }
983
11fdf7f2
TL
984 for (const auto& iter : dl.get_dnull()) {
985 std::string_view dentry = iter.dn;
986 leaf_locations.push_back(Location(dir_ino, dentry));
7c673cae
FG
987 }
988 }
989
990 // For all the leaf locations identified, generate paths
991 for (std::vector<Location>::iterator i = leaf_locations.begin(); i != leaf_locations.end(); ++i) {
992 Location const &loc = *i;
993 std::string path = loc.second;
994 inodeno_t ino = loc.first;
11fdf7f2
TL
995 std::map<inodeno_t, Location>::iterator iter = ino_locations.find(ino);
996 while(iter != ino_locations.end()) {
997 Location const &loc = iter->second;
7c673cae
FG
998 if (!path.empty()) {
999 path = loc.second + "/" + path;
1000 } else {
1001 path = loc.second + path;
1002 }
11fdf7f2 1003 iter = ino_locations.find(loc.first);
7c673cae
FG
1004 }
1005
1006 paths.push_back(path);
1007 }
1008}
1009
1010
1011void EMetaBlob::dump(Formatter *f) const
1012{
1013 f->open_array_section("lumps");
11fdf7f2 1014 for (const auto& d : lump_order) {
7c673cae
FG
1015 f->open_object_section("lump");
1016 f->open_object_section("dirfrag");
11fdf7f2 1017 f->dump_stream("dirfrag") << d;
7c673cae
FG
1018 f->close_section(); // dirfrag
1019 f->open_object_section("dirlump");
11fdf7f2 1020 lump_map.at(d).dump(f);
7c673cae
FG
1021 f->close_section(); // dirlump
1022 f->close_section(); // lump
1023 }
1024 f->close_section(); // lumps
1025
1026 f->open_array_section("roots");
11fdf7f2 1027 for (const auto& iter : roots) {
7c673cae 1028 f->open_object_section("root");
11fdf7f2 1029 iter.dump(f);
7c673cae
FG
1030 f->close_section(); // root
1031 }
1032 f->close_section(); // roots
1033
1034 f->open_array_section("tableclient tranactions");
11fdf7f2 1035 for (const auto& p : table_tids) {
7c673cae 1036 f->open_object_section("transaction");
11fdf7f2
TL
1037 f->dump_int("tid", p.first);
1038 f->dump_int("version", p.second);
7c673cae
FG
1039 f->close_section(); // transaction
1040 }
1041 f->close_section(); // tableclient transactions
1042
1043 f->dump_int("renamed directory inodeno", renamed_dirino);
1044
1045 f->open_array_section("renamed directory fragments");
11fdf7f2
TL
1046 for (const auto& p : renamed_dir_frags) {
1047 f->dump_int("frag", p);
7c673cae
FG
1048 }
1049 f->close_section(); // renamed directory fragments
1050
1051 f->dump_int("inotable version", inotablev);
1052 f->dump_int("SessionMap version", sessionmapv);
1053 f->dump_int("allocated ino", allocated_ino);
1054
1055 f->dump_stream("preallocated inos") << preallocated_inos;
1056 f->dump_int("used preallocated ino", used_preallocated_ino);
1057
1058 f->open_object_section("client name");
1059 client_name.dump(f);
1060 f->close_section(); // client name
1061
1062 f->open_array_section("inodes starting a truncate");
11fdf7f2
TL
1063 for(const auto& ino : truncate_start) {
1064 f->dump_int("inodeno", ino);
7c673cae
FG
1065 }
1066 f->close_section(); // truncate inodes
1067 f->open_array_section("inodes finishing a truncated");
11fdf7f2 1068 for(const auto& p : truncate_finish) {
7c673cae 1069 f->open_object_section("inode+segment");
11fdf7f2
TL
1070 f->dump_int("inodeno", p.first);
1071 f->dump_int("truncate starting segment", p.second);
7c673cae
FG
1072 f->close_section(); // truncated inode
1073 }
1074 f->close_section(); // truncate finish inodes
1075
1076 f->open_array_section("destroyed inodes");
1077 for(vector<inodeno_t>::const_iterator i = destroyed_inodes.begin();
1078 i != destroyed_inodes.end(); ++i) {
1079 f->dump_int("inodeno", *i);
1080 }
1081 f->close_section(); // destroyed inodes
1082
1083 f->open_array_section("client requests");
11fdf7f2 1084 for(const auto& p : client_reqs) {
7c673cae 1085 f->open_object_section("Client request");
11fdf7f2
TL
1086 f->dump_stream("request ID") << p.first;
1087 f->dump_int("oldest request on client", p.second);
7c673cae
FG
1088 f->close_section(); // request
1089 }
1090 f->close_section(); // client requests
1091}
1092
9f95a23c 1093void EMetaBlob::generate_test_instances(std::list<EMetaBlob*>& ls)
7c673cae
FG
1094{
1095 ls.push_back(new EMetaBlob());
1096}
1097
// Apply this journaled metablob to the in-memory MDS state.
//
// In order, this: updates/creates the root inodes; walks lump_order and, per
// dirfrag, restores the fnode and state flags and then its full, remote and
// null dentries; fixes up subtrees after a directory rename (renamed_dirino);
// disposes of inodes that were unlinked and not relinked; and finally replays
// table-client transactions, the opened ino, inotable and sessionmap
// versions, truncates, destroyed inodes, and completed client
// requests/flushes before calling update_segment(logseg).
//
// mds     - rank whose cache, tables and session map are modified.
// logseg  - segment that dirtied items are attached to; must be non-null.
// slaveup - may be null.  When set, unlinked inodes and old dirs are recorded
//           in it ("until slave commit") instead of being trimmed/removed.
1098void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
1099{
1100 dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
1101
11fdf7f2 1102 ceph_assert(logseg);
7c673cae 1103
// Asserts if the mds_kill_journal_replay_at option equals 1; matching checks
// for 2, 3 and 4 appear further down -- presumably fault-injection points for
// testing (confirm against the option's documentation).
11fdf7f2 1104 ceph_assert(g_conf()->mds_kill_journal_replay_at != 1);
7c673cae 1105
// Root inodes: look each one up in the cache, create it if it is missing,
// then apply the journaled inode state.
11fdf7f2
TL
1106 for (auto& p : roots) {
1107 CInode *in = mds->mdcache->get_inode(p.inode.ino);
7c673cae
FG
1108 bool isnew = in ? false:true;
1109 if (!in)
11fdf7f2
TL
1110 in = new CInode(mds->mdcache, false, 2, CEPH_NOSNAP);
1111 p.update_inode(mds, in);
7c673cae
FG
1112
1113 if (isnew)
1114 mds->mdcache->add_inode(in);
11fdf7f2 1115 if (p.is_dirty()) in->_mark_dirty(logseg);
7c673cae
FG
1116 dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
1117 }
1118
1119 CInode *renamed_diri = 0;
1120 CDir *olddir = 0;
1121 if (renamed_dirino) {
1122 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1123 if (renamed_diri)
1124 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl;
1125 else
1126 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl;
1127
// A directory-rename blob is expected to carry at most one null dentry
// across all of its lumps.
1128 int nnull = 0;
11fdf7f2
TL
1129 for (const auto& lp : lump_order) {
1130 dirlump &lump = lump_map[lp];
7c673cae 1131 if (lump.nnull) {
11fdf7f2 1132 dout(10) << "EMetaBlob.replay found null dentry in dir " << lp << dendl;
7c673cae
FG
1133 nnull += lump.nnull;
1134 }
1135 }
11fdf7f2 1136 ceph_assert(nnull <= 1);
7c673cae
FG
1137 }
1138
1139 // keep track of any inodes we unlink and don't relink elsewhere
1140 map<CInode*, CDir*> unlinked;
1141 set<CInode*> linked;
1142
1143 // walk through my dirs (in order!)
// 'count' tallies processed items across all loops below; every 1000th item
// triggers mds->heartbeat_reset() (presumably so that a long replay is not
// mistaken for a hung daemon).
f6b5b4d7 1144 int count = 0;
11fdf7f2
TL
1145 for (const auto& lp : lump_order) {
1146 dout(10) << "EMetaBlob.replay dir " << lp << dendl;
1147 dirlump &lump = lump_map[lp];
7c673cae
FG
1148
1149 // the dir
11fdf7f2 1150 CDir *dir = mds->mdcache->get_force_dirfrag(lp, true);
7c673cae
FG
1151 if (!dir) {
1152 // hmm. do i have the inode?
11fdf7f2 1153 CInode *diri = mds->mdcache->get_inode((lp).ino);
7c673cae 1154 if (!diri) {
11fdf7f2
TL
1155 if (MDS_INO_IS_MDSDIR(lp.ino)) {
1156 ceph_assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp.ino);
1157 diri = mds->mdcache->create_system_inode(lp.ino, S_IFDIR|0755);
7c673cae
FG
1158 diri->state_clear(CInode::STATE_AUTH);
1159 dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
1160 } else {
11fdf7f2 1161 dout(0) << "EMetaBlob.replay missing dir ino " << lp.ino << dendl;
7c673cae
FG
1162 mds->clog->error() << "failure replaying journal (EMetaBlob)";
1163 mds->damaged();
1164 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1165 }
1166 }
1167
1168 // create the dirfrag
11fdf7f2 1169 dir = diri->get_or_open_dirfrag(mds->mdcache, lp.frag);
7c673cae 1170
11fdf7f2 1171 if (MDS_INO_IS_BASE(lp.ino))
7c673cae
FG
1172 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
1173
1174 dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;
1175 }
1176 dir->set_version( lump.fnode.version );
1177 dir->fnode = lump.fnode;
1178
1179 if (lump.is_importing()) {
1180 dir->state_set(CDir::STATE_AUTH);
1181 dir->state_clear(CDir::STATE_COMPLETE);
1182 }
1183 if (lump.is_dirty()) {
1184 dir->_mark_dirty(logseg);
1185
// rstat/fragstat differing from their "accounted" copies means the parent
// inode's nestlock/filelock is marked updated and queued on this segment.
1186 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
1187 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir << dendl;
1188 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
1189 logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
1190 } else {
1191 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir << dendl;
1192 }
1193 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
1194 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir << dendl;
1195 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
1196 logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
1197 } else {
1198 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir << dendl;
1199 }
1200 }
1201 if (lump.is_dirty_dft()) {
1202 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir << dendl;
1203 dir->state_set(CDir::STATE_DIRTYDFT);
1204 mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock);
1205 logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree);
1206 }
1207 if (lump.is_new())
1208 dir->mark_new(logseg);
1209 if (lump.is_complete())
1210 dir->mark_complete();
1211
1212 dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;
1213
1214 // decode bits
1215 lump._decode_bits();
1216
1217 // full dentry+inode pairs
11fdf7f2
TL
1218 for (auto& fb : lump._get_dfull()) {
1219 CDentry *dn = dir->lookup_exact_snap(fb.dn, fb.dnlast);
7c673cae 1220 if (!dn) {
11fdf7f2
TL
1221 dn = dir->add_null_dentry(fb.dn, fb.dnfirst, fb.dnlast);
1222 dn->set_version(fb.dnv);
1223 if (fb.is_dirty()) dn->_mark_dirty(logseg);
7c673cae
FG
1224 dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl;
1225 } else {
11fdf7f2
TL
1226 dn->set_version(fb.dnv);
1227 if (fb.is_dirty()) dn->_mark_dirty(logseg);
1228 dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *dn << dendl;
1229 dn->first = fb.dnfirst;
1230 ceph_assert(dn->last == fb.dnlast);
7c673cae
FG
1231 }
1232 if (lump.is_importing())
1233 dn->state_set(CDentry::STATE_AUTH);
1234
11fdf7f2 1235 CInode *in = mds->mdcache->get_inode(fb.inode.ino, fb.dnlast);
7c673cae 1236 if (!in) {
// Inode not cached: create it, evict whatever the dentry currently points
// at (recording a displaced primary inode in 'unlinked'), then link it.
11fdf7f2
TL
1237 in = new CInode(mds->mdcache, dn->is_auth(), fb.dnfirst, fb.dnlast);
1238 fb.update_inode(mds, in);
7c673cae
FG
1239 mds->mdcache->add_inode(in);
1240 if (!dn->get_linkage()->is_null()) {
1241 if (dn->get_linkage()->is_primary()) {
1242 unlinked[dn->get_linkage()->get_inode()] = dir;
1243 stringstream ss;
1244 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
11fdf7f2 1245 << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino;
7c673cae
FG
1246 dout(0) << ss.str() << dendl;
1247 mds->clog->warn(ss);
1248 }
31f18b77 1249 dir->unlink_inode(dn, false);
7c673cae
FG
1250 }
1251 if (unlinked.count(in))
1252 linked.insert(in);
1253 dir->link_primary_inode(dn, in);
1254 dout(10) << "EMetaBlob.replay added " << *in << dendl;
1255 } else {
// Inode already cached: refresh it, detach it from a different parent dentry
// if it has one, and relink it under 'dn' when 'dn' does not already hold it.
11fdf7f2
TL
1256 in->first = fb.dnfirst;
1257 fb.update_inode(mds, in);
7c673cae
FG
1258 if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
1259 dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
1260 unlinked[in] = in->get_parent_dir();
7c673cae 1261 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
7c673cae
FG
1262 }
1263 if (dn->get_linkage()->get_inode() != in) {
1264 if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1265 if (dn->get_linkage()->is_primary()) {
1266 unlinked[dn->get_linkage()->get_inode()] = dir;
1267 stringstream ss;
1268 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
11fdf7f2 1269 << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino;
7c673cae
FG
1270 dout(0) << ss.str() << dendl;
1271 mds->clog->warn(ss);
1272 }
31f18b77 1273 dir->unlink_inode(dn, false);
7c673cae
FG
1274 }
1275 if (unlinked.count(in))
1276 linked.insert(in);
1277 dir->link_primary_inode(dn, in);
1278 dout(10) << "EMetaBlob.replay linked " << *in << dendl;
1279 } else {
11fdf7f2 1280 dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *in << dendl;
7c673cae 1281 }
11fdf7f2
TL
1282 ceph_assert(in->first == fb.dnfirst ||
1283 (in->is_multiversion() && in->first > fb.dnfirst));
7c673cae 1284 }
11fdf7f2 1285 if (fb.is_dirty())
7c673cae 1286 in->_mark_dirty(logseg);
11fdf7f2
TL
1287 if (fb.is_dirty_parent())
1288 in->mark_dirty_parent(logseg, fb.is_dirty_pool());
1289 if (fb.need_snapflush())
7c673cae
FG
1290 logseg->open_files.push_back(&in->item_open_file);
1291 if (dn->is_auth())
1292 in->state_set(CInode::STATE_AUTH);
1293 else
1294 in->state_clear(CInode::STATE_AUTH);
11fdf7f2 1295 ceph_assert(g_conf()->mds_kill_journal_replay_at != 2);
f6b5b4d7
TL
1296
1297 if (!(++count % 1000))
1298 mds->heartbeat_reset();
7c673cae
FG
1299 }
1300
1301 // remote dentries
11fdf7f2
TL
1302 for (const auto& rb : lump.get_dremote()) {
1303 CDentry *dn = dir->lookup_exact_snap(rb.dn, rb.dnlast);
7c673cae 1304 if (!dn) {
11fdf7f2
TL
1305 dn = dir->add_remote_dentry(rb.dn, rb.ino, rb.d_type, rb.dnfirst, rb.dnlast);
1306 dn->set_version(rb.dnv);
1307 if (rb.dirty) dn->_mark_dirty(logseg);
7c673cae
FG
1308 dout(10) << "EMetaBlob.replay added " << *dn << dendl;
1309 } else {
1310 if (!dn->get_linkage()->is_null()) {
1311 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1312 if (dn->get_linkage()->is_primary()) {
1313 unlinked[dn->get_linkage()->get_inode()] = dir;
// NOTE(review): unlike the two full-dentry cases above, this message is only
// written via dout(0); mds->clog->warn(ss) is not called here -- confirm
// whether that difference is intentional.
1314 stringstream ss;
1315 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
11fdf7f2 1316 << " " << *dn->get_linkage()->get_inode() << " should be remote " << rb.ino;
7c673cae
FG
1317 dout(0) << ss.str() << dendl;
1318 }
31f18b77 1319 dir->unlink_inode(dn, false);
7c673cae 1320 }
11fdf7f2
TL
1321 dir->link_remote_inode(dn, rb.ino, rb.d_type);
1322 dn->set_version(rb.dnv);
1323 if (rb.dirty) dn->_mark_dirty(logseg);
1324 dout(10) << "EMetaBlob.replay for [" << rb.dnfirst << "," << rb.dnlast << "] had " << *dn << dendl;
1325 dn->first = rb.dnfirst;
1326 ceph_assert(dn->last == rb.dnlast);
7c673cae
FG
1327 }
1328 if (lump.is_importing())
1329 dn->state_set(CDentry::STATE_AUTH);
f6b5b4d7
TL
1330
1331 if (!(++count % 1000))
1332 mds->heartbeat_reset();
7c673cae
FG
1333 }
1334
1335 // null dentries
11fdf7f2
TL
1336 for (const auto& nb : lump.get_dnull()) {
1337 CDentry *dn = dir->lookup_exact_snap(nb.dn, nb.dnlast);
7c673cae 1338 if (!dn) {
11fdf7f2
TL
1339 dn = dir->add_null_dentry(nb.dn, nb.dnfirst, nb.dnlast);
1340 dn->set_version(nb.dnv);
1341 if (nb.dirty) dn->_mark_dirty(logseg);
7c673cae
FG
1342 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl;
1343 } else {
11fdf7f2 1344 dn->first = nb.dnfirst;
7c673cae
FG
1345 if (!dn->get_linkage()->is_null()) {
1346 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1347 CInode *in = dn->get_linkage()->get_inode();
1348 // For renamed inode, We may call CInode::force_dirfrag() later.
1349 // CInode::force_dirfrag() doesn't work well when inode is detached
1350 // from the hierarchy.
1351 if (!renamed_diri || renamed_diri != in) {
1352 if (dn->get_linkage()->is_primary())
1353 unlinked[in] = dir;
1354 dir->unlink_inode(dn);
7c673cae
FG
1355 }
1356 }
11fdf7f2
TL
1357 dn->set_version(nb.dnv);
1358 if (nb.dirty) dn->_mark_dirty(logseg);
7c673cae 1359 dout(10) << "EMetaBlob.replay had " << *dn << dendl;
11fdf7f2 1360 ceph_assert(dn->last == nb.dnlast);
7c673cae
FG
1361 }
// Remember the dir holding the most recent null dentry.  It remains the
// rename source dir below unless renamed_diri was already cached, in which
// case olddir is overwritten from 'unlinked'.
1362 olddir = dir;
1363 if (lump.is_importing())
1364 dn->state_set(CDentry::STATE_AUTH);
1365
1366 // Make null dentries the first things we trim
// NOTE(review): only the log line below is emitted; no LRU call is made in
// this block.
1367 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl;
f6b5b4d7
TL
1368
1369 if (!(++count % 1000))
1370 mds->heartbeat_reset();
7c673cae
FG
1371 }
1372 }
1373
11fdf7f2 1374 ceph_assert(g_conf()->mds_kill_journal_replay_at != 3);
7c673cae
FG
1375
// Directory rename fix-up: adjust subtree bounds for the renamed directory
// and open any import-bound dirfrags listed in renamed_dir_frags.
1376 if (renamed_dirino) {
1377 if (renamed_diri) {
11fdf7f2
TL
1378 ceph_assert(unlinked.count(renamed_diri));
1379 ceph_assert(linked.count(renamed_diri));
7c673cae
FG
1380 olddir = unlinked[renamed_diri];
1381 } else {
1382 // we imported a diri we haven't seen before
1383 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
11fdf7f2 1384 ceph_assert(renamed_diri); // it was in the metablob
7c673cae
FG
1385 }
1386
1387 if (olddir) {
1388 if (olddir->authority() != CDIR_AUTH_UNDEF &&
1389 renamed_diri->authority() == CDIR_AUTH_UNDEF) {
11fdf7f2
TL
1390 ceph_assert(slaveup); // auth to non-auth, must be slave prepare
1391 frag_vec_t leaves;
7c673cae 1392 renamed_diri->dirfragtree.get_leaves(leaves);
11fdf7f2
TL
1393 for (const auto& leaf : leaves) {
1394 CDir *dir = renamed_diri->get_dirfrag(leaf);
1395 ceph_assert(dir);
7c673cae
FG
1396 if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
1397 // preserve subtree bound until slave commit
1398 slaveup->olddirs.insert(dir->inode);
1399 else
1400 dir->state_set(CDir::STATE_AUTH);
f6b5b4d7
TL
1401
1402 if (!(++count % 1000))
1403 mds->heartbeat_reset();
7c673cae
FG
1404 }
1405 }
1406
1407 mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
1408
1409 // see if we can discard the subtree we renamed out of
1410 CDir *root = mds->mdcache->get_subtree_root(olddir);
1411 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
1412 if (slaveup) // preserve the old dir until slave commit
1413 slaveup->olddirs.insert(olddir->inode);
1414 else
1415 mds->mdcache->try_trim_non_auth_subtree(root);
1416 }
1417 }
1418
1419 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1420 if (renamed_diri->authority() != CDIR_AUTH_UNDEF) {
11fdf7f2
TL
1421 for (const auto& p : renamed_dir_frags) {
1422 CDir *dir = renamed_diri->get_dirfrag(p);
7c673cae
FG
1423 if (dir) {
1424 // we already had the inode before, and we already adjusted this subtree accordingly.
1425 dout(10) << " already had+adjusted rename import bound " << *dir << dendl;
11fdf7f2 1426 ceph_assert(olddir);
7c673cae
FG
1427 continue;
1428 }
11fdf7f2 1429 dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, p);
7c673cae
FG
1430 dout(10) << " creating new rename import bound " << *dir << dendl;
1431 dir->state_clear(CDir::STATE_AUTH);
224ce89b 1432 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
f6b5b4d7
TL
1433
1434 if (!(++count % 1000))
1435 mds->heartbeat_reset();
7c673cae
FG
1436 }
1437 }
1438
1439 // rename may overwrite an empty directory and move it into stray dir.
1440 unlinked.erase(renamed_diri);
1441 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1442 if (!linked.count(p->first))
1443 continue;
11fdf7f2 1444 ceph_assert(p->first->is_dir());
7c673cae 1445 mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);
f6b5b4d7
TL
1446
1447 if (!(++count % 1000))
1448 mds->heartbeat_reset();
7c673cae
FG
1449 }
1450 }
1451
// Anything unlinked above and never relinked is either parked on the slave
// update (when one is supplied) or removed from the cache recursively.
1452 if (!unlinked.empty()) {
1453 for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p)
1454 unlinked.erase(*p);
1455 dout(10) << " unlinked set contains " << unlinked << dendl;
1456 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
11fdf7f2
TL
1457 CInode *in = p->first;
1458 if (slaveup) { // preserve unlinked inodes until slave commit
1459 slaveup->unlinked.insert(in);
1460 if (in->snaprealm)
1461 in->snaprealm->adjust_parent();
1462 } else
1463 mds->mdcache->remove_inode_recursive(in);
f6b5b4d7
TL
1464
1465 if (!(++count % 1000))
1466 mds->heartbeat_reset();
7c673cae
FG
1467 }
1468 }
1469
1470 // table client transactions
11fdf7f2
TL
1471 for (const auto& p : table_tids) {
1472 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p.first)
1473 << " transaction " << p.second << dendl;
1474 MDSTableClient *client = mds->get_table_client(p.first);
7c673cae 1475 if (client)
11fdf7f2 1476 client->got_journaled_agree(p.second, logseg);
f6b5b4d7
TL
1477
1478 if (!(++count % 1000))
1479 mds->heartbeat_reset();
7c673cae
FG
1480 }
1481
1482 // opened ino?
1483 if (opened_ino) {
1484 CInode *in = mds->mdcache->get_inode(opened_ino);
11fdf7f2 1485 ceph_assert(in);
7c673cae
FG
1486 dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl;
1487 logseg->open_files.push_back(&in->item_open_file);
1488 }
1489
1490 // allocated_inos
// Only applied when the in-memory inotable is older than the journaled
// version; afterwards the two versions must match (forced if necessary).
1491 if (inotablev) {
1492 if (mds->inotable->get_version() >= inotablev) {
1493 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1494 << " <= table " << mds->inotable->get_version() << dendl;
1495 } else {
1496 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1497 << " - 1 == table " << mds->inotable->get_version()
1498 << " allocated+used " << allocated_ino
1499 << " prealloc " << preallocated_inos
1500 << dendl;
1501 if (allocated_ino)
1502 mds->inotable->replay_alloc_id(allocated_ino);
1503 if (preallocated_inos.size())
1504 mds->inotable->replay_alloc_ids(preallocated_inos);
1505
1506 // [repair bad inotable updates]
1507 if (inotablev > mds->inotable->get_version()) {
1508 mds->clog->error() << "journal replay inotablev mismatch "
1509 << mds->inotable->get_version() << " -> " << inotablev;
1510 mds->inotable->force_replay_version(inotablev);
1511 }
1512
11fdf7f2 1513 ceph_assert(inotablev == mds->inotable->get_version());
7c673cae
FG
1514 }
1515 }
1516 if (sessionmapv) {
// Expected sessionmap version step for this event: 2 when it both consumed a
// preallocated ino and added new preallocated inos (two session updates
// below), otherwise 1.
81eedcae 1517 unsigned diff = (used_preallocated_ino && !preallocated_inos.empty()) ? 2 : 1;
7c673cae
FG
1518 if (mds->sessionmap.get_version() >= sessionmapv) {
1519 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1520 << " <= table " << mds->sessionmap.get_version() << dendl;
81eedcae 1521 } else if (mds->sessionmap.get_version() + diff == sessionmapv) {
7c673cae 1522 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
81eedcae 1523 << " - " << diff << " == table " << mds->sessionmap.get_version()
7c673cae
FG
1524 << " prealloc " << preallocated_inos
1525 << " used " << used_preallocated_ino
1526 << dendl;
1527 Session *session = mds->sessionmap.get_session(client_name);
1528 if (session) {
1529 dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl;
1530 if (used_preallocated_ino) {
1531 if (!session->info.prealloc_inos.empty()) {
7c673cae 1532 inodeno_t i = session->take_ino(used_preallocated_ino);
11fdf7f2 1533 ceph_assert(i == used_preallocated_ino);
7c673cae
FG
1534 session->info.used_inos.clear();
1535 }
1536 mds->sessionmap.replay_dirty_session(session);
1537 }
1538 if (!preallocated_inos.empty()) {
1539 session->info.prealloc_inos.insert(preallocated_inos);
1540 mds->sessionmap.replay_dirty_session(session);
1541 }
1542
1543 } else {
// No session to update: advance the version once per update that would
// otherwise have dirtied the session, so the assert below still holds.
1544 dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
81eedcae 1545 if (used_preallocated_ino)
7c673cae 1546 mds->sessionmap.replay_advance_version();
81eedcae 1547
7c673cae
FG
1548 if (!preallocated_inos.empty())
1549 mds->sessionmap.replay_advance_version();
1550 }
11fdf7f2 1551 ceph_assert(sessionmapv == mds->sessionmap.get_version());
7c673cae 1552 } else {
// Version gap: report it, and continue only if mds_wipe_sessions is set, in
// which case the session map is wiped and its version forced.
81eedcae
TL
1553 mds->clog->error() << "EMetaBlob.replay sessionmap v " << sessionmapv
1554 << " - " << diff << " > table " << mds->sessionmap.get_version();
11fdf7f2 1555 ceph_assert(g_conf()->mds_wipe_sessions);
7c673cae
FG
1556 mds->sessionmap.wipe();
1557 mds->sessionmap.set_version(sessionmapv);
1558 }
1559 }
1560
1561 // truncating inodes
11fdf7f2
TL
1562 for (const auto& ino : truncate_start) {
1563 CInode *in = mds->mdcache->get_inode(ino);
1564 ceph_assert(in);
7c673cae 1565 mds->mdcache->add_recovered_truncate(in, logseg);
f6b5b4d7
TL
1566
1567 if (!(++count % 1000))
1568 mds->heartbeat_reset();
7c673cae 1569 }
11fdf7f2
TL
1570 for (const auto& p : truncate_finish) {
// A finish record is skipped when the segment it names is no longer known
// to the MDLog.
1571 LogSegment *ls = mds->mdlog->get_segment(p.second);
7c673cae 1572 if (ls) {
11fdf7f2
TL
1573 CInode *in = mds->mdcache->get_inode(p.first);
1574 ceph_assert(in);
7c673cae
FG
1575 mds->mdcache->remove_recovered_truncate(in, ls);
1576 }
f6b5b4d7
TL
1577
1578 if (!(++count % 1000))
1579 mds->heartbeat_reset();
7c673cae
FG
1580 }
1581
1582 // destroyed inodes
11fdf7f2
TL
1583 if (!destroyed_inodes.empty()) {
1584 for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
1585 p != destroyed_inodes.end();
1586 ++p) {
1587 CInode *in = mds->mdcache->get_inode(*p);
1588 if (in) {
1589 dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
// Fetch the parent dentry before remove_inode(); it is checked afterwards
// to be null-linked.
1590 CDentry *parent = in->get_parent_dn();
1591 mds->mdcache->remove_inode(in);
1592 if (parent) {
1593 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
1594 ceph_assert(parent->get_linkage()->is_null());
1595 }
1596 } else {
1597 dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
7c673cae 1598 }
f6b5b4d7
TL
1599
1600 if (!(++count % 1000))
1601 mds->heartbeat_reset();
7c673cae 1602 }
11fdf7f2 1603 mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes);
7c673cae
FG
1604 }
1605
1606 // client requests
11fdf7f2
TL
1607 for (const auto& p : client_reqs) {
1608 if (p.first.name.is_client()) {
1609 dout(10) << "EMetaBlob.replay request " << p.first << " trim_to " << p.second << dendl;
7c673cae
FG
1610 inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino;
1611 // if we allocated an inode, there should be exactly one client request id.
11fdf7f2 1612 ceph_assert(created == inodeno_t() || client_reqs.size() == 1);
7c673cae 1613
11fdf7f2 1614 Session *session = mds->sessionmap.get_session(p.first.name);
7c673cae 1615 if (session) {
11fdf7f2
TL
1616 session->add_completed_request(p.first.tid, created);
1617 if (p.second)
1618 session->trim_completed_requests(p.second);
7c673cae
FG
1619 }
1620 }
f6b5b4d7
TL
1621
1622 if (!(++count % 1000))
1623 mds->heartbeat_reset();
7c673cae
FG
1624 }
1625
1626 // client flushes
11fdf7f2
TL
1627 for (const auto& p : client_flushes) {
1628 if (p.first.name.is_client()) {
1629 dout(10) << "EMetaBlob.replay flush " << p.first << " trim_to " << p.second << dendl;
1630 Session *session = mds->sessionmap.get_session(p.first.name);
7c673cae 1631 if (session) {
11fdf7f2
TL
1632 session->add_completed_flush(p.first.tid);
1633 if (p.second)
1634 session->trim_completed_flushes(p.second);
7c673cae
FG
1635 }
1636 }
f6b5b4d7
TL
1637
1638 if (!(++count % 1000))
1639 mds->heartbeat_reset();
7c673cae
FG
1640 }
1641
1642 // update segment
1643 update_segment(logseg);
1644
11fdf7f2 1645 ceph_assert(g_conf()->mds_kill_journal_replay_at != 4);
7c673cae
FG
1646}
1647
9f95a23c
TL
1648// -----------------------
1649// EPurged
1650void EPurged::update_segment()
1651{
1652 if (inos.size() && inotablev)
1653 get_segment()->inotablev = inotablev;
1654 return;
1655}
1656
1657void EPurged::replay(MDSRank *mds)
1658{
1659 if (inos.size()) {
1660 LogSegment *ls = mds->mdlog->get_segment(seq);
1661 if (ls) {
1662 ls->purge_inodes.subtract(inos);
1663 }
1664 if (mds->inotable->get_version() >= inotablev) {
1665 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1666 << " >= " << inotablev << ", noop" << dendl;
1667 } else {
1668 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1669 << " < " << inotablev << " " << dendl;
1670 mds->inotable->replay_release_ids(inos);
1671 assert(mds->inotable->get_version() == inotablev);
1672 }
1673 }
1674 update_segment();
1675}
1676
// Serialize this event into 'bl': inos, inotablev, seq, in that order,
// wrapped by ENCODE_START(1, 1, ...)/ENCODE_FINISH.  The order must match
// EPurged::decode.  'features' is not referenced in this function.
1677void EPurged::encode(bufferlist& bl, uint64_t features) const
1678{
1679 ENCODE_START(1, 1, bl);
1680 encode(inos, bl);
1681 encode(inotablev, bl);
1682 encode(seq, bl);
1683 ENCODE_FINISH(bl);
1684}
1685
// Deserialize this event from 'bl': inos, inotablev, seq, in the same order
// EPurged::encode writes them, wrapped by DECODE_START(1, ...)/DECODE_FINISH.
1686void EPurged::decode(bufferlist::const_iterator& bl)
1687{
1688 DECODE_START(1, bl);
1689 decode(inos, bl);
1690 decode(inotablev, bl);
1691 decode(seq, bl);
1692 DECODE_FINISH(bl);
1693}
1694
1695void EPurged::dump(Formatter *f) const
1696{
1697 f->dump_stream("inos") << inos;
1698 f->dump_int("inotable version", inotablev);
1699 f->dump_int("segment seq", seq);
1700}
1701
7c673cae
FG
1702// -----------------------
1703// ESession
1704
1705void ESession::update_segment()
1706{
11fdf7f2 1707 get_segment()->sessionmapv = cmapv;
7c673cae 1708 if (inos.size() && inotablev)
11fdf7f2 1709 get_segment()->inotablev = inotablev;
7c673cae
FG
1710}
1711
1712void ESession::replay(MDSRank *mds)
1713{
9f95a23c
TL
1714 if (purge_inos.size())
1715 get_segment()->purge_inodes.insert(purge_inos);
1716
7c673cae
FG
1717 if (mds->sessionmap.get_version() >= cmapv) {
1718 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1719 << " >= " << cmapv << ", noop" << dendl;
81eedcae 1720 } else if (mds->sessionmap.get_version() + 1 == cmapv) {
7c673cae
FG
1721 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1722 << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
1723 Session *session;
1724 if (open) {
1725 session = mds->sessionmap.get_or_add_session(client_inst);
1726 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1727 session->set_client_metadata(client_metadata);
1728 dout(10) << " opened session " << session->info.inst << dendl;
1729 } else {
1730 session = mds->sessionmap.get_session(client_inst.name);
1731 if (session) { // there always should be a session, but there's a bug
11fdf7f2 1732 if (session->get_connection() == NULL) {
7c673cae
FG
1733 dout(10) << " removed session " << session->info.inst << dendl;
1734 mds->sessionmap.remove_session(session);
1735 session = NULL;
1736 } else {
1737 session->clear(); // the client has reconnected; keep the Session, but reset
1738 dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
1739 }
1740 } else {
1741 mds->clog->error() << "replayed stray Session close event for " << client_inst
1742 << " from time " << stamp << ", ignoring";
1743 }
1744 }
1745 if (session) {
1746 mds->sessionmap.replay_dirty_session(session);
1747 } else {
1748 mds->sessionmap.replay_advance_version();
1749 }
11fdf7f2 1750 ceph_assert(mds->sessionmap.get_version() == cmapv);
81eedcae
TL
1751 } else {
1752 mds->clog->error() << "ESession.replay sessionmap v " << cmapv
1753 << " - 1 > table " << mds->sessionmap.get_version();
1754 ceph_assert(g_conf()->mds_wipe_sessions);
1755 mds->sessionmap.wipe();
1756 mds->sessionmap.set_version(cmapv);
7c673cae
FG
1757 }
1758
1759 if (inos.size() && inotablev) {
1760 if (mds->inotable->get_version() >= inotablev) {
1761 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1762 << " >= " << inotablev << ", noop" << dendl;
1763 } else {
1764 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1765 << " < " << inotablev << " " << (open ? "add":"remove") << dendl;
11fdf7f2 1766 ceph_assert(!open); // for now
7c673cae 1767 mds->inotable->replay_release_ids(inos);
11fdf7f2 1768 ceph_assert(mds->inotable->get_version() == inotablev);
7c673cae
FG
1769 }
1770 }
1771
1772 update_segment();
1773}
1774
// Serialize this event into 'bl' inside ENCODE_START(6, 5, ...)/ENCODE_FINISH.
// Field order: stamp, client_inst (encoded with 'features'), open, cmapv,
// inos, inotablev, client_metadata, purge_inos.  ESession::decode reads the
// fields in this same order, gating the newer ones on struct_v.
1775void ESession::encode(bufferlist &bl, uint64_t features) const
1776{
9f95a23c 1777 ENCODE_START(6, 5, bl);
11fdf7f2
TL
1778 encode(stamp, bl);
1779 encode(client_inst, bl, features);
1780 encode(open, bl);
1781 encode(cmapv, bl);
1782 encode(inos, bl);
1783 encode(inotablev, bl);
1784 encode(client_metadata, bl);
9f95a23c 1785 encode(purge_inos, bl);
7c673cae
FG
1786 ENCODE_FINISH(bl);
1787}
1788
// Deserialize this event from 'bl', accepting older encodings.  Fields are
// gated on struct_v (provided by the DECODE_START_LEGACY_COMPAT_LEN macro):
//   stamp            - present from struct_v 2
//   client_metadata  - struct_v 4 stored only its kv_map; struct_v >= 5
//                      stores the whole object
//   purge_inos       - present from struct_v 6
// client_inst, open, cmapv, inos and inotablev are always read.
11fdf7f2 1789void ESession::decode(bufferlist::const_iterator &bl)
7c673cae 1790{
9f95a23c 1791 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
7c673cae 1792 if (struct_v >= 2)
11fdf7f2
TL
1793 decode(stamp, bl);
1794 decode(client_inst, bl);
1795 decode(open, bl);
1796 decode(cmapv, bl);
1797 decode(inos, bl);
1798 decode(inotablev, bl);
1799 if (struct_v == 4) {
1800 decode(client_metadata.kv_map, bl);
1801 } else if (struct_v >= 5) {
1802 decode(client_metadata, bl);
7c673cae 1803 }
9f95a23c
TL
1804 if (struct_v >= 6){
1805 decode(purge_inos, bl);
1806 }
1807
7c673cae
FG
1808 DECODE_FINISH(bl);
1809}
1810
1811void ESession::dump(Formatter *f) const
1812{
1813 f->dump_stream("client instance") << client_inst;
1814 f->dump_string("open", open ? "true" : "false");
1815 f->dump_int("client map version", cmapv);
1816 f->dump_stream("inos") << inos;
1817 f->dump_int("inotable version", inotablev);
1818 f->open_object_section("client_metadata");
11fdf7f2 1819 client_metadata.dump(f);
7c673cae
FG
1820 f->close_section(); // client_metadata
1821}
1822
9f95a23c 1823void ESession::generate_test_instances(std::list<ESession*>& ls)
7c673cae
FG
1824{
1825 ls.push_back(new ESession);
1826}
1827
1828// -----------------------
1829// ESessions
1830
1831void ESessions::encode(bufferlist &bl, uint64_t features) const
1832{
11fdf7f2
TL
1833 ENCODE_START(2, 1, bl);
1834 encode(client_map, bl, features);
1835 encode(cmapv, bl);
1836 encode(stamp, bl);
1837 encode(client_metadata_map, bl);
7c673cae
FG
1838 ENCODE_FINISH(bl);
1839}
1840
11fdf7f2 1841void ESessions::decode_old(bufferlist::const_iterator &bl)
7c673cae 1842{
11fdf7f2
TL
1843 using ceph::decode;
1844 decode(client_map, bl);
1845 decode(cmapv, bl);
7c673cae 1846 if (!bl.end())
11fdf7f2 1847 decode(stamp, bl);
7c673cae
FG
1848}
1849
11fdf7f2 1850void ESessions::decode_new(bufferlist::const_iterator &bl)
7c673cae 1851{
11fdf7f2
TL
1852 DECODE_START(2, bl);
1853 decode(client_map, bl);
1854 decode(cmapv, bl);
1855 decode(stamp, bl);
1856 if (struct_v >= 2)
1857 decode(client_metadata_map, bl);
7c673cae
FG
1858 DECODE_FINISH(bl);
1859}
1860
1861void ESessions::dump(Formatter *f) const
1862{
1863 f->dump_int("client map version", cmapv);
1864
1865 f->open_array_section("client map");
1866 for (map<client_t,entity_inst_t>::const_iterator i = client_map.begin();
1867 i != client_map.end(); ++i) {
1868 f->open_object_section("client");
1869 f->dump_int("client id", i->first.v);
1870 f->dump_stream("client entity") << i->second;
1871 f->close_section(); // client
1872 }
1873 f->close_section(); // client map
1874}
1875
9f95a23c 1876void ESessions::generate_test_instances(std::list<ESessions*>& ls)
7c673cae
FG
1877{
1878 ls.push_back(new ESessions());
1879}
1880
1881void ESessions::update_segment()
1882{
11fdf7f2 1883 get_segment()->sessionmapv = cmapv;
7c673cae
FG
1884}
1885
1886void ESessions::replay(MDSRank *mds)
1887{
1888 if (mds->sessionmap.get_version() >= cmapv) {
1889 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1890 << " >= " << cmapv << ", noop" << dendl;
1891 } else {
1892 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1893 << " < " << cmapv << dendl;
81eedcae 1894 mds->sessionmap.replay_open_sessions(cmapv, client_map, client_metadata_map);
7c673cae
FG
1895 }
1896 update_segment();
1897}
1898
1899
1900// -----------------------
1901// ETableServer
1902
1903void ETableServer::encode(bufferlist& bl, uint64_t features) const
1904{
1905 ENCODE_START(3, 3, bl);
11fdf7f2
TL
1906 encode(stamp, bl);
1907 encode(table, bl);
1908 encode(op, bl);
1909 encode(reqid, bl);
1910 encode(bymds, bl);
1911 encode(mutation, bl);
1912 encode(tid, bl);
1913 encode(version, bl);
7c673cae
FG
1914 ENCODE_FINISH(bl);
1915}
1916
11fdf7f2 1917void ETableServer::decode(bufferlist::const_iterator &bl)
7c673cae
FG
1918{
1919 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
1920 if (struct_v >= 2)
11fdf7f2
TL
1921 decode(stamp, bl);
1922 decode(table, bl);
1923 decode(op, bl);
1924 decode(reqid, bl);
1925 decode(bymds, bl);
1926 decode(mutation, bl);
1927 decode(tid, bl);
1928 decode(version, bl);
7c673cae
FG
1929 DECODE_FINISH(bl);
1930}
1931
1932void ETableServer::dump(Formatter *f) const
1933{
1934 f->dump_int("table id", table);
1935 f->dump_int("op", op);
1936 f->dump_int("request id", reqid);
1937 f->dump_int("by mds", bymds);
1938 f->dump_int("tid", tid);
1939 f->dump_int("version", version);
1940}
1941
9f95a23c 1942void ETableServer::generate_test_instances(std::list<ETableServer*>& ls)
7c673cae
FG
1943{
1944 ls.push_back(new ETableServer());
1945}
1946
1947
1948void ETableServer::update_segment()
1949{
11fdf7f2 1950 get_segment()->tablev[table] = version;
7c673cae
FG
1951}
1952
1953void ETableServer::replay(MDSRank *mds)
1954{
1955 MDSTableServer *server = mds->get_table_server(table);
1956 if (!server)
1957 return;
1958
1959 if (server->get_version() >= version) {
1960 dout(10) << "ETableServer.replay " << get_mdstable_name(table)
1961 << " " << get_mdstableserver_opname(op)
1962 << " event " << version
1963 << " <= table " << server->get_version() << dendl;
1964 return;
1965 }
1966
1967 dout(10) << " ETableServer.replay " << get_mdstable_name(table)
1968 << " " << get_mdstableserver_opname(op)
1969 << " event " << version << " - 1 == table " << server->get_version() << dendl;
11fdf7f2 1970 ceph_assert(version-1 == server->get_version());
7c673cae
FG
1971
1972 switch (op) {
11fdf7f2
TL
1973 case TABLESERVER_OP_PREPARE: {
1974 server->_note_prepare(bymds, reqid, true);
1975 bufferlist out;
1976 server->_prepare(mutation, reqid, bymds, out);
1977 mutation = std::move(out);
7c673cae 1978 break;
11fdf7f2 1979 }
7c673cae 1980 case TABLESERVER_OP_COMMIT:
9f95a23c 1981 server->_commit(tid, ref_t<MMDSTableRequest>());
11fdf7f2 1982 server->_note_commit(tid, true);
7c673cae
FG
1983 break;
1984 case TABLESERVER_OP_ROLLBACK:
1985 server->_rollback(tid);
11fdf7f2 1986 server->_note_rollback(tid, true);
7c673cae
FG
1987 break;
1988 case TABLESERVER_OP_SERVER_UPDATE:
1989 server->_server_update(mutation);
11fdf7f2 1990 server->_note_server_update(mutation, true);
7c673cae
FG
1991 break;
1992 default:
1993 mds->clog->error() << "invalid tableserver op in ETableServer";
1994 mds->damaged();
1995 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1996 }
1997
11fdf7f2 1998 ceph_assert(version == server->get_version());
7c673cae
FG
1999 update_segment();
2000}
2001
2002
2003// ---------------------
2004// ETableClient
2005
2006void ETableClient::encode(bufferlist& bl, uint64_t features) const
2007{
2008 ENCODE_START(3, 3, bl);
11fdf7f2
TL
2009 encode(stamp, bl);
2010 encode(table, bl);
2011 encode(op, bl);
2012 encode(tid, bl);
7c673cae
FG
2013 ENCODE_FINISH(bl);
2014}
2015
11fdf7f2 2016void ETableClient::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2017{
2018 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2019 if (struct_v >= 2)
11fdf7f2
TL
2020 decode(stamp, bl);
2021 decode(table, bl);
2022 decode(op, bl);
2023 decode(tid, bl);
7c673cae
FG
2024 DECODE_FINISH(bl);
2025}
2026
2027void ETableClient::dump(Formatter *f) const
2028{
2029 f->dump_int("table", table);
2030 f->dump_int("op", op);
2031 f->dump_int("tid", tid);
2032}
2033
9f95a23c 2034void ETableClient::generate_test_instances(std::list<ETableClient*>& ls)
7c673cae
FG
2035{
2036 ls.push_back(new ETableClient());
2037}
2038
2039void ETableClient::replay(MDSRank *mds)
2040{
2041 dout(10) << " ETableClient.replay " << get_mdstable_name(table)
2042 << " op " << get_mdstableserver_opname(op)
2043 << " tid " << tid << dendl;
2044
2045 MDSTableClient *client = mds->get_table_client(table);
2046 if (!client)
2047 return;
2048
11fdf7f2 2049 ceph_assert(op == TABLESERVER_OP_ACK);
7c673cae
FG
2050 client->got_journaled_ack(tid);
2051}
2052
2053
2054// -----------------------
2055// ESnap
2056/*
2057void ESnap::update_segment()
2058{
11fdf7f2 2059 get_segment()->tablev[TABLE_SNAP] = version;
7c673cae
FG
2060}
2061
2062void ESnap::replay(MDSRank *mds)
2063{
2064 if (mds->snaptable->get_version() >= version) {
2065 dout(10) << "ESnap.replay event " << version
2066 << " <= table " << mds->snaptable->get_version() << dendl;
2067 return;
2068 }
2069
2070 dout(10) << " ESnap.replay event " << version
2071 << " - 1 == table " << mds->snaptable->get_version() << dendl;
11fdf7f2 2072 ceph_assert(version-1 == mds->snaptable->get_version());
7c673cae
FG
2073
2074 if (create) {
2075 version_t v;
2076 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
11fdf7f2 2077 ceph_assert(s == snap.snapid);
7c673cae
FG
2078 } else {
2079 mds->snaptable->remove(snap.snapid);
2080 }
2081
11fdf7f2 2082 ceph_assert(version == mds->snaptable->get_version());
7c673cae
FG
2083}
2084*/
2085
2086
2087
2088// -----------------------
2089// EUpdate
2090
2091void EUpdate::encode(bufferlist &bl, uint64_t features) const
2092{
2093 ENCODE_START(4, 4, bl);
11fdf7f2
TL
2094 encode(stamp, bl);
2095 encode(type, bl);
2096 encode(metablob, bl, features);
2097 encode(client_map, bl);
2098 encode(cmapv, bl);
2099 encode(reqid, bl);
2100 encode(had_slaves, bl);
7c673cae
FG
2101 ENCODE_FINISH(bl);
2102}
2103
11fdf7f2 2104void EUpdate::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2105{
2106 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
2107 if (struct_v >= 2)
11fdf7f2
TL
2108 decode(stamp, bl);
2109 decode(type, bl);
2110 decode(metablob, bl);
2111 decode(client_map, bl);
7c673cae 2112 if (struct_v >= 3)
11fdf7f2
TL
2113 decode(cmapv, bl);
2114 decode(reqid, bl);
2115 decode(had_slaves, bl);
7c673cae
FG
2116 DECODE_FINISH(bl);
2117}
2118
2119void EUpdate::dump(Formatter *f) const
2120{
2121 f->open_object_section("metablob");
2122 metablob.dump(f);
2123 f->close_section(); // metablob
2124
2125 f->dump_string("type", type);
2126 f->dump_int("client map length", client_map.length());
2127 f->dump_int("client map version", cmapv);
2128 f->dump_stream("reqid") << reqid;
2129 f->dump_string("had slaves", had_slaves ? "true" : "false");
2130}
2131
9f95a23c 2132void EUpdate::generate_test_instances(std::list<EUpdate*>& ls)
7c673cae
FG
2133{
2134 ls.push_back(new EUpdate());
2135}
2136
2137
2138void EUpdate::update_segment()
2139{
11fdf7f2
TL
2140 auto&& segment = get_segment();
2141 metablob.update_segment(segment);
7c673cae
FG
2142
2143 if (client_map.length())
11fdf7f2 2144 segment->sessionmapv = cmapv;
7c673cae
FG
2145
2146 if (had_slaves)
11fdf7f2 2147 segment->uncommitted_masters.insert(reqid);
7c673cae
FG
2148}
2149
2150void EUpdate::replay(MDSRank *mds)
2151{
11fdf7f2
TL
2152 auto&& segment = get_segment();
2153 metablob.replay(mds, segment);
7c673cae
FG
2154
2155 if (had_slaves) {
2156 dout(10) << "EUpdate.replay " << reqid << " had slaves, expecting a matching ECommitted" << dendl;
11fdf7f2 2157 segment->uncommitted_masters.insert(reqid);
7c673cae 2158 set<mds_rank_t> slaves;
11fdf7f2 2159 mds->mdcache->add_uncommitted_master(reqid, segment, slaves, true);
7c673cae
FG
2160 }
2161
2162 if (client_map.length()) {
2163 if (mds->sessionmap.get_version() >= cmapv) {
2164 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2165 << " <= table " << mds->sessionmap.get_version() << dendl;
2166 } else {
2167 dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version()
2168 << " < " << cmapv << dendl;
2169 // open client sessions?
2170 map<client_t,entity_inst_t> cm;
11fdf7f2
TL
2171 map<client_t,client_metadata_t> cmm;
2172 auto blp = client_map.cbegin();
2173 using ceph::decode;
2174 decode(cm, blp);
2175 if (!blp.end())
2176 decode(cmm, blp);
81eedcae 2177 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
7c673cae
FG
2178 }
2179 }
2180 update_segment();
2181}
2182
2183
2184// ------------------------
2185// EOpen
2186
2187void EOpen::encode(bufferlist &bl, uint64_t features) const {
2188 ENCODE_START(4, 3, bl);
11fdf7f2
TL
2189 encode(stamp, bl);
2190 encode(metablob, bl, features);
2191 encode(inos, bl);
2192 encode(snap_inos, bl);
7c673cae
FG
2193 ENCODE_FINISH(bl);
2194}
2195
11fdf7f2 2196void EOpen::decode(bufferlist::const_iterator &bl) {
7c673cae
FG
2197 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2198 if (struct_v >= 2)
11fdf7f2
TL
2199 decode(stamp, bl);
2200 decode(metablob, bl);
2201 decode(inos, bl);
7c673cae 2202 if (struct_v >= 4)
11fdf7f2 2203 decode(snap_inos, bl);
7c673cae
FG
2204 DECODE_FINISH(bl);
2205}
2206
2207void EOpen::dump(Formatter *f) const
2208{
2209 f->open_object_section("metablob");
2210 metablob.dump(f);
2211 f->close_section(); // metablob
2212 f->open_array_section("inos involved");
2213 for (vector<inodeno_t>::const_iterator i = inos.begin();
2214 i != inos.end(); ++i) {
2215 f->dump_int("ino", *i);
2216 }
2217 f->close_section(); // inos
2218}
2219
9f95a23c 2220void EOpen::generate_test_instances(std::list<EOpen*>& ls)
7c673cae
FG
2221{
2222 ls.push_back(new EOpen());
2223 ls.push_back(new EOpen());
2224 ls.back()->add_ino(0);
2225}
2226
2227void EOpen::update_segment()
2228{
2229 // ??
2230}
2231
2232void EOpen::replay(MDSRank *mds)
2233{
2234 dout(10) << "EOpen.replay " << dendl;
11fdf7f2
TL
2235 auto&& segment = get_segment();
2236 metablob.replay(mds, segment);
7c673cae
FG
2237
2238 // note which segments inodes belong to, so we don't have to start rejournaling them
2239 for (const auto &ino : inos) {
2240 CInode *in = mds->mdcache->get_inode(ino);
2241 if (!in) {
2242 dout(0) << "EOpen.replay ino " << ino << " not in metablob" << dendl;
11fdf7f2 2243 ceph_assert(in);
7c673cae 2244 }
11fdf7f2 2245 segment->open_files.push_back(&in->item_open_file);
7c673cae
FG
2246 }
2247 for (const auto &vino : snap_inos) {
2248 CInode *in = mds->mdcache->get_inode(vino);
2249 if (!in) {
2250 dout(0) << "EOpen.replay ino " << vino << " not in metablob" << dendl;
11fdf7f2 2251 ceph_assert(in);
7c673cae 2252 }
11fdf7f2 2253 segment->open_files.push_back(&in->item_open_file);
7c673cae
FG
2254 }
2255}
2256
2257
2258// -----------------------
2259// ECommitted
2260
2261void ECommitted::replay(MDSRank *mds)
2262{
2263 if (mds->mdcache->uncommitted_masters.count(reqid)) {
2264 dout(10) << "ECommitted.replay " << reqid << dendl;
2265 mds->mdcache->uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2266 mds->mdcache->uncommitted_masters.erase(reqid);
2267 } else {
2268 dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl;
2269 }
2270}
2271
2272void ECommitted::encode(bufferlist& bl, uint64_t features) const
2273{
2274 ENCODE_START(3, 3, bl);
11fdf7f2
TL
2275 encode(stamp, bl);
2276 encode(reqid, bl);
7c673cae
FG
2277 ENCODE_FINISH(bl);
2278}
2279
11fdf7f2 2280void ECommitted::decode(bufferlist::const_iterator& bl)
7c673cae
FG
2281{
2282 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2283 if (struct_v >= 2)
11fdf7f2
TL
2284 decode(stamp, bl);
2285 decode(reqid, bl);
7c673cae
FG
2286 DECODE_FINISH(bl);
2287}
2288
2289void ECommitted::dump(Formatter *f) const {
2290 f->dump_stream("stamp") << stamp;
2291 f->dump_stream("reqid") << reqid;
2292}
2293
9f95a23c 2294void ECommitted::generate_test_instances(std::list<ECommitted*>& ls)
7c673cae
FG
2295{
2296 ls.push_back(new ECommitted);
2297 ls.push_back(new ECommitted);
2298 ls.back()->stamp = utime_t(1, 2);
2299 ls.back()->reqid = metareqid_t(entity_name_t::CLIENT(123), 456);
2300}
2301
2302// -----------------------
2303// ESlaveUpdate
2304
2305void link_rollback::encode(bufferlist &bl) const
2306{
11fdf7f2
TL
2307 ENCODE_START(3, 2, bl);
2308 encode(reqid, bl);
2309 encode(ino, bl);
2310 encode(was_inc, bl);
2311 encode(old_ctime, bl);
2312 encode(old_dir_mtime, bl);
2313 encode(old_dir_rctime, bl);
2314 encode(snapbl, bl);
7c673cae
FG
2315 ENCODE_FINISH(bl);
2316}
2317
11fdf7f2 2318void link_rollback::decode(bufferlist::const_iterator &bl)
7c673cae 2319{
11fdf7f2
TL
2320 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
2321 decode(reqid, bl);
2322 decode(ino, bl);
2323 decode(was_inc, bl);
2324 decode(old_ctime, bl);
2325 decode(old_dir_mtime, bl);
2326 decode(old_dir_rctime, bl);
2327 if (struct_v >= 3)
2328 decode(snapbl, bl);
7c673cae
FG
2329 DECODE_FINISH(bl);
2330}
2331
2332void link_rollback::dump(Formatter *f) const
2333{
2334 f->dump_stream("metareqid") << reqid;
2335 f->dump_int("ino", ino);
2336 f->dump_string("was incremented", was_inc ? "true" : "false");
2337 f->dump_stream("old_ctime") << old_ctime;
2338 f->dump_stream("old_dir_mtime") << old_dir_mtime;
2339 f->dump_stream("old_dir_rctime") << old_dir_rctime;
2340}
2341
9f95a23c 2342void link_rollback::generate_test_instances(std::list<link_rollback*>& ls)
7c673cae
FG
2343{
2344 ls.push_back(new link_rollback());
2345}
2346
2347void rmdir_rollback::encode(bufferlist& bl) const
2348{
11fdf7f2
TL
2349 ENCODE_START(3, 2, bl);
2350 encode(reqid, bl);
2351 encode(src_dir, bl);
2352 encode(src_dname, bl);
2353 encode(dest_dir, bl);
2354 encode(dest_dname, bl);
2355 encode(snapbl, bl);
7c673cae
FG
2356 ENCODE_FINISH(bl);
2357}
2358
11fdf7f2 2359void rmdir_rollback::decode(bufferlist::const_iterator& bl)
7c673cae 2360{
11fdf7f2
TL
2361 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
2362 decode(reqid, bl);
2363 decode(src_dir, bl);
2364 decode(src_dname, bl);
2365 decode(dest_dir, bl);
2366 decode(dest_dname, bl);
2367 if (struct_v >= 3)
2368 decode(snapbl, bl);
7c673cae
FG
2369 DECODE_FINISH(bl);
2370}
2371
2372void rmdir_rollback::dump(Formatter *f) const
2373{
2374 f->dump_stream("metareqid") << reqid;
2375 f->dump_stream("source directory") << src_dir;
2376 f->dump_string("source dname", src_dname);
2377 f->dump_stream("destination directory") << dest_dir;
2378 f->dump_string("destination dname", dest_dname);
2379}
2380
9f95a23c 2381void rmdir_rollback::generate_test_instances(std::list<rmdir_rollback*>& ls)
7c673cae
FG
2382{
2383 ls.push_back(new rmdir_rollback());
2384}
2385
2386void rename_rollback::drec::encode(bufferlist &bl) const
2387{
2388 ENCODE_START(2, 2, bl);
11fdf7f2
TL
2389 encode(dirfrag, bl);
2390 encode(dirfrag_old_mtime, bl);
2391 encode(dirfrag_old_rctime, bl);
2392 encode(ino, bl);
2393 encode(remote_ino, bl);
2394 encode(dname, bl);
2395 encode(remote_d_type, bl);
2396 encode(old_ctime, bl);
7c673cae
FG
2397 ENCODE_FINISH(bl);
2398}
2399
11fdf7f2 2400void rename_rollback::drec::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2401{
2402 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2
TL
2403 decode(dirfrag, bl);
2404 decode(dirfrag_old_mtime, bl);
2405 decode(dirfrag_old_rctime, bl);
2406 decode(ino, bl);
2407 decode(remote_ino, bl);
2408 decode(dname, bl);
2409 decode(remote_d_type, bl);
2410 decode(old_ctime, bl);
7c673cae
FG
2411 DECODE_FINISH(bl);
2412}
2413
2414void rename_rollback::drec::dump(Formatter *f) const
2415{
2416 f->dump_stream("directory fragment") << dirfrag;
2417 f->dump_stream("directory old mtime") << dirfrag_old_mtime;
2418 f->dump_stream("directory old rctime") << dirfrag_old_rctime;
2419 f->dump_int("ino", ino);
2420 f->dump_int("remote ino", remote_ino);
2421 f->dump_string("dname", dname);
2422 uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries
2423 string type_string;
2424 switch(type) {
2425 case S_IFREG:
2426 type_string = "file"; break;
2427 case S_IFLNK:
2428 type_string = "symlink"; break;
2429 case S_IFDIR:
2430 type_string = "directory"; break;
2431 default:
2432 type_string = "UNKNOWN-" + stringify((int)type); break;
2433 }
2434 f->dump_string("remote dtype", type_string);
2435 f->dump_stream("old ctime") << old_ctime;
2436}
2437
9f95a23c 2438void rename_rollback::drec::generate_test_instances(std::list<drec*>& ls)
7c673cae
FG
2439{
2440 ls.push_back(new drec());
2441 ls.back()->remote_d_type = IFTODT(S_IFREG);
2442}
2443
2444void rename_rollback::encode(bufferlist &bl) const
2445{
11fdf7f2
TL
2446 ENCODE_START(3, 2, bl);
2447 encode(reqid, bl);
7c673cae
FG
2448 encode(orig_src, bl);
2449 encode(orig_dest, bl);
2450 encode(stray, bl);
11fdf7f2
TL
2451 encode(ctime, bl);
2452 encode(srci_snapbl, bl);
2453 encode(desti_snapbl, bl);
7c673cae
FG
2454 ENCODE_FINISH(bl);
2455}
2456
11fdf7f2 2457void rename_rollback::decode(bufferlist::const_iterator &bl)
7c673cae 2458{
11fdf7f2
TL
2459 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
2460 decode(reqid, bl);
7c673cae
FG
2461 decode(orig_src, bl);
2462 decode(orig_dest, bl);
2463 decode(stray, bl);
11fdf7f2
TL
2464 decode(ctime, bl);
2465 if (struct_v >= 3) {
2466 decode(srci_snapbl, bl);
2467 decode(desti_snapbl, bl);
2468 }
7c673cae
FG
2469 DECODE_FINISH(bl);
2470}
2471
2472void rename_rollback::dump(Formatter *f) const
2473{
2474 f->dump_stream("request id") << reqid;
2475 f->open_object_section("original src drec");
2476 orig_src.dump(f);
2477 f->close_section(); // original src drec
2478 f->open_object_section("original dest drec");
2479 orig_dest.dump(f);
2480 f->close_section(); // original dest drec
2481 f->open_object_section("stray drec");
2482 stray.dump(f);
2483 f->close_section(); // stray drec
2484 f->dump_stream("ctime") << ctime;
2485}
2486
9f95a23c 2487void rename_rollback::generate_test_instances(std::list<rename_rollback*>& ls)
7c673cae
FG
2488{
2489 ls.push_back(new rename_rollback());
2490 ls.back()->orig_src.remote_d_type = IFTODT(S_IFREG);
2491 ls.back()->orig_dest.remote_d_type = IFTODT(S_IFREG);
2492 ls.back()->stray.remote_d_type = IFTODT(S_IFREG);
2493}
2494
2495void ESlaveUpdate::encode(bufferlist &bl, uint64_t features) const
2496{
2497 ENCODE_START(3, 3, bl);
11fdf7f2
TL
2498 encode(stamp, bl);
2499 encode(type, bl);
2500 encode(reqid, bl);
2501 encode(master, bl);
2502 encode(op, bl);
2503 encode(origop, bl);
2504 encode(commit, bl, features);
2505 encode(rollback, bl);
7c673cae
FG
2506 ENCODE_FINISH(bl);
2507}
2508
11fdf7f2 2509void ESlaveUpdate::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2510{
2511 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2512 if (struct_v >= 2)
11fdf7f2
TL
2513 decode(stamp, bl);
2514 decode(type, bl);
2515 decode(reqid, bl);
2516 decode(master, bl);
2517 decode(op, bl);
2518 decode(origop, bl);
2519 decode(commit, bl);
2520 decode(rollback, bl);
7c673cae
FG
2521 DECODE_FINISH(bl);
2522}
2523
2524void ESlaveUpdate::dump(Formatter *f) const
2525{
2526 f->open_object_section("metablob");
2527 commit.dump(f);
2528 f->close_section(); // metablob
2529
2530 f->dump_int("rollback length", rollback.length());
2531 f->dump_string("type", type);
2532 f->dump_stream("metareqid") << reqid;
2533 f->dump_int("master", master);
2534 f->dump_int("op", op);
2535 f->dump_int("original op", origop);
2536}
2537
9f95a23c 2538void ESlaveUpdate::generate_test_instances(std::list<ESlaveUpdate*>& ls)
7c673cae
FG
2539{
2540 ls.push_back(new ESlaveUpdate());
2541}
2542
7c673cae
FG
2543void ESlaveUpdate::replay(MDSRank *mds)
2544{
2545 MDSlaveUpdate *su;
11fdf7f2 2546 auto&& segment = get_segment();
7c673cae
FG
2547 switch (op) {
2548 case ESlaveUpdate::OP_PREPARE:
2549 dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master
2550 << ": applying commit, saving rollback info" << dendl;
e306af50 2551 su = new MDSlaveUpdate(origop, rollback);
11fdf7f2 2552 commit.replay(mds, segment, su);
e306af50 2553 mds->mdcache->add_uncommitted_slave(reqid, segment, master, su);
7c673cae
FG
2554 break;
2555
2556 case ESlaveUpdate::OP_COMMIT:
e306af50
TL
2557 dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl;
2558 mds->mdcache->finish_uncommitted_slave(reqid, false);
7c673cae
FG
2559 break;
2560
2561 case ESlaveUpdate::OP_ROLLBACK:
2562 dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
2563 << ": applying rollback commit blob" << dendl;
11fdf7f2 2564 commit.replay(mds, segment);
e306af50 2565 mds->mdcache->finish_uncommitted_slave(reqid, false);
7c673cae
FG
2566 break;
2567
2568 default:
2569 mds->clog->error() << "invalid op in ESlaveUpdate";
2570 mds->damaged();
2571 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2572 }
2573}
2574
2575
2576// -----------------------
2577// ESubtreeMap
2578
2579void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const
2580{
2581 ENCODE_START(6, 5, bl);
11fdf7f2
TL
2582 encode(stamp, bl);
2583 encode(metablob, bl, features);
2584 encode(subtrees, bl);
2585 encode(ambiguous_subtrees, bl);
2586 encode(expire_pos, bl);
2587 encode(event_seq, bl);
7c673cae
FG
2588 ENCODE_FINISH(bl);
2589}
2590
11fdf7f2 2591void ESubtreeMap::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2592{
2593 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2594 if (struct_v >= 2)
11fdf7f2
TL
2595 decode(stamp, bl);
2596 decode(metablob, bl);
2597 decode(subtrees, bl);
7c673cae 2598 if (struct_v >= 4)
11fdf7f2 2599 decode(ambiguous_subtrees, bl);
7c673cae 2600 if (struct_v >= 3)
11fdf7f2 2601 decode(expire_pos, bl);
7c673cae 2602 if (struct_v >= 6)
11fdf7f2 2603 decode(event_seq, bl);
7c673cae
FG
2604 DECODE_FINISH(bl);
2605}
2606
2607void ESubtreeMap::dump(Formatter *f) const
2608{
2609 f->open_object_section("metablob");
2610 metablob.dump(f);
2611 f->close_section(); // metablob
2612
2613 f->open_array_section("subtrees");
2614 for(map<dirfrag_t,vector<dirfrag_t> >::const_iterator i = subtrees.begin();
2615 i != subtrees.end(); ++i) {
2616 f->open_object_section("tree");
2617 f->dump_stream("root dirfrag") << i->first;
2618 for (vector<dirfrag_t>::const_iterator j = i->second.begin();
2619 j != i->second.end(); ++j) {
2620 f->dump_stream("bound dirfrag") << *j;
2621 }
2622 f->close_section(); // tree
2623 }
2624 f->close_section(); // subtrees
2625
2626 f->open_array_section("ambiguous subtrees");
2627 for(set<dirfrag_t>::const_iterator i = ambiguous_subtrees.begin();
2628 i != ambiguous_subtrees.end(); ++i) {
2629 f->dump_stream("dirfrag") << *i;
2630 }
2631 f->close_section(); // ambiguous subtrees
2632
2633 f->dump_int("expire position", expire_pos);
2634}
2635
9f95a23c 2636void ESubtreeMap::generate_test_instances(std::list<ESubtreeMap*>& ls)
7c673cae
FG
2637{
2638 ls.push_back(new ESubtreeMap());
2639}
2640
2641void ESubtreeMap::replay(MDSRank *mds)
2642{
2643 if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos())
2644 mds->mdlog->journaler->set_expire_pos(expire_pos);
2645
2646 // suck up the subtree map?
2647 if (mds->mdcache->is_subtrees()) {
2648 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
2649 int errors = 0;
2650
2651 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2652 p != subtrees.end();
2653 ++p) {
2654 CDir *dir = mds->mdcache->get_dirfrag(p->first);
2655 if (!dir) {
2656 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2657 << " subtree root " << p->first << " not in cache";
2658 ++errors;
2659 continue;
2660 }
2661
2662 if (!mds->mdcache->is_subtree(dir)) {
2663 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2664 << " subtree root " << p->first << " not a subtree in cache";
2665 ++errors;
2666 continue;
2667 }
2668 if (dir->get_dir_auth().first != mds->get_nodeid()) {
2669 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2670 << " subtree root " << p->first
2671 << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
2672 ++errors;
2673 continue;
2674 }
2675
2676 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
2677 mds->mdcache->get_force_dirfrag(*q, true);
2678
2679 set<CDir*> bounds;
2680 mds->mdcache->get_subtree_bounds(dir, bounds);
2681 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2682 CDir *b = mds->mdcache->get_dirfrag(*q);
2683 if (!b) {
2684 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2685 << " subtree " << p->first << " bound " << *q << " not in cache";
2686 ++errors;
2687 continue;
2688 }
2689 if (bounds.count(b) == 0) {
2690 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2691 << " subtree " << p->first << " bound " << *q << " not a bound in cache";
2692 ++errors;
2693 continue;
2694 }
2695 bounds.erase(b);
2696 }
2697 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) {
2698 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2699 << " subtree " << p->first << " has extra bound in cache " << (*q)->dirfrag();
2700 ++errors;
2701 }
2702
2703 if (ambiguous_subtrees.count(p->first)) {
2704 if (!mds->mdcache->have_ambiguous_import(p->first)) {
2705 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2706 << " subtree " << p->first << " is ambiguous but is not in our cache";
2707 ++errors;
2708 }
2709 } else {
2710 if (mds->mdcache->have_ambiguous_import(p->first)) {
2711 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2712 << " subtree " << p->first << " is not ambiguous but is in our cache";
2713 ++errors;
2714 }
2715 }
2716 }
2717
11fdf7f2
TL
2718 std::vector<CDir*> dirs;
2719 mds->mdcache->get_subtrees(dirs);
2720 for (const auto& dir : dirs) {
7c673cae
FG
2721 if (dir->get_dir_auth().first != mds->get_nodeid())
2722 continue;
2723 if (subtrees.count(dir->dirfrag()) == 0) {
2724 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2725 << " does not include cache subtree " << dir->dirfrag();
2726 ++errors;
2727 }
2728 }
2729
2730 if (errors) {
2731 dout(0) << "journal subtrees: " << subtrees << dendl;
2732 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
2733 mds->mdcache->show_subtrees();
11fdf7f2 2734 ceph_assert(!g_conf()->mds_debug_subtrees || errors == 0);
7c673cae
FG
2735 }
2736 return;
2737 }
2738
2739 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;
2740
2741 // first, stick the spanning tree in my cache
2742 //metablob.print(*_dout);
11fdf7f2 2743 metablob.replay(mds, get_segment());
7c673cae
FG
2744
2745 // restore import/export maps
2746 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2747 p != subtrees.end();
2748 ++p) {
2749 CDir *dir = mds->mdcache->get_dirfrag(p->first);
11fdf7f2 2750 ceph_assert(dir);
7c673cae
FG
2751 if (ambiguous_subtrees.count(p->first)) {
2752 // ambiguous!
2753 mds->mdcache->add_ambiguous_import(p->first, p->second);
2754 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second,
2755 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2756 } else {
2757 // not ambiguous
2758 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
2759 }
2760 }
2761
2762 mds->mdcache->recalc_auth_bits(true);
2763
2764 mds->mdcache->show_subtrees();
2765}
2766
2767
2768
2769// -----------------------
2770// EFragment
2771
2772void EFragment::replay(MDSRank *mds)
2773{
2774 dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;
2775
9f95a23c 2776 std::vector<CDir*> resultfrags;
11fdf7f2 2777 MDSContext::vec waiters;
7c673cae
FG
2778
2779 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2780 // it will be once we replay the metablob , but first we need to
2781 // refragment anything we already have in the cache.
2782 CInode *in = mds->mdcache->get_inode(ino);
2783
11fdf7f2 2784 auto&& segment = get_segment();
7c673cae
FG
2785 switch (op) {
2786 case OP_PREPARE:
11fdf7f2 2787 mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, segment, &rollback);
7c673cae
FG
2788
2789 if (in)
9f95a23c 2790 mds->mdcache->adjust_dir_fragments(in, basefrag, bits, &resultfrags, waiters, true);
7c673cae
FG
2791 break;
2792
11fdf7f2
TL
2793 case OP_ROLLBACK: {
2794 frag_vec_t old_frags;
7c673cae
FG
2795 if (in) {
2796 in->dirfragtree.get_leaves_under(basefrag, old_frags);
2797 if (orig_frags.empty()) {
2798 // old format EFragment
9f95a23c 2799 mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, &resultfrags, waiters, true);
7c673cae 2800 } else {
11fdf7f2
TL
2801 for (const auto& fg : orig_frags)
2802 mds->mdcache->force_dir_fragment(in, fg);
7c673cae
FG
2803 }
2804 }
11fdf7f2 2805 mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), std::move(old_frags));
7c673cae 2806 break;
11fdf7f2 2807 }
7c673cae
FG
2808
2809 case OP_COMMIT:
2810 case OP_FINISH:
2811 mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
2812 break;
2813
2814 default:
2815 ceph_abort();
2816 }
2817
11fdf7f2
TL
2818 metablob.replay(mds, segment);
2819 if (in && g_conf()->mds_debug_frag)
7c673cae
FG
2820 in->verify_dirfrags();
2821}
2822
2823void EFragment::encode(bufferlist &bl, uint64_t features) const {
2824 ENCODE_START(5, 4, bl);
11fdf7f2
TL
2825 encode(stamp, bl);
2826 encode(op, bl);
2827 encode(ino, bl);
2828 encode(basefrag, bl);
2829 encode(bits, bl);
2830 encode(metablob, bl, features);
2831 encode(orig_frags, bl);
2832 encode(rollback, bl);
7c673cae
FG
2833 ENCODE_FINISH(bl);
2834}
2835
11fdf7f2 2836void EFragment::decode(bufferlist::const_iterator &bl) {
7c673cae
FG
2837 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
2838 if (struct_v >= 2)
11fdf7f2 2839 decode(stamp, bl);
7c673cae 2840 if (struct_v >= 3)
11fdf7f2
TL
2841 decode(op, bl);
2842 decode(ino, bl);
2843 decode(basefrag, bl);
2844 decode(bits, bl);
2845 decode(metablob, bl);
7c673cae 2846 if (struct_v >= 5) {
11fdf7f2
TL
2847 decode(orig_frags, bl);
2848 decode(rollback, bl);
7c673cae
FG
2849 }
2850 DECODE_FINISH(bl);
2851}
2852
2853void EFragment::dump(Formatter *f) const
2854{
2855 /*f->open_object_section("Metablob");
2856 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2857 f->close_section();*/
2858 f->dump_string("op", op_name(op));
2859 f->dump_stream("ino") << ino;
2860 f->dump_stream("base frag") << basefrag;
2861 f->dump_int("bits", bits);
2862}
2863
9f95a23c 2864void EFragment::generate_test_instances(std::list<EFragment*>& ls)
7c673cae
FG
2865{
2866 ls.push_back(new EFragment);
2867 ls.push_back(new EFragment);
2868 ls.back()->op = OP_PREPARE;
2869 ls.back()->ino = 1;
2870 ls.back()->bits = 5;
2871}
2872
2873void dirfrag_rollback::encode(bufferlist &bl) const
2874{
2875 ENCODE_START(1, 1, bl);
11fdf7f2 2876 encode(fnode, bl);
7c673cae
FG
2877 ENCODE_FINISH(bl);
2878}
2879
11fdf7f2 2880void dirfrag_rollback::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2881{
2882 DECODE_START(1, bl);
11fdf7f2 2883 decode(fnode, bl);
7c673cae
FG
2884 DECODE_FINISH(bl);
2885}
2886
2887
2888
2889// =========================================================================
2890
2891// -----------------------
2892// EExport
2893
2894void EExport::replay(MDSRank *mds)
2895{
2896 dout(10) << "EExport.replay " << base << dendl;
11fdf7f2
TL
2897 auto&& segment = get_segment();
2898 metablob.replay(mds, segment);
7c673cae
FG
2899
2900 CDir *dir = mds->mdcache->get_dirfrag(base);
11fdf7f2 2901 ceph_assert(dir);
7c673cae
FG
2902
2903 set<CDir*> realbounds;
2904 for (set<dirfrag_t>::iterator p = bounds.begin();
2905 p != bounds.end();
2906 ++p) {
2907 CDir *bd = mds->mdcache->get_dirfrag(*p);
11fdf7f2 2908 ceph_assert(bd);
7c673cae
FG
2909 realbounds.insert(bd);
2910 }
2911
2912 // adjust auth away
2913 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF);
2914
2915 mds->mdcache->try_trim_non_auth_subtree(dir);
2916}
2917
2918void EExport::encode(bufferlist& bl, uint64_t features) const
2919{
31f18b77 2920 ENCODE_START(4, 3, bl);
11fdf7f2
TL
2921 encode(stamp, bl);
2922 encode(metablob, bl, features);
2923 encode(base, bl);
2924 encode(bounds, bl);
2925 encode(target, bl);
7c673cae
FG
2926 ENCODE_FINISH(bl);
2927}
2928
11fdf7f2 2929void EExport::decode(bufferlist::const_iterator &bl)
7c673cae
FG
2930{
2931 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2932 if (struct_v >= 2)
11fdf7f2
TL
2933 decode(stamp, bl);
2934 decode(metablob, bl);
2935 decode(base, bl);
2936 decode(bounds, bl);
31f18b77 2937 if (struct_v >= 4)
11fdf7f2 2938 decode(target, bl);
7c673cae
FG
2939 DECODE_FINISH(bl);
2940}
2941
2942void EExport::dump(Formatter *f) const
2943{
2944 f->dump_float("stamp", (double)stamp);
2945 /*f->open_object_section("Metablob");
2946 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2947 f->close_section();*/
2948 f->dump_stream("base dirfrag") << base;
2949 f->open_array_section("bounds dirfrags");
2950 for (set<dirfrag_t>::const_iterator i = bounds.begin();
2951 i != bounds.end(); ++i) {
2952 f->dump_stream("dirfrag") << *i;
2953 }
2954 f->close_section(); // bounds dirfrags
2955}
2956
9f95a23c 2957void EExport::generate_test_instances(std::list<EExport*>& ls)
7c673cae
FG
2958{
2959 EExport *sample = new EExport();
2960 ls.push_back(sample);
2961}
2962
2963
2964// -----------------------
2965// EImportStart
2966
2967void EImportStart::update_segment()
2968{
11fdf7f2 2969 get_segment()->sessionmapv = cmapv;
7c673cae
FG
2970}
2971
2972void EImportStart::replay(MDSRank *mds)
2973{
2974 dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
2975 //metablob.print(*_dout);
11fdf7f2
TL
2976 auto&& segment = get_segment();
2977 metablob.replay(mds, segment);
7c673cae
FG
2978
2979 // put in ambiguous import list
2980 mds->mdcache->add_ambiguous_import(base, bounds);
2981
2982 // set auth partially to us so we don't trim it
2983 CDir *dir = mds->mdcache->get_dirfrag(base);
11fdf7f2 2984 ceph_assert(dir);
7c673cae
FG
2985
2986 set<CDir*> realbounds;
2987 for (vector<dirfrag_t>::iterator p = bounds.begin();
2988 p != bounds.end();
2989 ++p) {
2990 CDir *bd = mds->mdcache->get_dirfrag(*p);
11fdf7f2 2991 ceph_assert(bd);
7c673cae
FG
2992 if (!bd->is_subtree_root())
2993 bd->state_clear(CDir::STATE_AUTH);
2994 realbounds.insert(bd);
2995 }
2996
2997 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds,
2998 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2999
3000 // open client sessions?
3001 if (mds->sessionmap.get_version() >= cmapv) {
3002 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
3003 << " >= " << cmapv << ", noop" << dendl;
3004 } else {
3005 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
3006 << " < " << cmapv << dendl;
3007 map<client_t,entity_inst_t> cm;
11fdf7f2
TL
3008 map<client_t,client_metadata_t> cmm;
3009 auto blp = client_map.cbegin();
3010 using ceph::decode;
3011 decode(cm, blp);
3012 if (!blp.end())
3013 decode(cmm, blp);
81eedcae 3014 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
7c673cae
FG
3015 }
3016 update_segment();
3017}
3018
3019void EImportStart::encode(bufferlist &bl, uint64_t features) const {
31f18b77 3020 ENCODE_START(4, 3, bl);
11fdf7f2
TL
3021 encode(stamp, bl);
3022 encode(base, bl);
3023 encode(metablob, bl, features);
3024 encode(bounds, bl);
3025 encode(cmapv, bl);
3026 encode(client_map, bl);
3027 encode(from, bl);
7c673cae
FG
3028 ENCODE_FINISH(bl);
3029}
3030
11fdf7f2 3031void EImportStart::decode(bufferlist::const_iterator &bl) {
7c673cae
FG
3032 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3033 if (struct_v >= 2)
11fdf7f2
TL
3034 decode(stamp, bl);
3035 decode(base, bl);
3036 decode(metablob, bl);
3037 decode(bounds, bl);
3038 decode(cmapv, bl);
3039 decode(client_map, bl);
31f18b77 3040 if (struct_v >= 4)
11fdf7f2 3041 decode(from, bl);
7c673cae
FG
3042 DECODE_FINISH(bl);
3043}
3044
3045void EImportStart::dump(Formatter *f) const
3046{
3047 f->dump_stream("base dirfrag") << base;
3048 f->open_array_section("boundary dirfrags");
3049 for (vector<dirfrag_t>::const_iterator iter = bounds.begin();
3050 iter != bounds.end(); ++iter) {
3051 f->dump_stream("frag") << *iter;
3052 }
3053 f->close_section();
3054}
3055
9f95a23c 3056void EImportStart::generate_test_instances(std::list<EImportStart*>& ls)
7c673cae
FG
3057{
3058 ls.push_back(new EImportStart);
3059}
3060
3061// -----------------------
3062// EImportFinish
3063
3064void EImportFinish::replay(MDSRank *mds)
3065{
3066 if (mds->mdcache->have_ambiguous_import(base)) {
3067 dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
3068 if (success) {
3069 mds->mdcache->finish_ambiguous_import(base);
3070 } else {
3071 CDir *dir = mds->mdcache->get_dirfrag(base);
11fdf7f2 3072 ceph_assert(dir);
7c673cae
FG
3073 vector<dirfrag_t> bounds;
3074 mds->mdcache->get_ambiguous_import_bounds(base, bounds);
3075 mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF);
3076 mds->mdcache->cancel_ambiguous_import(dir);
3077 mds->mdcache->try_trim_non_auth_subtree(dir);
3078 }
3079 } else {
3080 // this shouldn't happen unless this is an old journal
3081 dout(10) << "EImportFinish.replay " << base << " success=" << success
3082 << " on subtree not marked as ambiguous"
3083 << dendl;
3084 mds->clog->error() << "failure replaying journal (EImportFinish)";
3085 mds->damaged();
3086 ceph_abort(); // Should be unreachable because damaged() calls respawn()
3087 }
3088}
3089
3090void EImportFinish::encode(bufferlist& bl, uint64_t features) const
3091{
3092 ENCODE_START(3, 3, bl);
11fdf7f2
TL
3093 encode(stamp, bl);
3094 encode(base, bl);
3095 encode(success, bl);
7c673cae
FG
3096 ENCODE_FINISH(bl);
3097}
3098
11fdf7f2 3099void EImportFinish::decode(bufferlist::const_iterator &bl)
7c673cae
FG
3100{
3101 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3102 if (struct_v >= 2)
11fdf7f2
TL
3103 decode(stamp, bl);
3104 decode(base, bl);
3105 decode(success, bl);
7c673cae
FG
3106 DECODE_FINISH(bl);
3107}
3108
3109void EImportFinish::dump(Formatter *f) const
3110{
3111 f->dump_stream("base dirfrag") << base;
3112 f->dump_string("success", success ? "true" : "false");
3113}
9f95a23c 3114void EImportFinish::generate_test_instances(std::list<EImportFinish*>& ls)
7c673cae
FG
3115{
3116 ls.push_back(new EImportFinish);
3117 ls.push_back(new EImportFinish);
3118 ls.back()->success = true;
3119}
3120
3121
3122// ------------------------
3123// EResetJournal
3124
3125void EResetJournal::encode(bufferlist& bl, uint64_t features) const
3126{
3127 ENCODE_START(2, 2, bl);
11fdf7f2 3128 encode(stamp, bl);
7c673cae
FG
3129 ENCODE_FINISH(bl);
3130}
3131
11fdf7f2 3132void EResetJournal::decode(bufferlist::const_iterator &bl)
7c673cae
FG
3133{
3134 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2 3135 decode(stamp, bl);
7c673cae
FG
3136 DECODE_FINISH(bl);
3137}
3138
3139void EResetJournal::dump(Formatter *f) const
3140{
3141 f->dump_stream("timestamp") << stamp;
3142}
3143
9f95a23c 3144void EResetJournal::generate_test_instances(std::list<EResetJournal*>& ls)
7c673cae
FG
3145{
3146 ls.push_back(new EResetJournal());
3147}
3148
3149void EResetJournal::replay(MDSRank *mds)
3150{
3151 dout(1) << "EResetJournal" << dendl;
3152
3153 mds->sessionmap.wipe();
3154 mds->inotable->replay_reset();
3155
3156 if (mds->mdsmap->get_root() == mds->get_nodeid()) {
3157 CDir *rootdir = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t());
3158 mds->mdcache->adjust_subtree_auth(rootdir, mds->get_nodeid());
3159 }
3160
3161 CDir *mydir = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t());
3162 mds->mdcache->adjust_subtree_auth(mydir, mds->get_nodeid());
3163
3164 mds->mdcache->recalc_auth_bits(true);
3165
3166 mds->mdcache->show_subtrees();
3167}
3168
3169
3170void ENoOp::encode(bufferlist &bl, uint64_t features) const
3171{
3172 ENCODE_START(2, 2, bl);
11fdf7f2 3173 encode(pad_size, bl);
7c673cae
FG
3174 uint8_t const pad = 0xff;
3175 for (unsigned int i = 0; i < pad_size; ++i) {
11fdf7f2 3176 encode(pad, bl);
7c673cae
FG
3177 }
3178 ENCODE_FINISH(bl);
3179}
3180
3181
11fdf7f2 3182void ENoOp::decode(bufferlist::const_iterator &bl)
7c673cae
FG
3183{
3184 DECODE_START(2, bl);
11fdf7f2 3185 decode(pad_size, bl);
7c673cae
FG
3186 if (bl.get_remaining() != pad_size) {
3187 // This is spiritually an assertion, but expressing in a way that will let
3188 // journal debug tools catch it and recognise a malformed entry.
3189 throw buffer::end_of_buffer();
3190 } else {
9f95a23c 3191 bl += pad_size;
7c673cae
FG
3192 }
3193 DECODE_FINISH(bl);
3194}
3195
3196
3197void ENoOp::replay(MDSRank *mds)
3198{
3199 dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
3200}
3201
3202/**
3203 * If re-formatting an old journal that used absolute log position
3204 * references as segment sequence numbers, use this function to update
3205 * it.
3206 *
3207 * @param mds
3208 * MDSRank instance, just used for logging
3209 * @param old_to_new
3210 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3211 *
3212 * @return
3213 * True if the event was modified.
3214 */
3215bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
9f95a23c 3216 std::map<LogSegment::seq_t, LogSegment::seq_t> const &old_to_new)
7c673cae
FG
3217{
3218 bool modified = false;
9f95a23c 3219 map<inodeno_t, LogSegment::seq_t> new_trunc_finish;
11fdf7f2
TL
3220 for (const auto& p : truncate_finish) {
3221 auto q = old_to_new.find(p.second);
3222 if (q != old_to_new.end()) {
7c673cae 3223 dout(20) << __func__ << " applying segment seq mapping "
11fdf7f2
TL
3224 << p.second << " -> " << q->second << dendl;
3225 new_trunc_finish.emplace(p.first, q->second);
7c673cae
FG
3226 modified = true;
3227 } else {
3228 dout(20) << __func__ << " no segment seq mapping found for "
11fdf7f2
TL
3229 << p.second << dendl;
3230 new_trunc_finish.insert(p);
7c673cae
FG
3231 }
3232 }
11fdf7f2 3233 truncate_finish.swap(new_trunc_finish);
7c673cae
FG
3234
3235 return modified;
3236}