]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/journal.cc
14966f14ce153847b78a94da0e2216ea2b18e37b
[ceph.git] / ceph / src / mds / journal.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
20
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
24
25 #include "events/EUpdate.h"
26 #include "events/ESlaveUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
29 #include "events/EPurged.h"
30
31 #include "events/EExport.h"
32 #include "events/EImportStart.h"
33 #include "events/EImportFinish.h"
34 #include "events/EFragment.h"
35
36 #include "events/ETableClient.h"
37 #include "events/ETableServer.h"
38
39 #include "include/stringify.h"
40
41 #include "LogSegment.h"
42
43 #include "MDSRank.h"
44 #include "MDLog.h"
45 #include "MDCache.h"
46 #include "Server.h"
47 #include "Migrator.h"
48 #include "Mutation.h"
49
50 #include "InoTable.h"
51 #include "MDSTableClient.h"
52 #include "MDSTableServer.h"
53
54 #include "Locker.h"
55
56 #define dout_context g_ceph_context
57 #define dout_subsys ceph_subsys_mds
58 #undef dout_prefix
59 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
60
61
62 // -----------------------
63 // LogSegment
64
65 void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
66 {
67 set<CDir*> commit;
68
69 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl;
70
71 ceph_assert(g_conf()->mds_kill_journal_expire_at != 1);
72
73 // commit dirs
74 for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) {
75 dout(20) << " new_dirfrag " << **p << dendl;
76 ceph_assert((*p)->is_auth());
77 commit.insert(*p);
78 }
79 for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) {
80 dout(20) << " dirty_dirfrag " << **p << dendl;
81 ceph_assert((*p)->is_auth());
82 commit.insert(*p);
83 }
84 for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) {
85 dout(20) << " dirty_dentry " << **p << dendl;
86 ceph_assert((*p)->is_auth());
87 commit.insert((*p)->get_dir());
88 }
89 for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) {
90 dout(20) << " dirty_inode " << **p << dendl;
91 ceph_assert((*p)->is_auth());
92 if ((*p)->is_base()) {
93 (*p)->store(gather_bld.new_sub());
94 } else
95 commit.insert((*p)->get_parent_dn()->get_dir());
96 }
97
98 if (!commit.empty()) {
99 for (set<CDir*>::iterator p = commit.begin();
100 p != commit.end();
101 ++p) {
102 CDir *dir = *p;
103 ceph_assert(dir->is_auth());
104 if (dir->can_auth_pin()) {
105 dout(15) << "try_to_expire committing " << *dir << dendl;
106 dir->commit(0, gather_bld.new_sub(), false, op_prio);
107 } else {
108 dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
109 dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
110 }
111 }
112 }
113
114 // master ops with possibly uncommitted slaves
115 for (set<metareqid_t>::iterator p = uncommitted_masters.begin();
116 p != uncommitted_masters.end();
117 ++p) {
118 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p << dendl;
119 mds->mdcache->wait_for_uncommitted_master(*p, gather_bld.new_sub());
120 }
121
122 // uncommitted fragments
123 for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin();
124 p != uncommitted_fragments.end();
125 ++p) {
126 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl;
127 mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub());
128 }
129
130 // nudge scatterlocks
131 for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
132 CInode *in = *p;
133 dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
134 mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub());
135 }
136 for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) {
137 CInode *in = *p;
138 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl;
139 mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub());
140 }
141 for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) {
142 CInode *in = *p;
143 dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl;
144 mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub());
145 }
146
147 ceph_assert(g_conf()->mds_kill_journal_expire_at != 2);
148
149 // open files and snap inodes
150 if (!open_files.empty()) {
151 ceph_assert(!mds->mdlog->is_capped()); // hmm FIXME
152 EOpen *le = 0;
153 LogSegment *ls = mds->mdlog->get_current_segment();
154 ceph_assert(ls != this);
155 elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file));
156 while (!p.end()) {
157 CInode *in = *p;
158 ++p;
159 if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) {
160 // journal snap inodes that need flush. This simplify the mds failover hanlding
161 dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
162 if (!le) {
163 le = new EOpen(mds->mdlog);
164 mds->mdlog->start_entry(le);
165 }
166 le->add_clean_inode(in);
167 ls->open_files.push_back(&in->item_open_file);
168 } else {
169 // open files are tracked by open file table, no need to journal them again
170 in->item_open_file.remove_myself();
171 }
172 }
173 if (le) {
174 mds->mdlog->submit_entry(le);
175 mds->mdlog->wait_for_safe(gather_bld.new_sub());
176 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
177 }
178 }
179
180 ceph_assert(g_conf()->mds_kill_journal_expire_at != 3);
181
182 // backtraces to be stored/updated
183 for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
184 CInode *in = *p;
185 ceph_assert(in->is_auth());
186 if (in->can_auth_pin()) {
187 dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
188 in->store_backtrace(gather_bld.new_sub(), op_prio);
189 } else {
190 dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
191 in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
192 }
193 }
194
195 ceph_assert(g_conf()->mds_kill_journal_expire_at != 4);
196
197 // slave updates
198 for (elist<MDSlaveUpdate*>::iterator p = slave_updates.begin(member_offset(MDSlaveUpdate,
199 item));
200 !p.end(); ++p) {
201 MDSlaveUpdate *su = *p;
202 dout(10) << "try_to_expire waiting on slave update " << su << dendl;
203 ceph_assert(su->waiter == 0);
204 su->waiter = gather_bld.new_sub();
205 }
206
207 // idalloc
208 if (inotablev > mds->inotable->get_committed_version()) {
209 dout(10) << "try_to_expire saving inotable table, need " << inotablev
210 << ", committed is " << mds->inotable->get_committed_version()
211 << " (" << mds->inotable->get_committing_version() << ")"
212 << dendl;
213 mds->inotable->save(gather_bld.new_sub(), inotablev);
214 }
215
216 // sessionmap
217 if (sessionmapv > mds->sessionmap.get_committed()) {
218 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
219 << ", committed is " << mds->sessionmap.get_committed()
220 << " (" << mds->sessionmap.get_committing() << ")"
221 << dendl;
222 mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
223 }
224
225 // updates to sessions for completed_requests
226 mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
227 touched_sessions.clear();
228
229 // pending commit atids
230 for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
231 p != pending_commit_tids.end();
232 ++p) {
233 MDSTableClient *client = mds->get_table_client(p->first);
234 ceph_assert(client);
235 for (ceph::unordered_set<version_t>::iterator q = p->second.begin();
236 q != p->second.end();
237 ++q) {
238 dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q
239 << " pending commit (not yet acked), waiting" << dendl;
240 ceph_assert(!client->has_committed(*q));
241 client->wait_for_ack(*q, gather_bld.new_sub());
242 }
243 }
244
245 // table servers
246 for (map<int, version_t>::iterator p = tablev.begin();
247 p != tablev.end();
248 ++p) {
249 MDSTableServer *server = mds->get_table_server(p->first);
250 ceph_assert(server);
251 if (p->second > server->get_committed_version()) {
252 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first)
253 << " to save, need " << p->second << dendl;
254 server->save(gather_bld.new_sub());
255 }
256 }
257
258 // truncating
259 for (set<CInode*>::iterator p = truncating_inodes.begin();
260 p != truncating_inodes.end();
261 ++p) {
262 dout(10) << "try_to_expire waiting for truncate of " << **p << dendl;
263 (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub());
264 }
265 // purge inodes
266 dout(10) << "try_to_expire waiting for purge of " << purge_inodes << dendl;
267 if (purge_inodes.size())
268 set_purged_cb(gather_bld.new_sub());
269
270 if (gather_bld.has_subs()) {
271 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl;
272 mds->mdlog->flush();
273 } else {
274 ceph_assert(g_conf()->mds_kill_journal_expire_at != 5);
275 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl;
276 }
277 }
278
279 // -----------------------
280 // EMetaBlob
281
282 void EMetaBlob::add_dir_context(CDir *dir, int mode)
283 {
284 MDSRank *mds = dir->cache->mds;
285
286 list<CDentry*> parents;
287
288 // it may be okay not to include the maybe items, if
289 // - we journaled the maybe child inode in this segment
290 // - that subtree turns out to be unambiguously auth
291 list<CDentry*> maybe;
292 bool maybenot = false;
293
294 while (true) {
295 // already have this dir? (we must always add in order)
296 if (lump_map.count(dir->dirfrag())) {
297 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl;
298 break;
299 }
300
301 // stop at root/stray
302 CInode *diri = dir->get_inode();
303 CDentry *parent = diri->get_projected_parent_dn();
304
305 if (mode == TO_AUTH_SUBTREE_ROOT) {
306 // subtree root?
307 if (dir->is_subtree_root()) {
308 // match logic in MDCache::create_subtree_map()
309 if (dir->get_dir_auth().first == mds->get_nodeid()) {
310 mds_authority_t parent_auth = parent ? parent->authority() : CDIR_AUTH_UNDEF;
311 if (parent_auth.first == dir->get_dir_auth().first) {
312 if (parent_auth.second == CDIR_AUTH_UNKNOWN &&
313 !dir->is_ambiguous_dir_auth() &&
314 !dir->state_test(CDir::STATE_EXPORTBOUND) &&
315 !dir->state_test(CDir::STATE_AUXSUBTREE) &&
316 !diri->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
317 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir << dendl;
318 ceph_abort();
319 }
320 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") ambiguous or transient subtree " << dendl;
321 } else {
322 // it's an auth subtree, we don't need maybe (if any), and we're done.
323 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
324 << " at " << *dir << dendl;
325 maybe.clear();
326 break;
327 }
328 } else {
329 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
330 << " at " << *dir << dendl;
331 // we need the maybe list after all!
332 parents.splice(parents.begin(), maybe);
333 maybenot = false;
334 }
335 }
336
337 // was the inode journaled in this blob?
338 if (event_seq && diri->last_journaled == event_seq) {
339 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl;
340 break;
341 }
342
343 // have we journaled this inode since the last subtree map?
344 if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) {
345 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment ("
346 << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag "
347 << *diri << dendl;
348 maybenot = true;
349 }
350 }
351
352 if (!parent)
353 break;
354
355 if (maybenot) {
356 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl;
357 maybe.push_front(parent);
358 } else {
359 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl;
360 parents.push_front(parent);
361 }
362
363 dir = parent->get_dir();
364 }
365
366 parents.splice(parents.begin(), maybe);
367
368 dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
369 for (const auto& dentry : parents) {
370 ceph_assert(dentry->get_projected_linkage()->is_primary());
371 add_dentry(dentry, false);
372 }
373 }
374
375 void EMetaBlob::update_segment(LogSegment *ls)
376 {
377 // dirty inode mtimes
378 // -> handled directly by Server.cc, replay()
379
380 // alloc table update?
381 if (inotablev)
382 ls->inotablev = inotablev;
383 if (sessionmapv)
384 ls->sessionmapv = sessionmapv;
385
386 // truncated inodes
387 // -> handled directly by Server.cc
388
389 // client requests
390 // note the newest request per client
391 //if (!client_reqs.empty())
392 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
393 }
394
395 // EMetaBlob::fullbit
396
397 void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const {
398 ENCODE_START(8, 5, bl);
399 encode(dn, bl);
400 encode(dnfirst, bl);
401 encode(dnlast, bl);
402 encode(dnv, bl);
403 encode(inode, bl, features);
404 encode(xattrs, bl);
405 if (inode.is_symlink())
406 encode(symlink, bl);
407 if (inode.is_dir()) {
408 encode(dirfragtree, bl);
409 encode(snapbl, bl);
410 }
411 encode(state, bl);
412 if (old_inodes.empty()) {
413 encode(false, bl);
414 } else {
415 encode(true, bl);
416 encode(old_inodes, bl, features);
417 }
418 if (!inode.is_dir())
419 encode(snapbl, bl);
420 encode(oldest_snap, bl);
421 ENCODE_FINISH(bl);
422 }
423
424 void EMetaBlob::fullbit::decode(bufferlist::const_iterator &bl) {
425 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
426 decode(dn, bl);
427 decode(dnfirst, bl);
428 decode(dnlast, bl);
429 decode(dnv, bl);
430 decode(inode, bl);
431 decode(xattrs, bl);
432 if (inode.is_symlink())
433 decode(symlink, bl);
434 if (inode.is_dir()) {
435 decode(dirfragtree, bl);
436 decode(snapbl, bl);
437 if ((struct_v == 2) || (struct_v == 3)) {
438 bool dir_layout_exists;
439 decode(dir_layout_exists, bl);
440 if (dir_layout_exists) {
441 __u8 dir_struct_v;
442 decode(dir_struct_v, bl); // default_file_layout version
443 decode(inode.layout, bl); // and actual layout, that we care about
444 }
445 }
446 }
447 if (struct_v >= 6) {
448 decode(state, bl);
449 } else {
450 bool dirty;
451 decode(dirty, bl);
452 state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0;
453 }
454
455 if (struct_v >= 3) {
456 bool old_inodes_present;
457 decode(old_inodes_present, bl);
458 if (old_inodes_present) {
459 decode(old_inodes, bl);
460 }
461 }
462 if (!inode.is_dir()) {
463 if (struct_v >= 7)
464 decode(snapbl, bl);
465 }
466 if (struct_v >= 8)
467 decode(oldest_snap, bl);
468 else
469 oldest_snap = CEPH_NOSNAP;
470
471 DECODE_FINISH(bl);
472 }
473
474 void EMetaBlob::fullbit::dump(Formatter *f) const
475 {
476 f->dump_string("dentry", dn);
477 f->dump_stream("snapid.first") << dnfirst;
478 f->dump_stream("snapid.last") << dnlast;
479 f->dump_int("dentry version", dnv);
480 f->open_object_section("inode");
481 inode.dump(f);
482 f->close_section(); // inode
483 f->open_object_section("xattrs");
484 for (const auto &p : xattrs) {
485 std::string s(p.second.c_str(), p.second.length());
486 f->dump_string(p.first.c_str(), s);
487 }
488 f->close_section(); // xattrs
489 if (inode.is_symlink()) {
490 f->dump_string("symlink", symlink);
491 }
492 if (inode.is_dir()) {
493 f->dump_stream("frag tree") << dirfragtree;
494 f->dump_string("has_snapbl", snapbl.length() ? "true" : "false");
495 if (inode.has_layout()) {
496 f->open_object_section("file layout policy");
497 // FIXME
498 f->dump_string("layout", "the layout exists");
499 f->close_section(); // file layout policy
500 }
501 }
502 f->dump_string("state", state_string());
503 if (!old_inodes.empty()) {
504 f->open_array_section("old inodes");
505 for (const auto &p : old_inodes) {
506 f->open_object_section("inode");
507 f->dump_int("snapid", p.first);
508 p.second.dump(f);
509 f->close_section(); // inode
510 }
511 f->close_section(); // old inodes
512 }
513 }
514
515 void EMetaBlob::fullbit::generate_test_instances(std::list<EMetaBlob::fullbit*>& ls)
516 {
517 CInode::mempool_inode inode;
518 fragtree_t fragtree;
519 CInode::mempool_xattr_map empty_xattrs;
520 bufferlist empty_snapbl;
521 fullbit *sample = new fullbit("/testdn", 0, 0, 0,
522 inode, fragtree, empty_xattrs, "", 0, empty_snapbl,
523 false, NULL);
524 ls.push_back(sample);
525 }
526
527 void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
528 {
529 in->inode = inode;
530 in->xattrs = xattrs;
531 in->maybe_export_pin();
532 if (in->inode.is_dir()) {
533 if (!(in->dirfragtree == dirfragtree)) {
534 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
535 << dirfragtree << " on " << *in << dendl;
536 in->dirfragtree = dirfragtree;
537 in->force_dirfrags();
538 if (in->get_num_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) {
539 auto&& ls = in->get_nested_dirfrags();
540 for (const auto& dir : ls) {
541 if (dir->get_num_any() == 0 &&
542 mds->mdcache->can_trim_non_auth_dirfrag(dir)) {
543 dout(10) << " closing empty non-auth dirfrag " << *dir << dendl;
544 in->close_dirfrag(dir->get_frag());
545 }
546 }
547 }
548 }
549 } else if (in->inode.is_symlink()) {
550 in->symlink = symlink;
551 }
552 in->old_inodes = old_inodes;
553 if (!in->old_inodes.empty()) {
554 snapid_t min_first = in->old_inodes.rbegin()->first + 1;
555 if (min_first > in->first)
556 in->first = min_first;
557 }
558
559 /*
560 * we can do this before linking hte inode bc the split_at would
561 * be a no-op.. we have no children (namely open snaprealms) to
562 * divy up
563 */
564 in->oldest_snap = oldest_snap;
565 in->decode_snap_blob(snapbl);
566
567 /*
568 * In case there was anything malformed in the journal that we are
569 * replaying, do sanity checks on the inodes we're replaying and
570 * go damaged instead of letting any trash into a live cache
571 */
572 if (in->is_file()) {
573 // Files must have valid layouts with a pool set
574 if (in->inode.layout.pool_id == -1 || !in->inode.layout.is_valid()) {
575 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
576 << ": " << in->inode.layout << dendl;
577 std::ostringstream oss;
578 oss << "Invalid layout for inode " << in->ino() << " in journal";
579 mds->clog->error() << oss.str();
580 mds->damaged();
581 ceph_abort(); // Should be unreachable because damaged() calls respawn()
582 }
583 }
584 }
585
586 // EMetaBlob::remotebit
587
588 void EMetaBlob::remotebit::encode(bufferlist& bl) const
589 {
590 ENCODE_START(2, 2, bl);
591 encode(dn, bl);
592 encode(dnfirst, bl);
593 encode(dnlast, bl);
594 encode(dnv, bl);
595 encode(ino, bl);
596 encode(d_type, bl);
597 encode(dirty, bl);
598 ENCODE_FINISH(bl);
599 }
600
601 void EMetaBlob::remotebit::decode(bufferlist::const_iterator &bl)
602 {
603 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
604 decode(dn, bl);
605 decode(dnfirst, bl);
606 decode(dnlast, bl);
607 decode(dnv, bl);
608 decode(ino, bl);
609 decode(d_type, bl);
610 decode(dirty, bl);
611 DECODE_FINISH(bl);
612 }
613
614 void EMetaBlob::remotebit::dump(Formatter *f) const
615 {
616 f->dump_string("dentry", dn);
617 f->dump_int("snapid.first", dnfirst);
618 f->dump_int("snapid.last", dnlast);
619 f->dump_int("dentry version", dnv);
620 f->dump_int("inodeno", ino);
621 uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries
622 string type_string;
623 switch(type) {
624 case S_IFREG:
625 type_string = "file"; break;
626 case S_IFLNK:
627 type_string = "symlink"; break;
628 case S_IFDIR:
629 type_string = "directory"; break;
630 case S_IFIFO:
631 type_string = "fifo"; break;
632 case S_IFCHR:
633 type_string = "chr"; break;
634 case S_IFBLK:
635 type_string = "blk"; break;
636 case S_IFSOCK:
637 type_string = "sock"; break;
638 default:
639 assert (0 == "unknown d_type!");
640 }
641 f->dump_string("d_type", type_string);
642 f->dump_string("dirty", dirty ? "true" : "false");
643 }
644
645 void EMetaBlob::remotebit::
646 generate_test_instances(std::list<EMetaBlob::remotebit*>& ls)
647 {
648 remotebit *remote = new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG), false);
649 ls.push_back(remote);
650 }
651
652 // EMetaBlob::nullbit
653
654 void EMetaBlob::nullbit::encode(bufferlist& bl) const
655 {
656 ENCODE_START(2, 2, bl);
657 encode(dn, bl);
658 encode(dnfirst, bl);
659 encode(dnlast, bl);
660 encode(dnv, bl);
661 encode(dirty, bl);
662 ENCODE_FINISH(bl);
663 }
664
665 void EMetaBlob::nullbit::decode(bufferlist::const_iterator &bl)
666 {
667 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
668 decode(dn, bl);
669 decode(dnfirst, bl);
670 decode(dnlast, bl);
671 decode(dnv, bl);
672 decode(dirty, bl);
673 DECODE_FINISH(bl);
674 }
675
676 void EMetaBlob::nullbit::dump(Formatter *f) const
677 {
678 f->dump_string("dentry", dn);
679 f->dump_int("snapid.first", dnfirst);
680 f->dump_int("snapid.last", dnlast);
681 f->dump_int("dentry version", dnv);
682 f->dump_string("dirty", dirty ? "true" : "false");
683 }
684
685 void EMetaBlob::nullbit::generate_test_instances(std::list<nullbit*>& ls)
686 {
687 nullbit *sample = new nullbit("/test/dentry", 0, 10, 15, false);
688 nullbit *sample2 = new nullbit("/test/dirty", 10, 20, 25, true);
689 ls.push_back(sample);
690 ls.push_back(sample2);
691 }
692
693 // EMetaBlob::dirlump
694
695 void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const
696 {
697 ENCODE_START(2, 2, bl);
698 encode(fnode, bl);
699 encode(state, bl);
700 encode(nfull, bl);
701 encode(nremote, bl);
702 encode(nnull, bl);
703 _encode_bits(features);
704 encode(dnbl, bl);
705 ENCODE_FINISH(bl);
706 }
707
708 void EMetaBlob::dirlump::decode(bufferlist::const_iterator &bl)
709 {
710 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
711 decode(fnode, bl);
712 decode(state, bl);
713 decode(nfull, bl);
714 decode(nremote, bl);
715 decode(nnull, bl);
716 decode(dnbl, bl);
717 dn_decoded = false; // don't decode bits unless we need them.
718 DECODE_FINISH(bl);
719 }
720
721 void EMetaBlob::dirlump::dump(Formatter *f) const
722 {
723 if (!dn_decoded) {
724 dirlump *me = const_cast<dirlump*>(this);
725 me->_decode_bits();
726 }
727 f->open_object_section("fnode");
728 fnode.dump(f);
729 f->close_section(); // fnode
730 f->dump_string("state", state_string());
731 f->dump_int("nfull", nfull);
732 f->dump_int("nremote", nremote);
733 f->dump_int("nnull", nnull);
734
735 f->open_array_section("full bits");
736 for (const auto& iter : dfull) {
737 f->open_object_section("fullbit");
738 iter.dump(f);
739 f->close_section(); // fullbit
740 }
741 f->close_section(); // full bits
742 f->open_array_section("remote bits");
743 for (const auto& iter : dremote) {
744 f->open_object_section("remotebit");
745 iter.dump(f);
746 f->close_section(); // remotebit
747 }
748 f->close_section(); // remote bits
749 f->open_array_section("null bits");
750 for (const auto& iter : dnull) {
751 f->open_object_section("null bit");
752 iter.dump(f);
753 f->close_section(); // null bit
754 }
755 f->close_section(); // null bits
756 }
757
758 void EMetaBlob::dirlump::generate_test_instances(std::list<dirlump*>& ls)
759 {
760 ls.push_back(new dirlump());
761 }
762
763 /**
764 * EMetaBlob proper
765 */
766 void EMetaBlob::encode(bufferlist& bl, uint64_t features) const
767 {
768 ENCODE_START(8, 5, bl);
769 encode(lump_order, bl);
770 encode(lump_map, bl, features);
771 encode(roots, bl, features);
772 encode(table_tids, bl);
773 encode(opened_ino, bl);
774 encode(allocated_ino, bl);
775 encode(used_preallocated_ino, bl);
776 encode(preallocated_inos, bl);
777 encode(client_name, bl);
778 encode(inotablev, bl);
779 encode(sessionmapv, bl);
780 encode(truncate_start, bl);
781 encode(truncate_finish, bl);
782 encode(destroyed_inodes, bl);
783 encode(client_reqs, bl);
784 encode(renamed_dirino, bl);
785 encode(renamed_dir_frags, bl);
786 {
787 // make MDSRank use v6 format happy
788 int64_t i = -1;
789 bool b = false;
790 encode(i, bl);
791 encode(b, bl);
792 }
793 encode(client_flushes, bl);
794 ENCODE_FINISH(bl);
795 }
796 void EMetaBlob::decode(bufferlist::const_iterator &bl)
797 {
798 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
799 decode(lump_order, bl);
800 decode(lump_map, bl);
801 if (struct_v >= 4) {
802 decode(roots, bl);
803 } else {
804 bufferlist rootbl;
805 decode(rootbl, bl);
806 if (rootbl.length()) {
807 auto p = rootbl.cbegin();
808 roots.emplace_back(p);
809 }
810 }
811 decode(table_tids, bl);
812 decode(opened_ino, bl);
813 decode(allocated_ino, bl);
814 decode(used_preallocated_ino, bl);
815 decode(preallocated_inos, bl);
816 decode(client_name, bl);
817 decode(inotablev, bl);
818 decode(sessionmapv, bl);
819 decode(truncate_start, bl);
820 decode(truncate_finish, bl);
821 decode(destroyed_inodes, bl);
822 if (struct_v >= 2) {
823 decode(client_reqs, bl);
824 } else {
825 list<metareqid_t> r;
826 decode(r, bl);
827 while (!r.empty()) {
828 client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0));
829 r.pop_front();
830 }
831 }
832 if (struct_v >= 3) {
833 decode(renamed_dirino, bl);
834 decode(renamed_dir_frags, bl);
835 }
836 if (struct_v >= 6) {
837 // ignore
838 int64_t i;
839 bool b;
840 decode(i, bl);
841 decode(b, bl);
842 }
843 if (struct_v >= 8) {
844 decode(client_flushes, bl);
845 }
846 DECODE_FINISH(bl);
847 }
848
849
850 /**
851 * Get all inodes touched by this metablob. Includes the 'bits' within
852 * dirlumps, and the inodes of the dirs themselves.
853 */
854 void EMetaBlob::get_inodes(
855 std::set<inodeno_t> &inodes) const
856 {
857 // For all dirlumps in this metablob
858 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
859 // Record inode of dirlump
860 inodeno_t const dir_ino = i->first.ino;
861 inodes.insert(dir_ino);
862
863 // Decode dirlump bits
864 dirlump const &dl = i->second;
865 dl._decode_bits();
866
867 // Record inodes of fullbits
868 for (const auto& iter : dl.get_dfull()) {
869 inodes.insert(iter.inode.ino);
870 }
871
872 // Record inodes of remotebits
873 for (const auto& iter : dl.get_dremote()) {
874 inodes.insert(iter.ino);
875 }
876 }
877 }
878
879
880 /**
881 * Get a map of dirfrag to set of dentries in that dirfrag which are
882 * touched in this operation.
883 */
884 void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const
885 {
886 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
887 dirlump const &dl = i->second;
888 dirfrag_t const &df = i->first;
889
890 // Get all bits
891 dl._decode_bits();
892
893 // For all bits, store dentry
894 for (const auto& iter : dl.get_dfull()) {
895 dentries[df].insert(iter.dn);
896 }
897 for (const auto& iter : dl.get_dremote()) {
898 dentries[df].insert(iter.dn);
899 }
900 for (const auto& iter : dl.get_dnull()) {
901 dentries[df].insert(iter.dn);
902 }
903 }
904 }
905
906
907
908 /**
909 * Calculate all paths that we can infer are touched by this metablob. Only uses
910 * information local to this metablob so it may only be the path within the
911 * subtree.
912 */
913 void EMetaBlob::get_paths(
914 std::vector<std::string> &paths) const
915 {
916 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
917 typedef std::pair<inodeno_t, std::string> Location;
918
919 // Whenever we see a dentry within a dirlump, we remember it as a child of
920 // the dirlump's inode
921 std::map<inodeno_t, std::vector<std::string> > children;
922
923 // Whenever we see a location for an inode, remember it: this allows us to
924 // build a path given an inode
925 std::map<inodeno_t, Location> ino_locations;
926
927 // Special case: operations on root inode populate roots but not dirlumps
928 if (lump_map.empty() && !roots.empty()) {
929 paths.push_back("/");
930 return;
931 }
932
933 // First pass
934 // ==========
935 // Build a tiny local metadata cache for the path structure in this metablob
936 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
937 inodeno_t const dir_ino = i->first.ino;
938 dirlump const &dl = i->second;
939 dl._decode_bits();
940
941 for (const auto& iter : dl.get_dfull()) {
942 std::string_view dentry = iter.dn;
943 children[dir_ino].emplace_back(dentry);
944 ino_locations[iter.inode.ino] = Location(dir_ino, dentry);
945 }
946
947 for (const auto& iter : dl.get_dremote()) {
948 std::string_view dentry = iter.dn;
949 children[dir_ino].emplace_back(dentry);
950 }
951
952 for (const auto& iter : dl.get_dnull()) {
953 std::string_view dentry = iter.dn;
954 children[dir_ino].emplace_back(dentry);
955 }
956 }
957
958 std::vector<Location> leaf_locations;
959
960 // Second pass
961 // ===========
962 // Output paths for all childless nodes in the metablob
963 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
964 inodeno_t const dir_ino = i->first.ino;
965 dirlump const &dl = i->second;
966 dl._decode_bits();
967
968 for (const auto& iter : dl.get_dfull()) {
969 std::string_view dentry = iter.dn;
970 if (children.find(iter.inode.ino) == children.end()) {
971 leaf_locations.push_back(Location(dir_ino, dentry));
972 }
973 }
974
975 for (const auto& iter : dl.get_dremote()) {
976 std::string_view dentry = iter.dn;
977 leaf_locations.push_back(Location(dir_ino, dentry));
978 }
979
980 for (const auto& iter : dl.get_dnull()) {
981 std::string_view dentry = iter.dn;
982 leaf_locations.push_back(Location(dir_ino, dentry));
983 }
984 }
985
986 // For all the leaf locations identified, generate paths
987 for (std::vector<Location>::iterator i = leaf_locations.begin(); i != leaf_locations.end(); ++i) {
988 Location const &loc = *i;
989 std::string path = loc.second;
990 inodeno_t ino = loc.first;
991 std::map<inodeno_t, Location>::iterator iter = ino_locations.find(ino);
992 while(iter != ino_locations.end()) {
993 Location const &loc = iter->second;
994 if (!path.empty()) {
995 path = loc.second + "/" + path;
996 } else {
997 path = loc.second + path;
998 }
999 iter = ino_locations.find(loc.first);
1000 }
1001
1002 paths.push_back(path);
1003 }
1004 }
1005
1006
1007 void EMetaBlob::dump(Formatter *f) const
1008 {
1009 f->open_array_section("lumps");
1010 for (const auto& d : lump_order) {
1011 f->open_object_section("lump");
1012 f->open_object_section("dirfrag");
1013 f->dump_stream("dirfrag") << d;
1014 f->close_section(); // dirfrag
1015 f->open_object_section("dirlump");
1016 lump_map.at(d).dump(f);
1017 f->close_section(); // dirlump
1018 f->close_section(); // lump
1019 }
1020 f->close_section(); // lumps
1021
1022 f->open_array_section("roots");
1023 for (const auto& iter : roots) {
1024 f->open_object_section("root");
1025 iter.dump(f);
1026 f->close_section(); // root
1027 }
1028 f->close_section(); // roots
1029
1030 f->open_array_section("tableclient tranactions");
1031 for (const auto& p : table_tids) {
1032 f->open_object_section("transaction");
1033 f->dump_int("tid", p.first);
1034 f->dump_int("version", p.second);
1035 f->close_section(); // transaction
1036 }
1037 f->close_section(); // tableclient transactions
1038
1039 f->dump_int("renamed directory inodeno", renamed_dirino);
1040
1041 f->open_array_section("renamed directory fragments");
1042 for (const auto& p : renamed_dir_frags) {
1043 f->dump_int("frag", p);
1044 }
1045 f->close_section(); // renamed directory fragments
1046
1047 f->dump_int("inotable version", inotablev);
1048 f->dump_int("SessionMap version", sessionmapv);
1049 f->dump_int("allocated ino", allocated_ino);
1050
1051 f->dump_stream("preallocated inos") << preallocated_inos;
1052 f->dump_int("used preallocated ino", used_preallocated_ino);
1053
1054 f->open_object_section("client name");
1055 client_name.dump(f);
1056 f->close_section(); // client name
1057
1058 f->open_array_section("inodes starting a truncate");
1059 for(const auto& ino : truncate_start) {
1060 f->dump_int("inodeno", ino);
1061 }
1062 f->close_section(); // truncate inodes
1063 f->open_array_section("inodes finishing a truncated");
1064 for(const auto& p : truncate_finish) {
1065 f->open_object_section("inode+segment");
1066 f->dump_int("inodeno", p.first);
1067 f->dump_int("truncate starting segment", p.second);
1068 f->close_section(); // truncated inode
1069 }
1070 f->close_section(); // truncate finish inodes
1071
1072 f->open_array_section("destroyed inodes");
1073 for(vector<inodeno_t>::const_iterator i = destroyed_inodes.begin();
1074 i != destroyed_inodes.end(); ++i) {
1075 f->dump_int("inodeno", *i);
1076 }
1077 f->close_section(); // destroyed inodes
1078
1079 f->open_array_section("client requests");
1080 for(const auto& p : client_reqs) {
1081 f->open_object_section("Client request");
1082 f->dump_stream("request ID") << p.first;
1083 f->dump_int("oldest request on client", p.second);
1084 f->close_section(); // request
1085 }
1086 f->close_section(); // client requests
1087 }
1088
1089 void EMetaBlob::generate_test_instances(std::list<EMetaBlob*>& ls)
1090 {
1091 ls.push_back(new EMetaBlob());
1092 }
1093
1094 void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
1095 {
1096 dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
1097
1098 ceph_assert(logseg);
1099
1100 ceph_assert(g_conf()->mds_kill_journal_replay_at != 1);
1101
1102 for (auto& p : roots) {
1103 CInode *in = mds->mdcache->get_inode(p.inode.ino);
1104 bool isnew = in ? false:true;
1105 if (!in)
1106 in = new CInode(mds->mdcache, false, 2, CEPH_NOSNAP);
1107 p.update_inode(mds, in);
1108
1109 if (isnew)
1110 mds->mdcache->add_inode(in);
1111 if (p.is_dirty()) in->_mark_dirty(logseg);
1112 dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
1113 }
1114
1115 CInode *renamed_diri = 0;
1116 CDir *olddir = 0;
1117 if (renamed_dirino) {
1118 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1119 if (renamed_diri)
1120 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl;
1121 else
1122 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl;
1123
1124 int nnull = 0;
1125 for (const auto& lp : lump_order) {
1126 dirlump &lump = lump_map[lp];
1127 if (lump.nnull) {
1128 dout(10) << "EMetaBlob.replay found null dentry in dir " << lp << dendl;
1129 nnull += lump.nnull;
1130 }
1131 }
1132 ceph_assert(nnull <= 1);
1133 }
1134
1135 // keep track of any inodes we unlink and don't relink elsewhere
1136 map<CInode*, CDir*> unlinked;
1137 set<CInode*> linked;
1138
1139 // walk through my dirs (in order!)
1140 for (const auto& lp : lump_order) {
1141 dout(10) << "EMetaBlob.replay dir " << lp << dendl;
1142 dirlump &lump = lump_map[lp];
1143
1144 // the dir
1145 CDir *dir = mds->mdcache->get_force_dirfrag(lp, true);
1146 if (!dir) {
1147 // hmm. do i have the inode?
1148 CInode *diri = mds->mdcache->get_inode((lp).ino);
1149 if (!diri) {
1150 if (MDS_INO_IS_MDSDIR(lp.ino)) {
1151 ceph_assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp.ino);
1152 diri = mds->mdcache->create_system_inode(lp.ino, S_IFDIR|0755);
1153 diri->state_clear(CInode::STATE_AUTH);
1154 dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
1155 } else {
1156 dout(0) << "EMetaBlob.replay missing dir ino " << lp.ino << dendl;
1157 mds->clog->error() << "failure replaying journal (EMetaBlob)";
1158 mds->damaged();
1159 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1160 }
1161 }
1162
1163 // create the dirfrag
1164 dir = diri->get_or_open_dirfrag(mds->mdcache, lp.frag);
1165
1166 if (MDS_INO_IS_BASE(lp.ino))
1167 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
1168
1169 dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;
1170 }
1171 dir->set_version( lump.fnode.version );
1172 dir->fnode = lump.fnode;
1173
1174 if (lump.is_importing()) {
1175 dir->state_set(CDir::STATE_AUTH);
1176 dir->state_clear(CDir::STATE_COMPLETE);
1177 }
1178 if (lump.is_dirty()) {
1179 dir->_mark_dirty(logseg);
1180
1181 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
1182 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir << dendl;
1183 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
1184 logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
1185 } else {
1186 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir << dendl;
1187 }
1188 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
1189 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir << dendl;
1190 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
1191 logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
1192 } else {
1193 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir << dendl;
1194 }
1195 }
1196 if (lump.is_dirty_dft()) {
1197 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir << dendl;
1198 dir->state_set(CDir::STATE_DIRTYDFT);
1199 mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock);
1200 logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree);
1201 }
1202 if (lump.is_new())
1203 dir->mark_new(logseg);
1204 if (lump.is_complete())
1205 dir->mark_complete();
1206
1207 dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;
1208
1209 // decode bits
1210 lump._decode_bits();
1211
1212 // full dentry+inode pairs
1213 for (auto& fb : lump._get_dfull()) {
1214 CDentry *dn = dir->lookup_exact_snap(fb.dn, fb.dnlast);
1215 if (!dn) {
1216 dn = dir->add_null_dentry(fb.dn, fb.dnfirst, fb.dnlast);
1217 dn->set_version(fb.dnv);
1218 if (fb.is_dirty()) dn->_mark_dirty(logseg);
1219 dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl;
1220 } else {
1221 dn->set_version(fb.dnv);
1222 if (fb.is_dirty()) dn->_mark_dirty(logseg);
1223 dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *dn << dendl;
1224 dn->first = fb.dnfirst;
1225 ceph_assert(dn->last == fb.dnlast);
1226 }
1227 if (lump.is_importing())
1228 dn->state_set(CDentry::STATE_AUTH);
1229
1230 CInode *in = mds->mdcache->get_inode(fb.inode.ino, fb.dnlast);
1231 if (!in) {
1232 in = new CInode(mds->mdcache, dn->is_auth(), fb.dnfirst, fb.dnlast);
1233 fb.update_inode(mds, in);
1234 mds->mdcache->add_inode(in);
1235 if (!dn->get_linkage()->is_null()) {
1236 if (dn->get_linkage()->is_primary()) {
1237 unlinked[dn->get_linkage()->get_inode()] = dir;
1238 stringstream ss;
1239 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1240 << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino;
1241 dout(0) << ss.str() << dendl;
1242 mds->clog->warn(ss);
1243 }
1244 dir->unlink_inode(dn, false);
1245 }
1246 if (unlinked.count(in))
1247 linked.insert(in);
1248 dir->link_primary_inode(dn, in);
1249 dout(10) << "EMetaBlob.replay added " << *in << dendl;
1250 } else {
1251 in->first = fb.dnfirst;
1252 fb.update_inode(mds, in);
1253 if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
1254 dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
1255 unlinked[in] = in->get_parent_dir();
1256 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
1257 }
1258 if (dn->get_linkage()->get_inode() != in) {
1259 if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1260 if (dn->get_linkage()->is_primary()) {
1261 unlinked[dn->get_linkage()->get_inode()] = dir;
1262 stringstream ss;
1263 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1264 << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino;
1265 dout(0) << ss.str() << dendl;
1266 mds->clog->warn(ss);
1267 }
1268 dir->unlink_inode(dn, false);
1269 }
1270 if (unlinked.count(in))
1271 linked.insert(in);
1272 dir->link_primary_inode(dn, in);
1273 dout(10) << "EMetaBlob.replay linked " << *in << dendl;
1274 } else {
1275 dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *in << dendl;
1276 }
1277 ceph_assert(in->first == fb.dnfirst ||
1278 (in->is_multiversion() && in->first > fb.dnfirst));
1279 }
1280 if (fb.is_dirty())
1281 in->_mark_dirty(logseg);
1282 if (fb.is_dirty_parent())
1283 in->mark_dirty_parent(logseg, fb.is_dirty_pool());
1284 if (fb.need_snapflush())
1285 logseg->open_files.push_back(&in->item_open_file);
1286 if (dn->is_auth())
1287 in->state_set(CInode::STATE_AUTH);
1288 else
1289 in->state_clear(CInode::STATE_AUTH);
1290 ceph_assert(g_conf()->mds_kill_journal_replay_at != 2);
1291 }
1292
1293 // remote dentries
1294 for (const auto& rb : lump.get_dremote()) {
1295 CDentry *dn = dir->lookup_exact_snap(rb.dn, rb.dnlast);
1296 if (!dn) {
1297 dn = dir->add_remote_dentry(rb.dn, rb.ino, rb.d_type, rb.dnfirst, rb.dnlast);
1298 dn->set_version(rb.dnv);
1299 if (rb.dirty) dn->_mark_dirty(logseg);
1300 dout(10) << "EMetaBlob.replay added " << *dn << dendl;
1301 } else {
1302 if (!dn->get_linkage()->is_null()) {
1303 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1304 if (dn->get_linkage()->is_primary()) {
1305 unlinked[dn->get_linkage()->get_inode()] = dir;
1306 stringstream ss;
1307 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1308 << " " << *dn->get_linkage()->get_inode() << " should be remote " << rb.ino;
1309 dout(0) << ss.str() << dendl;
1310 }
1311 dir->unlink_inode(dn, false);
1312 }
1313 dir->link_remote_inode(dn, rb.ino, rb.d_type);
1314 dn->set_version(rb.dnv);
1315 if (rb.dirty) dn->_mark_dirty(logseg);
1316 dout(10) << "EMetaBlob.replay for [" << rb.dnfirst << "," << rb.dnlast << "] had " << *dn << dendl;
1317 dn->first = rb.dnfirst;
1318 ceph_assert(dn->last == rb.dnlast);
1319 }
1320 if (lump.is_importing())
1321 dn->state_set(CDentry::STATE_AUTH);
1322 }
1323
1324 // null dentries
1325 for (const auto& nb : lump.get_dnull()) {
1326 CDentry *dn = dir->lookup_exact_snap(nb.dn, nb.dnlast);
1327 if (!dn) {
1328 dn = dir->add_null_dentry(nb.dn, nb.dnfirst, nb.dnlast);
1329 dn->set_version(nb.dnv);
1330 if (nb.dirty) dn->_mark_dirty(logseg);
1331 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl;
1332 } else {
1333 dn->first = nb.dnfirst;
1334 if (!dn->get_linkage()->is_null()) {
1335 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1336 CInode *in = dn->get_linkage()->get_inode();
1337 // For renamed inode, We may call CInode::force_dirfrag() later.
1338 // CInode::force_dirfrag() doesn't work well when inode is detached
1339 // from the hierarchy.
1340 if (!renamed_diri || renamed_diri != in) {
1341 if (dn->get_linkage()->is_primary())
1342 unlinked[in] = dir;
1343 dir->unlink_inode(dn);
1344 }
1345 }
1346 dn->set_version(nb.dnv);
1347 if (nb.dirty) dn->_mark_dirty(logseg);
1348 dout(10) << "EMetaBlob.replay had " << *dn << dendl;
1349 ceph_assert(dn->last == nb.dnlast);
1350 }
1351 olddir = dir;
1352 if (lump.is_importing())
1353 dn->state_set(CDentry::STATE_AUTH);
1354
1355 // Make null dentries the first things we trim
1356 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl;
1357 }
1358 }
1359
1360 ceph_assert(g_conf()->mds_kill_journal_replay_at != 3);
1361
1362 if (renamed_dirino) {
1363 if (renamed_diri) {
1364 ceph_assert(unlinked.count(renamed_diri));
1365 ceph_assert(linked.count(renamed_diri));
1366 olddir = unlinked[renamed_diri];
1367 } else {
1368 // we imported a diri we haven't seen before
1369 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1370 ceph_assert(renamed_diri); // it was in the metablob
1371 }
1372
1373 if (olddir) {
1374 if (olddir->authority() != CDIR_AUTH_UNDEF &&
1375 renamed_diri->authority() == CDIR_AUTH_UNDEF) {
1376 ceph_assert(slaveup); // auth to non-auth, must be slave prepare
1377 frag_vec_t leaves;
1378 renamed_diri->dirfragtree.get_leaves(leaves);
1379 for (const auto& leaf : leaves) {
1380 CDir *dir = renamed_diri->get_dirfrag(leaf);
1381 ceph_assert(dir);
1382 if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
1383 // preserve subtree bound until slave commit
1384 slaveup->olddirs.insert(dir->inode);
1385 else
1386 dir->state_set(CDir::STATE_AUTH);
1387 }
1388 }
1389
1390 mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
1391
1392 // see if we can discard the subtree we renamed out of
1393 CDir *root = mds->mdcache->get_subtree_root(olddir);
1394 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
1395 if (slaveup) // preserve the old dir until slave commit
1396 slaveup->olddirs.insert(olddir->inode);
1397 else
1398 mds->mdcache->try_trim_non_auth_subtree(root);
1399 }
1400 }
1401
1402 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1403 if (renamed_diri->authority() != CDIR_AUTH_UNDEF) {
1404 for (const auto& p : renamed_dir_frags) {
1405 CDir *dir = renamed_diri->get_dirfrag(p);
1406 if (dir) {
1407 // we already had the inode before, and we already adjusted this subtree accordingly.
1408 dout(10) << " already had+adjusted rename import bound " << *dir << dendl;
1409 ceph_assert(olddir);
1410 continue;
1411 }
1412 dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, p);
1413 dout(10) << " creating new rename import bound " << *dir << dendl;
1414 dir->state_clear(CDir::STATE_AUTH);
1415 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
1416 }
1417 }
1418
1419 // rename may overwrite an empty directory and move it into stray dir.
1420 unlinked.erase(renamed_diri);
1421 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1422 if (!linked.count(p->first))
1423 continue;
1424 ceph_assert(p->first->is_dir());
1425 mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);
1426 }
1427 }
1428
1429 if (!unlinked.empty()) {
1430 for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p)
1431 unlinked.erase(*p);
1432 dout(10) << " unlinked set contains " << unlinked << dendl;
1433 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1434 CInode *in = p->first;
1435 if (slaveup) { // preserve unlinked inodes until slave commit
1436 slaveup->unlinked.insert(in);
1437 if (in->snaprealm)
1438 in->snaprealm->adjust_parent();
1439 } else
1440 mds->mdcache->remove_inode_recursive(in);
1441 }
1442 }
1443
1444 // table client transactions
1445 for (const auto& p : table_tids) {
1446 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p.first)
1447 << " transaction " << p.second << dendl;
1448 MDSTableClient *client = mds->get_table_client(p.first);
1449 if (client)
1450 client->got_journaled_agree(p.second, logseg);
1451 }
1452
1453 // opened ino?
1454 if (opened_ino) {
1455 CInode *in = mds->mdcache->get_inode(opened_ino);
1456 ceph_assert(in);
1457 dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl;
1458 logseg->open_files.push_back(&in->item_open_file);
1459 }
1460
1461 // allocated_inos
1462 if (inotablev) {
1463 if (mds->inotable->get_version() >= inotablev) {
1464 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1465 << " <= table " << mds->inotable->get_version() << dendl;
1466 } else {
1467 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1468 << " - 1 == table " << mds->inotable->get_version()
1469 << " allocated+used " << allocated_ino
1470 << " prealloc " << preallocated_inos
1471 << dendl;
1472 if (allocated_ino)
1473 mds->inotable->replay_alloc_id(allocated_ino);
1474 if (preallocated_inos.size())
1475 mds->inotable->replay_alloc_ids(preallocated_inos);
1476
1477 // [repair bad inotable updates]
1478 if (inotablev > mds->inotable->get_version()) {
1479 mds->clog->error() << "journal replay inotablev mismatch "
1480 << mds->inotable->get_version() << " -> " << inotablev;
1481 mds->inotable->force_replay_version(inotablev);
1482 }
1483
1484 ceph_assert(inotablev == mds->inotable->get_version());
1485 }
1486 }
1487 if (sessionmapv) {
1488 unsigned diff = (used_preallocated_ino && !preallocated_inos.empty()) ? 2 : 1;
1489 if (mds->sessionmap.get_version() >= sessionmapv) {
1490 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1491 << " <= table " << mds->sessionmap.get_version() << dendl;
1492 } else if (mds->sessionmap.get_version() + diff == sessionmapv) {
1493 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1494 << " - " << diff << " == table " << mds->sessionmap.get_version()
1495 << " prealloc " << preallocated_inos
1496 << " used " << used_preallocated_ino
1497 << dendl;
1498 Session *session = mds->sessionmap.get_session(client_name);
1499 if (session) {
1500 dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl;
1501 if (used_preallocated_ino) {
1502 if (!session->info.prealloc_inos.empty()) {
1503 inodeno_t i = session->take_ino(used_preallocated_ino);
1504 ceph_assert(i == used_preallocated_ino);
1505 session->info.used_inos.clear();
1506 }
1507 mds->sessionmap.replay_dirty_session(session);
1508 }
1509 if (!preallocated_inos.empty()) {
1510 session->info.prealloc_inos.insert(preallocated_inos);
1511 mds->sessionmap.replay_dirty_session(session);
1512 }
1513
1514 } else {
1515 dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
1516 if (used_preallocated_ino)
1517 mds->sessionmap.replay_advance_version();
1518
1519 if (!preallocated_inos.empty())
1520 mds->sessionmap.replay_advance_version();
1521 }
1522 ceph_assert(sessionmapv == mds->sessionmap.get_version());
1523 } else {
1524 mds->clog->error() << "EMetaBlob.replay sessionmap v " << sessionmapv
1525 << " - " << diff << " > table " << mds->sessionmap.get_version();
1526 ceph_assert(g_conf()->mds_wipe_sessions);
1527 mds->sessionmap.wipe();
1528 mds->sessionmap.set_version(sessionmapv);
1529 }
1530 }
1531
1532 // truncating inodes
1533 for (const auto& ino : truncate_start) {
1534 CInode *in = mds->mdcache->get_inode(ino);
1535 ceph_assert(in);
1536 mds->mdcache->add_recovered_truncate(in, logseg);
1537 }
1538 for (const auto& p : truncate_finish) {
1539 LogSegment *ls = mds->mdlog->get_segment(p.second);
1540 if (ls) {
1541 CInode *in = mds->mdcache->get_inode(p.first);
1542 ceph_assert(in);
1543 mds->mdcache->remove_recovered_truncate(in, ls);
1544 }
1545 }
1546
1547 // destroyed inodes
1548 if (!destroyed_inodes.empty()) {
1549 for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
1550 p != destroyed_inodes.end();
1551 ++p) {
1552 CInode *in = mds->mdcache->get_inode(*p);
1553 if (in) {
1554 dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
1555 CDentry *parent = in->get_parent_dn();
1556 mds->mdcache->remove_inode(in);
1557 if (parent) {
1558 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
1559 ceph_assert(parent->get_linkage()->is_null());
1560 }
1561 } else {
1562 dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
1563 }
1564 }
1565 mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes);
1566 }
1567
1568 // client requests
1569 for (const auto& p : client_reqs) {
1570 if (p.first.name.is_client()) {
1571 dout(10) << "EMetaBlob.replay request " << p.first << " trim_to " << p.second << dendl;
1572 inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino;
1573 // if we allocated an inode, there should be exactly one client request id.
1574 ceph_assert(created == inodeno_t() || client_reqs.size() == 1);
1575
1576 Session *session = mds->sessionmap.get_session(p.first.name);
1577 if (session) {
1578 session->add_completed_request(p.first.tid, created);
1579 if (p.second)
1580 session->trim_completed_requests(p.second);
1581 }
1582 }
1583 }
1584
1585 // client flushes
1586 for (const auto& p : client_flushes) {
1587 if (p.first.name.is_client()) {
1588 dout(10) << "EMetaBlob.replay flush " << p.first << " trim_to " << p.second << dendl;
1589 Session *session = mds->sessionmap.get_session(p.first.name);
1590 if (session) {
1591 session->add_completed_flush(p.first.tid);
1592 if (p.second)
1593 session->trim_completed_flushes(p.second);
1594 }
1595 }
1596 }
1597
1598 // update segment
1599 update_segment(logseg);
1600
1601 ceph_assert(g_conf()->mds_kill_journal_replay_at != 4);
1602 }
1603
1604 // -----------------------
1605 // EPurged
1606 void EPurged::update_segment()
1607 {
1608 if (inos.size() && inotablev)
1609 get_segment()->inotablev = inotablev;
1610 return;
1611 }
1612
1613 void EPurged::replay(MDSRank *mds)
1614 {
1615 if (inos.size()) {
1616 LogSegment *ls = mds->mdlog->get_segment(seq);
1617 if (ls) {
1618 ls->purge_inodes.subtract(inos);
1619 }
1620 if (mds->inotable->get_version() >= inotablev) {
1621 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1622 << " >= " << inotablev << ", noop" << dendl;
1623 } else {
1624 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1625 << " < " << inotablev << " " << dendl;
1626 mds->inotable->replay_release_ids(inos);
1627 assert(mds->inotable->get_version() == inotablev);
1628 }
1629 }
1630 update_segment();
1631 }
1632
1633 void EPurged::encode(bufferlist& bl, uint64_t features) const
1634 {
1635 ENCODE_START(1, 1, bl);
1636 encode(inos, bl);
1637 encode(inotablev, bl);
1638 encode(seq, bl);
1639 ENCODE_FINISH(bl);
1640 }
1641
1642 void EPurged::decode(bufferlist::const_iterator& bl)
1643 {
1644 DECODE_START(1, bl);
1645 decode(inos, bl);
1646 decode(inotablev, bl);
1647 decode(seq, bl);
1648 DECODE_FINISH(bl);
1649 }
1650
1651 void EPurged::dump(Formatter *f) const
1652 {
1653 f->dump_stream("inos") << inos;
1654 f->dump_int("inotable version", inotablev);
1655 f->dump_int("segment seq", seq);
1656 }
1657
1658 // -----------------------
1659 // ESession
1660
1661 void ESession::update_segment()
1662 {
1663 get_segment()->sessionmapv = cmapv;
1664 if (inos.size() && inotablev)
1665 get_segment()->inotablev = inotablev;
1666 }
1667
1668 void ESession::replay(MDSRank *mds)
1669 {
1670 if (purge_inos.size())
1671 get_segment()->purge_inodes.insert(purge_inos);
1672
1673 if (mds->sessionmap.get_version() >= cmapv) {
1674 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1675 << " >= " << cmapv << ", noop" << dendl;
1676 } else if (mds->sessionmap.get_version() + 1 == cmapv) {
1677 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1678 << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
1679 Session *session;
1680 if (open) {
1681 session = mds->sessionmap.get_or_add_session(client_inst);
1682 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1683 session->set_client_metadata(client_metadata);
1684 dout(10) << " opened session " << session->info.inst << dendl;
1685 } else {
1686 session = mds->sessionmap.get_session(client_inst.name);
1687 if (session) { // there always should be a session, but there's a bug
1688 if (session->get_connection() == NULL) {
1689 dout(10) << " removed session " << session->info.inst << dendl;
1690 mds->sessionmap.remove_session(session);
1691 session = NULL;
1692 } else {
1693 session->clear(); // the client has reconnected; keep the Session, but reset
1694 dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
1695 }
1696 } else {
1697 mds->clog->error() << "replayed stray Session close event for " << client_inst
1698 << " from time " << stamp << ", ignoring";
1699 }
1700 }
1701 if (session) {
1702 mds->sessionmap.replay_dirty_session(session);
1703 } else {
1704 mds->sessionmap.replay_advance_version();
1705 }
1706 ceph_assert(mds->sessionmap.get_version() == cmapv);
1707 } else {
1708 mds->clog->error() << "ESession.replay sessionmap v " << cmapv
1709 << " - 1 > table " << mds->sessionmap.get_version();
1710 ceph_assert(g_conf()->mds_wipe_sessions);
1711 mds->sessionmap.wipe();
1712 mds->sessionmap.set_version(cmapv);
1713 }
1714
1715 if (inos.size() && inotablev) {
1716 if (mds->inotable->get_version() >= inotablev) {
1717 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1718 << " >= " << inotablev << ", noop" << dendl;
1719 } else {
1720 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1721 << " < " << inotablev << " " << (open ? "add":"remove") << dendl;
1722 ceph_assert(!open); // for now
1723 mds->inotable->replay_release_ids(inos);
1724 ceph_assert(mds->inotable->get_version() == inotablev);
1725 }
1726 }
1727
1728 update_segment();
1729 }
1730
1731 void ESession::encode(bufferlist &bl, uint64_t features) const
1732 {
1733 ENCODE_START(6, 5, bl);
1734 encode(stamp, bl);
1735 encode(client_inst, bl, features);
1736 encode(open, bl);
1737 encode(cmapv, bl);
1738 encode(inos, bl);
1739 encode(inotablev, bl);
1740 encode(client_metadata, bl);
1741 encode(purge_inos, bl);
1742 ENCODE_FINISH(bl);
1743 }
1744
1745 void ESession::decode(bufferlist::const_iterator &bl)
1746 {
1747 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
1748 if (struct_v >= 2)
1749 decode(stamp, bl);
1750 decode(client_inst, bl);
1751 decode(open, bl);
1752 decode(cmapv, bl);
1753 decode(inos, bl);
1754 decode(inotablev, bl);
1755 if (struct_v == 4) {
1756 decode(client_metadata.kv_map, bl);
1757 } else if (struct_v >= 5) {
1758 decode(client_metadata, bl);
1759 }
1760 if (struct_v >= 6){
1761 decode(purge_inos, bl);
1762 }
1763
1764 DECODE_FINISH(bl);
1765 }
1766
1767 void ESession::dump(Formatter *f) const
1768 {
1769 f->dump_stream("client instance") << client_inst;
1770 f->dump_string("open", open ? "true" : "false");
1771 f->dump_int("client map version", cmapv);
1772 f->dump_stream("inos") << inos;
1773 f->dump_int("inotable version", inotablev);
1774 f->open_object_section("client_metadata");
1775 client_metadata.dump(f);
1776 f->close_section(); // client_metadata
1777 }
1778
1779 void ESession::generate_test_instances(std::list<ESession*>& ls)
1780 {
1781 ls.push_back(new ESession);
1782 }
1783
1784 // -----------------------
1785 // ESessions
1786
1787 void ESessions::encode(bufferlist &bl, uint64_t features) const
1788 {
1789 ENCODE_START(2, 1, bl);
1790 encode(client_map, bl, features);
1791 encode(cmapv, bl);
1792 encode(stamp, bl);
1793 encode(client_metadata_map, bl);
1794 ENCODE_FINISH(bl);
1795 }
1796
1797 void ESessions::decode_old(bufferlist::const_iterator &bl)
1798 {
1799 using ceph::decode;
1800 decode(client_map, bl);
1801 decode(cmapv, bl);
1802 if (!bl.end())
1803 decode(stamp, bl);
1804 }
1805
1806 void ESessions::decode_new(bufferlist::const_iterator &bl)
1807 {
1808 DECODE_START(2, bl);
1809 decode(client_map, bl);
1810 decode(cmapv, bl);
1811 decode(stamp, bl);
1812 if (struct_v >= 2)
1813 decode(client_metadata_map, bl);
1814 DECODE_FINISH(bl);
1815 }
1816
1817 void ESessions::dump(Formatter *f) const
1818 {
1819 f->dump_int("client map version", cmapv);
1820
1821 f->open_array_section("client map");
1822 for (map<client_t,entity_inst_t>::const_iterator i = client_map.begin();
1823 i != client_map.end(); ++i) {
1824 f->open_object_section("client");
1825 f->dump_int("client id", i->first.v);
1826 f->dump_stream("client entity") << i->second;
1827 f->close_section(); // client
1828 }
1829 f->close_section(); // client map
1830 }
1831
1832 void ESessions::generate_test_instances(std::list<ESessions*>& ls)
1833 {
1834 ls.push_back(new ESessions());
1835 }
1836
1837 void ESessions::update_segment()
1838 {
1839 get_segment()->sessionmapv = cmapv;
1840 }
1841
1842 void ESessions::replay(MDSRank *mds)
1843 {
1844 if (mds->sessionmap.get_version() >= cmapv) {
1845 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1846 << " >= " << cmapv << ", noop" << dendl;
1847 } else {
1848 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1849 << " < " << cmapv << dendl;
1850 mds->sessionmap.replay_open_sessions(cmapv, client_map, client_metadata_map);
1851 }
1852 update_segment();
1853 }
1854
1855
1856 // -----------------------
1857 // ETableServer
1858
1859 void ETableServer::encode(bufferlist& bl, uint64_t features) const
1860 {
1861 ENCODE_START(3, 3, bl);
1862 encode(stamp, bl);
1863 encode(table, bl);
1864 encode(op, bl);
1865 encode(reqid, bl);
1866 encode(bymds, bl);
1867 encode(mutation, bl);
1868 encode(tid, bl);
1869 encode(version, bl);
1870 ENCODE_FINISH(bl);
1871 }
1872
1873 void ETableServer::decode(bufferlist::const_iterator &bl)
1874 {
1875 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
1876 if (struct_v >= 2)
1877 decode(stamp, bl);
1878 decode(table, bl);
1879 decode(op, bl);
1880 decode(reqid, bl);
1881 decode(bymds, bl);
1882 decode(mutation, bl);
1883 decode(tid, bl);
1884 decode(version, bl);
1885 DECODE_FINISH(bl);
1886 }
1887
1888 void ETableServer::dump(Formatter *f) const
1889 {
1890 f->dump_int("table id", table);
1891 f->dump_int("op", op);
1892 f->dump_int("request id", reqid);
1893 f->dump_int("by mds", bymds);
1894 f->dump_int("tid", tid);
1895 f->dump_int("version", version);
1896 }
1897
1898 void ETableServer::generate_test_instances(std::list<ETableServer*>& ls)
1899 {
1900 ls.push_back(new ETableServer());
1901 }
1902
1903
1904 void ETableServer::update_segment()
1905 {
1906 get_segment()->tablev[table] = version;
1907 }
1908
1909 void ETableServer::replay(MDSRank *mds)
1910 {
1911 MDSTableServer *server = mds->get_table_server(table);
1912 if (!server)
1913 return;
1914
1915 if (server->get_version() >= version) {
1916 dout(10) << "ETableServer.replay " << get_mdstable_name(table)
1917 << " " << get_mdstableserver_opname(op)
1918 << " event " << version
1919 << " <= table " << server->get_version() << dendl;
1920 return;
1921 }
1922
1923 dout(10) << " ETableServer.replay " << get_mdstable_name(table)
1924 << " " << get_mdstableserver_opname(op)
1925 << " event " << version << " - 1 == table " << server->get_version() << dendl;
1926 ceph_assert(version-1 == server->get_version());
1927
1928 switch (op) {
1929 case TABLESERVER_OP_PREPARE: {
1930 server->_note_prepare(bymds, reqid, true);
1931 bufferlist out;
1932 server->_prepare(mutation, reqid, bymds, out);
1933 mutation = std::move(out);
1934 break;
1935 }
1936 case TABLESERVER_OP_COMMIT:
1937 server->_commit(tid, ref_t<MMDSTableRequest>());
1938 server->_note_commit(tid, true);
1939 break;
1940 case TABLESERVER_OP_ROLLBACK:
1941 server->_rollback(tid);
1942 server->_note_rollback(tid, true);
1943 break;
1944 case TABLESERVER_OP_SERVER_UPDATE:
1945 server->_server_update(mutation);
1946 server->_note_server_update(mutation, true);
1947 break;
1948 default:
1949 mds->clog->error() << "invalid tableserver op in ETableServer";
1950 mds->damaged();
1951 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1952 }
1953
1954 ceph_assert(version == server->get_version());
1955 update_segment();
1956 }
1957
1958
1959 // ---------------------
1960 // ETableClient
1961
1962 void ETableClient::encode(bufferlist& bl, uint64_t features) const
1963 {
1964 ENCODE_START(3, 3, bl);
1965 encode(stamp, bl);
1966 encode(table, bl);
1967 encode(op, bl);
1968 encode(tid, bl);
1969 ENCODE_FINISH(bl);
1970 }
1971
// Deserialize an ETableClient from bl, reading fields in the order encode()
// writes them.
void ETableClient::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  // stamp is only read for encodings with struct_v >= 2
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(table, bl);
  decode(op, bl);
  decode(tid, bl);
  DECODE_FINISH(bl);
}
1982
// Emit table, op and tid as integer fields on the formatter.
// The stamp is not dumped here.
void ETableClient::dump(Formatter *f) const
{
  f->dump_int("table", table);
  f->dump_int("op", op);
  f->dump_int("tid", tid);
}
1989
1990 void ETableClient::generate_test_instances(std::list<ETableClient*>& ls)
1991 {
1992 ls.push_back(new ETableClient());
1993 }
1994
1995 void ETableClient::replay(MDSRank *mds)
1996 {
1997 dout(10) << " ETableClient.replay " << get_mdstable_name(table)
1998 << " op " << get_mdstableserver_opname(op)
1999 << " tid " << tid << dendl;
2000
2001 MDSTableClient *client = mds->get_table_client(table);
2002 if (!client)
2003 return;
2004
2005 ceph_assert(op == TABLESERVER_OP_ACK);
2006 client->got_journaled_ack(tid);
2007 }
2008
2009
2010 // -----------------------
2011 // ESnap
2012 /*
2013 void ESnap::update_segment()
2014 {
2015 get_segment()->tablev[TABLE_SNAP] = version;
2016 }
2017
2018 void ESnap::replay(MDSRank *mds)
2019 {
2020 if (mds->snaptable->get_version() >= version) {
2021 dout(10) << "ESnap.replay event " << version
2022 << " <= table " << mds->snaptable->get_version() << dendl;
2023 return;
2024 }
2025
2026 dout(10) << " ESnap.replay event " << version
2027 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2028 ceph_assert(version-1 == mds->snaptable->get_version());
2029
2030 if (create) {
2031 version_t v;
2032 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2033 ceph_assert(s == snap.snapid);
2034 } else {
2035 mds->snaptable->remove(snap.snapid);
2036 }
2037
2038 ceph_assert(version == mds->snaptable->get_version());
2039 }
2040 */
2041
2042
2043
2044 // -----------------------
2045 // EUpdate
2046
// Serialize this event into bl inside an ENCODE_START(4, 4)/ENCODE_FINISH
// envelope.  Field order: stamp, type, metablob, client_map, cmapv, reqid,
// had_slaves.  `features` is forwarded only to the metablob encoder.
void EUpdate::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(4, 4, bl);
  encode(stamp, bl);
  encode(type, bl);
  encode(metablob, bl, features);
  encode(client_map, bl);
  encode(cmapv, bl);
  encode(reqid, bl);
  encode(had_slaves, bl);
  ENCODE_FINISH(bl);
}
2059
// Deserialize an EUpdate from bl, reading fields in the order encode()
// writes them.
void EUpdate::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
  // stamp is only read for encodings with struct_v >= 2
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(type, bl);
  decode(metablob, bl);
  decode(client_map, bl);
  // cmapv is only read for encodings with struct_v >= 3
  if (struct_v >= 3)
    decode(cmapv, bl);
  decode(reqid, bl);
  decode(had_slaves, bl);
  DECODE_FINISH(bl);
}
2074
2075 void EUpdate::dump(Formatter *f) const
2076 {
2077 f->open_object_section("metablob");
2078 metablob.dump(f);
2079 f->close_section(); // metablob
2080
2081 f->dump_string("type", type);
2082 f->dump_int("client map length", client_map.length());
2083 f->dump_int("client map version", cmapv);
2084 f->dump_stream("reqid") << reqid;
2085 f->dump_string("had slaves", had_slaves ? "true" : "false");
2086 }
2087
2088 void EUpdate::generate_test_instances(std::list<EUpdate*>& ls)
2089 {
2090 ls.push_back(new EUpdate());
2091 }
2092
2093
2094 void EUpdate::update_segment()
2095 {
2096 auto&& segment = get_segment();
2097 metablob.update_segment(segment);
2098
2099 if (client_map.length())
2100 segment->sessionmapv = cmapv;
2101
2102 if (had_slaves)
2103 segment->uncommitted_masters.insert(reqid);
2104 }
2105
2106 void EUpdate::replay(MDSRank *mds)
2107 {
2108 auto&& segment = get_segment();
2109 metablob.replay(mds, segment);
2110
2111 if (had_slaves) {
2112 dout(10) << "EUpdate.replay " << reqid << " had slaves, expecting a matching ECommitted" << dendl;
2113 segment->uncommitted_masters.insert(reqid);
2114 set<mds_rank_t> slaves;
2115 mds->mdcache->add_uncommitted_master(reqid, segment, slaves, true);
2116 }
2117
2118 if (client_map.length()) {
2119 if (mds->sessionmap.get_version() >= cmapv) {
2120 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2121 << " <= table " << mds->sessionmap.get_version() << dendl;
2122 } else {
2123 dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version()
2124 << " < " << cmapv << dendl;
2125 // open client sessions?
2126 map<client_t,entity_inst_t> cm;
2127 map<client_t,client_metadata_t> cmm;
2128 auto blp = client_map.cbegin();
2129 using ceph::decode;
2130 decode(cm, blp);
2131 if (!blp.end())
2132 decode(cmm, blp);
2133 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
2134 }
2135 }
2136 update_segment();
2137 }
2138
2139
2140 // ------------------------
2141 // EOpen
2142
// Serialize this event into bl inside an ENCODE_START(4, 3)/ENCODE_FINISH
// envelope.  Field order: stamp, metablob, inos, snap_inos.
// `features` is forwarded only to the metablob encoder.
void EOpen::encode(bufferlist &bl, uint64_t features) const {
  ENCODE_START(4, 3, bl);
  encode(stamp, bl);
  encode(metablob, bl, features);
  encode(inos, bl);
  encode(snap_inos, bl);
  ENCODE_FINISH(bl);
}
2151
2152 void EOpen::decode(bufferlist::const_iterator &bl) {
2153 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2154 if (struct_v >= 2)
2155 decode(stamp, bl);
2156 decode(metablob, bl);
2157 decode(inos, bl);
2158 if (struct_v >= 4)
2159 decode(snap_inos, bl);
2160 DECODE_FINISH(bl);
2161 }
2162
2163 void EOpen::dump(Formatter *f) const
2164 {
2165 f->open_object_section("metablob");
2166 metablob.dump(f);
2167 f->close_section(); // metablob
2168 f->open_array_section("inos involved");
2169 for (vector<inodeno_t>::const_iterator i = inos.begin();
2170 i != inos.end(); ++i) {
2171 f->dump_int("ino", *i);
2172 }
2173 f->close_section(); // inos
2174 }
2175
2176 void EOpen::generate_test_instances(std::list<EOpen*>& ls)
2177 {
2178 ls.push_back(new EOpen());
2179 ls.push_back(new EOpen());
2180 ls.back()->add_ino(0);
2181 }
2182
2183 void EOpen::update_segment()
2184 {
2185 // ??
2186 }
2187
2188 void EOpen::replay(MDSRank *mds)
2189 {
2190 dout(10) << "EOpen.replay " << dendl;
2191 auto&& segment = get_segment();
2192 metablob.replay(mds, segment);
2193
2194 // note which segments inodes belong to, so we don't have to start rejournaling them
2195 for (const auto &ino : inos) {
2196 CInode *in = mds->mdcache->get_inode(ino);
2197 if (!in) {
2198 dout(0) << "EOpen.replay ino " << ino << " not in metablob" << dendl;
2199 ceph_assert(in);
2200 }
2201 segment->open_files.push_back(&in->item_open_file);
2202 }
2203 for (const auto &vino : snap_inos) {
2204 CInode *in = mds->mdcache->get_inode(vino);
2205 if (!in) {
2206 dout(0) << "EOpen.replay ino " << vino << " not in metablob" << dendl;
2207 ceph_assert(in);
2208 }
2209 segment->open_files.push_back(&in->item_open_file);
2210 }
2211 }
2212
2213
2214 // -----------------------
2215 // ECommitted
2216
2217 void ECommitted::replay(MDSRank *mds)
2218 {
2219 if (mds->mdcache->uncommitted_masters.count(reqid)) {
2220 dout(10) << "ECommitted.replay " << reqid << dendl;
2221 mds->mdcache->uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2222 mds->mdcache->uncommitted_masters.erase(reqid);
2223 } else {
2224 dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl;
2225 }
2226 }
2227
// Serialize this event into bl inside an ENCODE_START(3, 3)/ENCODE_FINISH
// envelope.  Field order: stamp, reqid.  `features` is not consulted.
void ECommitted::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  encode(stamp, bl);
  encode(reqid, bl);
  ENCODE_FINISH(bl);
}
2235
// Deserialize an ECommitted from bl, reading fields in the order encode()
// writes them.
void ECommitted::decode(bufferlist::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  // stamp is only read for encodings with struct_v >= 2
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(reqid, bl);
  DECODE_FINISH(bl);
}
2244
// Emit the stamp and request id as streamed fields on the formatter.
void ECommitted::dump(Formatter *f) const {
  f->dump_stream("stamp") << stamp;
  f->dump_stream("reqid") << reqid;
}
2249
2250 void ECommitted::generate_test_instances(std::list<ECommitted*>& ls)
2251 {
2252 ls.push_back(new ECommitted);
2253 ls.push_back(new ECommitted);
2254 ls.back()->stamp = utime_t(1, 2);
2255 ls.back()->reqid = metareqid_t(entity_name_t::CLIENT(123), 456);
2256 }
2257
2258 // -----------------------
2259 // ESlaveUpdate
2260
// Serialize this rollback record into bl inside an ENCODE_START(3, 2)/
// ENCODE_FINISH envelope.  Field order: reqid, ino, was_inc, old_ctime,
// old_dir_mtime, old_dir_rctime, snapbl.
void link_rollback::encode(bufferlist &bl) const
{
  ENCODE_START(3, 2, bl);
  encode(reqid, bl);
  encode(ino, bl);
  encode(was_inc, bl);
  encode(old_ctime, bl);
  encode(old_dir_mtime, bl);
  encode(old_dir_rctime, bl);
  encode(snapbl, bl);
  ENCODE_FINISH(bl);
}
2273
// Deserialize a link_rollback from bl, reading fields in the order encode()
// writes them.
void link_rollback::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
  decode(reqid, bl);
  decode(ino, bl);
  decode(was_inc, bl);
  decode(old_ctime, bl);
  decode(old_dir_mtime, bl);
  decode(old_dir_rctime, bl);
  // snapbl is only read for encodings with struct_v >= 3
  if (struct_v >= 3)
    decode(snapbl, bl);
  DECODE_FINISH(bl);
}
2287
2288 void link_rollback::dump(Formatter *f) const
2289 {
2290 f->dump_stream("metareqid") << reqid;
2291 f->dump_int("ino", ino);
2292 f->dump_string("was incremented", was_inc ? "true" : "false");
2293 f->dump_stream("old_ctime") << old_ctime;
2294 f->dump_stream("old_dir_mtime") << old_dir_mtime;
2295 f->dump_stream("old_dir_rctime") << old_dir_rctime;
2296 }
2297
2298 void link_rollback::generate_test_instances(std::list<link_rollback*>& ls)
2299 {
2300 ls.push_back(new link_rollback());
2301 }
2302
// Serialize this rollback record into bl inside an ENCODE_START(3, 2)/
// ENCODE_FINISH envelope.  Field order: reqid, src_dir, src_dname, dest_dir,
// dest_dname, snapbl.
void rmdir_rollback::encode(bufferlist& bl) const
{
  ENCODE_START(3, 2, bl);
  encode(reqid, bl);
  encode(src_dir, bl);
  encode(src_dname, bl);
  encode(dest_dir, bl);
  encode(dest_dname, bl);
  encode(snapbl, bl);
  ENCODE_FINISH(bl);
}
2314
// Deserialize an rmdir_rollback from bl, reading fields in the order
// encode() writes them.
void rmdir_rollback::decode(bufferlist::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
  decode(reqid, bl);
  decode(src_dir, bl);
  decode(src_dname, bl);
  decode(dest_dir, bl);
  decode(dest_dname, bl);
  // snapbl is only read for encodings with struct_v >= 3
  if (struct_v >= 3)
    decode(snapbl, bl);
  DECODE_FINISH(bl);
}
2327
// Emit the request id plus the source and destination directory / dentry
// name on the formatter.  snapbl is not dumped here.
void rmdir_rollback::dump(Formatter *f) const
{
  f->dump_stream("metareqid") << reqid;
  f->dump_stream("source directory") << src_dir;
  f->dump_string("source dname", src_dname);
  f->dump_stream("destination directory") << dest_dir;
  f->dump_string("destination dname", dest_dname);
}
2336
2337 void rmdir_rollback::generate_test_instances(std::list<rmdir_rollback*>& ls)
2338 {
2339 ls.push_back(new rmdir_rollback());
2340 }
2341
// Serialize this dentry record into bl inside an ENCODE_START(2, 2)/
// ENCODE_FINISH envelope.  Field order: dirfrag, dirfrag_old_mtime,
// dirfrag_old_rctime, ino, remote_ino, dname, remote_d_type, old_ctime.
void rename_rollback::drec::encode(bufferlist &bl) const
{
  ENCODE_START(2, 2, bl);
  encode(dirfrag, bl);
  encode(dirfrag_old_mtime, bl);
  encode(dirfrag_old_rctime, bl);
  encode(ino, bl);
  encode(remote_ino, bl);
  encode(dname, bl);
  encode(remote_d_type, bl);
  encode(old_ctime, bl);
  ENCODE_FINISH(bl);
}
2355
// Deserialize a drec from bl, reading fields in the order encode() writes
// them.  No field is conditional on the struct version.
void rename_rollback::drec::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  decode(dirfrag, bl);
  decode(dirfrag_old_mtime, bl);
  decode(dirfrag_old_rctime, bl);
  decode(ino, bl);
  decode(remote_ino, bl);
  decode(dname, bl);
  decode(remote_d_type, bl);
  decode(old_ctime, bl);
  DECODE_FINISH(bl);
}
2369
2370 void rename_rollback::drec::dump(Formatter *f) const
2371 {
2372 f->dump_stream("directory fragment") << dirfrag;
2373 f->dump_stream("directory old mtime") << dirfrag_old_mtime;
2374 f->dump_stream("directory old rctime") << dirfrag_old_rctime;
2375 f->dump_int("ino", ino);
2376 f->dump_int("remote ino", remote_ino);
2377 f->dump_string("dname", dname);
2378 uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries
2379 string type_string;
2380 switch(type) {
2381 case S_IFREG:
2382 type_string = "file"; break;
2383 case S_IFLNK:
2384 type_string = "symlink"; break;
2385 case S_IFDIR:
2386 type_string = "directory"; break;
2387 default:
2388 type_string = "UNKNOWN-" + stringify((int)type); break;
2389 }
2390 f->dump_string("remote dtype", type_string);
2391 f->dump_stream("old ctime") << old_ctime;
2392 }
2393
2394 void rename_rollback::drec::generate_test_instances(std::list<drec*>& ls)
2395 {
2396 ls.push_back(new drec());
2397 ls.back()->remote_d_type = IFTODT(S_IFREG);
2398 }
2399
// Serialize this rollback record into bl inside an ENCODE_START(3, 2)/
// ENCODE_FINISH envelope.  Field order: reqid, orig_src, orig_dest, stray,
// ctime, srci_snapbl, desti_snapbl.
void rename_rollback::encode(bufferlist &bl) const
{
  ENCODE_START(3, 2, bl);
  encode(reqid, bl);
  encode(orig_src, bl);
  encode(orig_dest, bl);
  encode(stray, bl);
  encode(ctime, bl);
  encode(srci_snapbl, bl);
  encode(desti_snapbl, bl);
  ENCODE_FINISH(bl);
}
2412
// Deserialize a rename_rollback from bl, reading fields in the order
// encode() writes them.
void rename_rollback::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
  decode(reqid, bl);
  decode(orig_src, bl);
  decode(orig_dest, bl);
  decode(stray, bl);
  decode(ctime, bl);
  // the two snap buffers are only read for encodings with struct_v >= 3
  if (struct_v >= 3) {
    decode(srci_snapbl, bl);
    decode(desti_snapbl, bl);
  }
  DECODE_FINISH(bl);
}
2427
// Emit the request id, then the original source, original destination and
// stray dentry records as nested objects, then the ctime.
// The snap buffers are not dumped here.
void rename_rollback::dump(Formatter *f) const
{
  f->dump_stream("request id") << reqid;
  f->open_object_section("original src drec");
  orig_src.dump(f);
  f->close_section(); // original src drec
  f->open_object_section("original dest drec");
  orig_dest.dump(f);
  f->close_section(); // original dest drec
  f->open_object_section("stray drec");
  stray.dump(f);
  f->close_section(); // stray drec
  f->dump_stream("ctime") << ctime;
}
2442
2443 void rename_rollback::generate_test_instances(std::list<rename_rollback*>& ls)
2444 {
2445 ls.push_back(new rename_rollback());
2446 ls.back()->orig_src.remote_d_type = IFTODT(S_IFREG);
2447 ls.back()->orig_dest.remote_d_type = IFTODT(S_IFREG);
2448 ls.back()->stray.remote_d_type = IFTODT(S_IFREG);
2449 }
2450
// Serialize this event into bl inside an ENCODE_START(3, 3)/ENCODE_FINISH
// envelope.  Field order: stamp, type, reqid, master, op, origop, commit,
// rollback.  `features` is forwarded only to the commit blob encoder.
void ESlaveUpdate::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  encode(stamp, bl);
  encode(type, bl);
  encode(reqid, bl);
  encode(master, bl);
  encode(op, bl);
  encode(origop, bl);
  encode(commit, bl, features);
  encode(rollback, bl);
  ENCODE_FINISH(bl);
}
2464
// Deserialize an ESlaveUpdate from bl, reading fields in the order encode()
// writes them.
void ESlaveUpdate::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
  // stamp is only read for encodings with struct_v >= 2
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(type, bl);
  decode(reqid, bl);
  decode(master, bl);
  decode(op, bl);
  decode(origop, bl);
  decode(commit, bl);
  decode(rollback, bl);
  DECODE_FINISH(bl);
}
2479
// Emit the commit blob as a nested "metablob" object, followed by the
// rollback buffer's byte length, the type, request id, master, op and
// original op.
void ESlaveUpdate::dump(Formatter *f) const
{
  f->open_object_section("metablob");
  commit.dump(f);
  f->close_section(); // metablob

  f->dump_int("rollback length", rollback.length());
  f->dump_string("type", type);
  f->dump_stream("metareqid") << reqid;
  f->dump_int("master", master);
  f->dump_int("op", op);
  f->dump_int("original op", origop);
}
2493
2494 void ESlaveUpdate::generate_test_instances(std::list<ESlaveUpdate*>& ls)
2495 {
2496 ls.push_back(new ESlaveUpdate());
2497 }
2498
2499
2500 void ESlaveUpdate::replay(MDSRank *mds)
2501 {
2502 MDSlaveUpdate *su;
2503 auto&& segment = get_segment();
2504 switch (op) {
2505 case ESlaveUpdate::OP_PREPARE:
2506 dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master
2507 << ": applying commit, saving rollback info" << dendl;
2508 su = new MDSlaveUpdate(origop, rollback, segment->slave_updates);
2509 commit.replay(mds, segment, su);
2510 mds->mdcache->add_uncommitted_slave_update(reqid, master, su);
2511 break;
2512
2513 case ESlaveUpdate::OP_COMMIT:
2514 su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
2515 if (su) {
2516 dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl;
2517 mds->mdcache->finish_uncommitted_slave_update(reqid, master);
2518 } else {
2519 dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master
2520 << ": ignoring, no previously saved prepare" << dendl;
2521 }
2522 break;
2523
2524 case ESlaveUpdate::OP_ROLLBACK:
2525 dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
2526 << ": applying rollback commit blob" << dendl;
2527 commit.replay(mds, segment);
2528 su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
2529 if (su)
2530 mds->mdcache->finish_uncommitted_slave_update(reqid, master);
2531 break;
2532
2533 default:
2534 mds->clog->error() << "invalid op in ESlaveUpdate";
2535 mds->damaged();
2536 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2537 }
2538 }
2539
2540
2541 // -----------------------
2542 // ESubtreeMap
2543
// Serialize this event into bl inside an ENCODE_START(6, 5)/ENCODE_FINISH
// envelope.  Field order: stamp, metablob, subtrees, ambiguous_subtrees,
// expire_pos, event_seq.  `features` is forwarded only to the metablob
// encoder.
void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(6, 5, bl);
  encode(stamp, bl);
  encode(metablob, bl, features);
  encode(subtrees, bl);
  encode(ambiguous_subtrees, bl);
  encode(expire_pos, bl);
  encode(event_seq, bl);
  ENCODE_FINISH(bl);
}
2555
// Deserialize an ESubtreeMap from bl, reading fields in the order encode()
// writes them.  Several fields are only read from sufficiently new
// encodings, as gated by struct_v below.
void ESubtreeMap::decode(bufferlist::const_iterator &bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
  if (struct_v >= 2)
    decode(stamp, bl);
  decode(metablob, bl);
  decode(subtrees, bl);
  // ambiguous_subtrees precedes expire_pos on the wire although it is gated
  // on the higher struct_v
  if (struct_v >= 4)
    decode(ambiguous_subtrees, bl);
  if (struct_v >= 3)
    decode(expire_pos, bl);
  if (struct_v >= 6)
    decode(event_seq, bl);
  DECODE_FINISH(bl);
}
2571
2572 void ESubtreeMap::dump(Formatter *f) const
2573 {
2574 f->open_object_section("metablob");
2575 metablob.dump(f);
2576 f->close_section(); // metablob
2577
2578 f->open_array_section("subtrees");
2579 for(map<dirfrag_t,vector<dirfrag_t> >::const_iterator i = subtrees.begin();
2580 i != subtrees.end(); ++i) {
2581 f->open_object_section("tree");
2582 f->dump_stream("root dirfrag") << i->first;
2583 for (vector<dirfrag_t>::const_iterator j = i->second.begin();
2584 j != i->second.end(); ++j) {
2585 f->dump_stream("bound dirfrag") << *j;
2586 }
2587 f->close_section(); // tree
2588 }
2589 f->close_section(); // subtrees
2590
2591 f->open_array_section("ambiguous subtrees");
2592 for(set<dirfrag_t>::const_iterator i = ambiguous_subtrees.begin();
2593 i != ambiguous_subtrees.end(); ++i) {
2594 f->dump_stream("dirfrag") << *i;
2595 }
2596 f->close_section(); // ambiguous subtrees
2597
2598 f->dump_int("expire position", expire_pos);
2599 }
2600
2601 void ESubtreeMap::generate_test_instances(std::list<ESubtreeMap*>& ls)
2602 {
2603 ls.push_back(new ESubtreeMap());
2604 }
2605
/**
 * Replay a journaled subtree map.
 *
 * First advances the journaler's expire position if this event carries a
 * larger one.  Then:
 *  - if the cache already has subtrees, the journaled map is only verified
 *    against the cache; every mismatch is reported to the cluster log and
 *    counted, and the function returns without modifying subtree auth;
 *  - otherwise the metablob is replayed and the subtree auth (including
 *    ambiguous imports) is reconstructed from the journaled map.
 */
void ESubtreeMap::replay(MDSRank *mds)
{
  // only ever move the expire position forward; 0 means "not set"
  if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos())
    mds->mdlog->journaler->set_expire_pos(expire_pos);

  // suck up the subtree map?
  if (mds->mdcache->is_subtrees()) {
    dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
    int errors = 0;

    // pass 1: every journaled subtree must exist in the cache, be a subtree
    // there, be owned by this rank, and have exactly the journaled bounds
    for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
         p != subtrees.end();
         ++p) {
      CDir *dir = mds->mdcache->get_dirfrag(p->first);
      if (!dir) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " subtree root " << p->first << " not in cache";
        ++errors;
        continue;
      }

      if (!mds->mdcache->is_subtree(dir)) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " subtree root " << p->first << " not a subtree in cache";
        ++errors;
        continue;
      }
      if (dir->get_dir_auth().first != mds->get_nodeid()) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " subtree root " << p->first
                           << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
        ++errors;
        continue;
      }

      // done for every journaled bound before the bounds are compared
      for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
        mds->mdcache->get_force_dirfrag(*q, true);

      // tick off each journaled bound against the cache's bounds; whatever
      // remains in `bounds` afterwards exists only in the cache
      set<CDir*> bounds;
      mds->mdcache->get_subtree_bounds(dir, bounds);
      for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
        CDir *b = mds->mdcache->get_dirfrag(*q);
        if (!b) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                             << " subtree " << p->first << " bound " << *q << " not in cache";
          ++errors;
          continue;
        }
        if (bounds.count(b) == 0) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                             << " subtree " << p->first << " bound " << *q << " not a bound in cache";
          ++errors;
          continue;
        }
        bounds.erase(b);
      }
      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " subtree " << p->first << " has extra bound in cache " << (*q)->dirfrag();
        ++errors;
      }

      // the ambiguous-import state must agree in both directions
      if (ambiguous_subtrees.count(p->first)) {
        if (!mds->mdcache->have_ambiguous_import(p->first)) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                             << " subtree " << p->first << " is ambiguous but is not in our cache";
          ++errors;
        }
      } else {
        if (mds->mdcache->have_ambiguous_import(p->first)) {
          mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                             << " subtree " << p->first << " is not ambiguous but is in our cache";
          ++errors;
        }
      }
    }

    // pass 2: every cache subtree owned by this rank must appear in the
    // journaled map
    std::vector<CDir*> dirs;
    mds->mdcache->get_subtrees(dirs);
    for (const auto& dir : dirs) {
      if (dir->get_dir_auth().first != mds->get_nodeid())
        continue;
      if (subtrees.count(dir->dirfrag()) == 0) {
        mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
                           << " does not include cache subtree " << dir->dirfrag();
        ++errors;
      }
    }

    if (errors) {
      dout(0) << "journal subtrees: " << subtrees << dendl;
      dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
      mds->mdcache->show_subtrees();
      // mismatches are fatal only when mds_debug_subtrees is enabled
      ceph_assert(!g_conf()->mds_debug_subtrees || errors == 0);
    }
    return;
  }

  dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;

  // first, stick the spanning tree in my cache
  //metablob.print(*_dout);
  metablob.replay(mds, get_segment());

  // restore import/export maps
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = mds->mdcache->get_dirfrag(p->first);
    ceph_assert(dir);
    if (ambiguous_subtrees.count(p->first)) {
      // ambiguous!
      mds->mdcache->add_ambiguous_import(p->first, p->second);
      mds->mdcache->adjust_bounded_subtree_auth(dir, p->second,
                                                mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
    } else {
      // not ambiguous
      mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
    }
  }

  mds->mdcache->recalc_auth_bits(true);

  mds->mdcache->show_subtrees();
}
2731
2732
2733
2734 // -----------------------
2735 // EFragment
2736
2737 void EFragment::replay(MDSRank *mds)
2738 {
2739 dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;
2740
2741 std::vector<CDir*> resultfrags;
2742 MDSContext::vec waiters;
2743
2744 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2745 // it will be once we replay the metablob , but first we need to
2746 // refragment anything we already have in the cache.
2747 CInode *in = mds->mdcache->get_inode(ino);
2748
2749 auto&& segment = get_segment();
2750 switch (op) {
2751 case OP_PREPARE:
2752 mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, segment, &rollback);
2753
2754 if (in)
2755 mds->mdcache->adjust_dir_fragments(in, basefrag, bits, &resultfrags, waiters, true);
2756 break;
2757
2758 case OP_ROLLBACK: {
2759 frag_vec_t old_frags;
2760 if (in) {
2761 in->dirfragtree.get_leaves_under(basefrag, old_frags);
2762 if (orig_frags.empty()) {
2763 // old format EFragment
2764 mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, &resultfrags, waiters, true);
2765 } else {
2766 for (const auto& fg : orig_frags)
2767 mds->mdcache->force_dir_fragment(in, fg);
2768 }
2769 }
2770 mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), std::move(old_frags));
2771 break;
2772 }
2773
2774 case OP_COMMIT:
2775 case OP_FINISH:
2776 mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
2777 break;
2778
2779 default:
2780 ceph_abort();
2781 }
2782
2783 metablob.replay(mds, segment);
2784 if (in && g_conf()->mds_debug_frag)
2785 in->verify_dirfrags();
2786 }
2787
// Serialize this event into bl inside an ENCODE_START(5, 4)/ENCODE_FINISH
// envelope.  Field order: stamp, op, ino, basefrag, bits, metablob,
// orig_frags, rollback.  `features` is forwarded only to the metablob
// encoder.
void EFragment::encode(bufferlist &bl, uint64_t features) const {
  ENCODE_START(5, 4, bl);
  encode(stamp, bl);
  encode(op, bl);
  encode(ino, bl);
  encode(basefrag, bl);
  encode(bits, bl);
  encode(metablob, bl, features);
  encode(orig_frags, bl);
  encode(rollback, bl);
  ENCODE_FINISH(bl);
}
2800
// Deserialize an EFragment from bl, reading fields in the order encode()
// writes them.
void EFragment::decode(bufferlist::const_iterator &bl) {
  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
  // stamp is only read for encodings with struct_v >= 2
  if (struct_v >= 2)
    decode(stamp, bl);
  // op is only read for encodings with struct_v >= 3
  if (struct_v >= 3)
    decode(op, bl);
  decode(ino, bl);
  decode(basefrag, bl);
  decode(bits, bl);
  decode(metablob, bl);
  // orig_frags and rollback are only read for encodings with struct_v >= 5
  if (struct_v >= 5) {
    decode(orig_frags, bl);
    decode(rollback, bl);
  }
  DECODE_FINISH(bl);
}
2817
// Emit the op name, inode number, base fragment and bit count on the
// formatter.  The metablob dump is commented out below, so the metablob is
// not part of the output.
void EFragment::dump(Formatter *f) const
{
  /*f->open_object_section("Metablob");
  metablob.dump(f); // sadly we don't have this; dunno if we'll get it
  f->close_section();*/
  f->dump_string("op", op_name(op));
  f->dump_stream("ino") << ino;
  f->dump_stream("base frag") << basefrag;
  f->dump_int("bits", bits);
}
2828
2829 void EFragment::generate_test_instances(std::list<EFragment*>& ls)
2830 {
2831 ls.push_back(new EFragment);
2832 ls.push_back(new EFragment);
2833 ls.back()->op = OP_PREPARE;
2834 ls.back()->ino = 1;
2835 ls.back()->bits = 5;
2836 }
2837
// Serialize this rollback record into bl inside an ENCODE_START(1, 1)/
// ENCODE_FINISH envelope; the only field is fnode.
void dirfrag_rollback::encode(bufferlist &bl) const
{
  ENCODE_START(1, 1, bl);
  encode(fnode, bl);
  ENCODE_FINISH(bl);
}
2844
// Deserialize a dirfrag_rollback from bl; the only field is fnode.
void dirfrag_rollback::decode(bufferlist::const_iterator &bl)
{
  DECODE_START(1, bl);
  decode(fnode, bl);
  DECODE_FINISH(bl);
}
2851
2852
2853
2854 // =========================================================================
2855
2856 // -----------------------
2857 // EExport
2858
2859 void EExport::replay(MDSRank *mds)
2860 {
2861 dout(10) << "EExport.replay " << base << dendl;
2862 auto&& segment = get_segment();
2863 metablob.replay(mds, segment);
2864
2865 CDir *dir = mds->mdcache->get_dirfrag(base);
2866 ceph_assert(dir);
2867
2868 set<CDir*> realbounds;
2869 for (set<dirfrag_t>::iterator p = bounds.begin();
2870 p != bounds.end();
2871 ++p) {
2872 CDir *bd = mds->mdcache->get_dirfrag(*p);
2873 ceph_assert(bd);
2874 realbounds.insert(bd);
2875 }
2876
2877 // adjust auth away
2878 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF);
2879
2880 mds->mdcache->try_trim_non_auth_subtree(dir);
2881 }
2882
// Serialize this event into bl inside an ENCODE_START(4, 3)/ENCODE_FINISH
// envelope.  Field order: stamp, metablob, base, bounds, target.
// `features` is forwarded only to the metablob encoder.
void EExport::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(4, 3, bl);
  encode(stamp, bl);
  encode(metablob, bl, features);
  encode(base, bl);
  encode(bounds, bl);
  encode(target, bl);
  ENCODE_FINISH(bl);
}
2893
2894 void EExport::decode(bufferlist::const_iterator &bl)
2895 {
2896 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2897 if (struct_v >= 2)
2898 decode(stamp, bl);
2899 decode(metablob, bl);
2900 decode(base, bl);
2901 decode(bounds, bl);
2902 if (struct_v >= 4)
2903 decode(target, bl);
2904 DECODE_FINISH(bl);
2905 }
2906
2907 void EExport::dump(Formatter *f) const
2908 {
2909 f->dump_float("stamp", (double)stamp);
2910 /*f->open_object_section("Metablob");
2911 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2912 f->close_section();*/
2913 f->dump_stream("base dirfrag") << base;
2914 f->open_array_section("bounds dirfrags");
2915 for (set<dirfrag_t>::const_iterator i = bounds.begin();
2916 i != bounds.end(); ++i) {
2917 f->dump_stream("dirfrag") << *i;
2918 }
2919 f->close_section(); // bounds dirfrags
2920 }
2921
2922 void EExport::generate_test_instances(std::list<EExport*>& ls)
2923 {
2924 EExport *sample = new EExport();
2925 ls.push_back(sample);
2926 }
2927
2928
2929 // -----------------------
2930 // EImportStart
2931
2932 void EImportStart::update_segment()
2933 {
2934 get_segment()->sessionmapv = cmapv;
2935 }
2936
2937 void EImportStart::replay(MDSRank *mds)
2938 {
2939 dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
2940 //metablob.print(*_dout);
2941 auto&& segment = get_segment();
2942 metablob.replay(mds, segment);
2943
2944 // put in ambiguous import list
2945 mds->mdcache->add_ambiguous_import(base, bounds);
2946
2947 // set auth partially to us so we don't trim it
2948 CDir *dir = mds->mdcache->get_dirfrag(base);
2949 ceph_assert(dir);
2950
2951 set<CDir*> realbounds;
2952 for (vector<dirfrag_t>::iterator p = bounds.begin();
2953 p != bounds.end();
2954 ++p) {
2955 CDir *bd = mds->mdcache->get_dirfrag(*p);
2956 ceph_assert(bd);
2957 if (!bd->is_subtree_root())
2958 bd->state_clear(CDir::STATE_AUTH);
2959 realbounds.insert(bd);
2960 }
2961
2962 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds,
2963 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2964
2965 // open client sessions?
2966 if (mds->sessionmap.get_version() >= cmapv) {
2967 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
2968 << " >= " << cmapv << ", noop" << dendl;
2969 } else {
2970 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
2971 << " < " << cmapv << dendl;
2972 map<client_t,entity_inst_t> cm;
2973 map<client_t,client_metadata_t> cmm;
2974 auto blp = client_map.cbegin();
2975 using ceph::decode;
2976 decode(cm, blp);
2977 if (!blp.end())
2978 decode(cmm, blp);
2979 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
2980 }
2981 update_segment();
2982 }
2983
// Serialize this event into bl inside an ENCODE_START(4, 3)/ENCODE_FINISH
// envelope.  Field order: stamp, base, metablob, bounds, cmapv, client_map,
// from.  `features` is forwarded only to the metablob encoder.
void EImportStart::encode(bufferlist &bl, uint64_t features) const {
  ENCODE_START(4, 3, bl);
  encode(stamp, bl);
  encode(base, bl);
  encode(metablob, bl, features);
  encode(bounds, bl);
  encode(cmapv, bl);
  encode(client_map, bl);
  encode(from, bl);
  ENCODE_FINISH(bl);
}
2995
2996 void EImportStart::decode(bufferlist::const_iterator &bl) {
2997 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2998 if (struct_v >= 2)
2999 decode(stamp, bl);
3000 decode(base, bl);
3001 decode(metablob, bl);
3002 decode(bounds, bl);
3003 decode(cmapv, bl);
3004 decode(client_map, bl);
3005 if (struct_v >= 4)
3006 decode(from, bl);
3007 DECODE_FINISH(bl);
3008 }
3009
3010 void EImportStart::dump(Formatter *f) const
3011 {
3012 f->dump_stream("base dirfrag") << base;
3013 f->open_array_section("boundary dirfrags");
3014 for (vector<dirfrag_t>::const_iterator iter = bounds.begin();
3015 iter != bounds.end(); ++iter) {
3016 f->dump_stream("frag") << *iter;
3017 }
3018 f->close_section();
3019 }
3020
3021 void EImportStart::generate_test_instances(std::list<EImportStart*>& ls)
3022 {
3023 ls.push_back(new EImportStart);
3024 }
3025
3026 // -----------------------
3027 // EImportFinish
3028
3029 void EImportFinish::replay(MDSRank *mds)
3030 {
3031 if (mds->mdcache->have_ambiguous_import(base)) {
3032 dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
3033 if (success) {
3034 mds->mdcache->finish_ambiguous_import(base);
3035 } else {
3036 CDir *dir = mds->mdcache->get_dirfrag(base);
3037 ceph_assert(dir);
3038 vector<dirfrag_t> bounds;
3039 mds->mdcache->get_ambiguous_import_bounds(base, bounds);
3040 mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF);
3041 mds->mdcache->cancel_ambiguous_import(dir);
3042 mds->mdcache->try_trim_non_auth_subtree(dir);
3043 }
3044 } else {
3045 // this shouldn't happen unless this is an old journal
3046 dout(10) << "EImportFinish.replay " << base << " success=" << success
3047 << " on subtree not marked as ambiguous"
3048 << dendl;
3049 mds->clog->error() << "failure replaying journal (EImportFinish)";
3050 mds->damaged();
3051 ceph_abort(); // Should be unreachable because damaged() calls respawn()
3052 }
3053 }
3054
// Serialize this event into bl inside an ENCODE_START(3, 3)/ENCODE_FINISH
// envelope.  Field order: stamp, base, success.  `features` is not
// consulted.
void EImportFinish::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(3, 3, bl);
  encode(stamp, bl);
  encode(base, bl);
  encode(success, bl);
  ENCODE_FINISH(bl);
}
3063
3064 void EImportFinish::decode(bufferlist::const_iterator &bl)
3065 {
3066 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3067 if (struct_v >= 2)
3068 decode(stamp, bl);
3069 decode(base, bl);
3070 decode(success, bl);
3071 DECODE_FINISH(bl);
3072 }
3073
3074 void EImportFinish::dump(Formatter *f) const
3075 {
3076 f->dump_stream("base dirfrag") << base;
3077 f->dump_string("success", success ? "true" : "false");
3078 }
3079 void EImportFinish::generate_test_instances(std::list<EImportFinish*>& ls)
3080 {
3081 ls.push_back(new EImportFinish);
3082 ls.push_back(new EImportFinish);
3083 ls.back()->success = true;
3084 }
3085
3086
3087 // ------------------------
3088 // EResetJournal
3089
3090 void EResetJournal::encode(bufferlist& bl, uint64_t features) const
3091 {
3092 ENCODE_START(2, 2, bl);
3093 encode(stamp, bl);
3094 ENCODE_FINISH(bl);
3095 }
3096
3097 void EResetJournal::decode(bufferlist::const_iterator &bl)
3098 {
3099 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
3100 decode(stamp, bl);
3101 DECODE_FINISH(bl);
3102 }
3103
3104 void EResetJournal::dump(Formatter *f) const
3105 {
3106 f->dump_stream("timestamp") << stamp;
3107 }
3108
3109 void EResetJournal::generate_test_instances(std::list<EResetJournal*>& ls)
3110 {
3111 ls.push_back(new EResetJournal());
3112 }
3113
3114 void EResetJournal::replay(MDSRank *mds)
3115 {
3116 dout(1) << "EResetJournal" << dendl;
3117
3118 mds->sessionmap.wipe();
3119 mds->inotable->replay_reset();
3120
3121 if (mds->mdsmap->get_root() == mds->get_nodeid()) {
3122 CDir *rootdir = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t());
3123 mds->mdcache->adjust_subtree_auth(rootdir, mds->get_nodeid());
3124 }
3125
3126 CDir *mydir = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t());
3127 mds->mdcache->adjust_subtree_auth(mydir, mds->get_nodeid());
3128
3129 mds->mdcache->recalc_auth_bits(true);
3130
3131 mds->mdcache->show_subtrees();
3132 }
3133
3134
3135 void ENoOp::encode(bufferlist &bl, uint64_t features) const
3136 {
3137 ENCODE_START(2, 2, bl);
3138 encode(pad_size, bl);
3139 uint8_t const pad = 0xff;
3140 for (unsigned int i = 0; i < pad_size; ++i) {
3141 encode(pad, bl);
3142 }
3143 ENCODE_FINISH(bl);
3144 }
3145
3146
3147 void ENoOp::decode(bufferlist::const_iterator &bl)
3148 {
3149 DECODE_START(2, bl);
3150 decode(pad_size, bl);
3151 if (bl.get_remaining() != pad_size) {
3152 // This is spiritually an assertion, but expressing in a way that will let
3153 // journal debug tools catch it and recognise a malformed entry.
3154 throw buffer::end_of_buffer();
3155 } else {
3156 bl += pad_size;
3157 }
3158 DECODE_FINISH(bl);
3159 }
3160
3161
3162 void ENoOp::replay(MDSRank *mds)
3163 {
3164 dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
3165 }
3166
3167 /**
3168 * If re-formatting an old journal that used absolute log position
3169 * references as segment sequence numbers, use this function to update
3170 * it.
3171 *
3172 * @param mds
3173 * MDSRank instance, just used for logging
3174 * @param old_to_new
3175 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3176 *
3177 * @return
3178 * True if the event was modified.
3179 */
3180 bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
3181 std::map<LogSegment::seq_t, LogSegment::seq_t> const &old_to_new)
3182 {
3183 bool modified = false;
3184 map<inodeno_t, LogSegment::seq_t> new_trunc_finish;
3185 for (const auto& p : truncate_finish) {
3186 auto q = old_to_new.find(p.second);
3187 if (q != old_to_new.end()) {
3188 dout(20) << __func__ << " applying segment seq mapping "
3189 << p.second << " -> " << q->second << dendl;
3190 new_trunc_finish.emplace(p.first, q->second);
3191 modified = true;
3192 } else {
3193 dout(20) << __func__ << " no segment seq mapping found for "
3194 << p.second << dendl;
3195 new_trunc_finish.insert(p);
3196 }
3197 }
3198 truncate_finish.swap(new_trunc_finish);
3199
3200 return modified;
3201 }