]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/journal.cc
import 15.2.4
[ceph.git] / ceph / src / mds / journal.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
20
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
24
25 #include "events/EUpdate.h"
26 #include "events/ESlaveUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
29 #include "events/EPurged.h"
30
31 #include "events/EExport.h"
32 #include "events/EImportStart.h"
33 #include "events/EImportFinish.h"
34 #include "events/EFragment.h"
35
36 #include "events/ETableClient.h"
37 #include "events/ETableServer.h"
38
39 #include "include/stringify.h"
40
41 #include "LogSegment.h"
42
43 #include "MDSRank.h"
44 #include "MDLog.h"
45 #include "MDCache.h"
46 #include "Server.h"
47 #include "Migrator.h"
48 #include "Mutation.h"
49
50 #include "InoTable.h"
51 #include "MDSTableClient.h"
52 #include "MDSTableServer.h"
53
54 #include "Locker.h"
55
56 #define dout_context g_ceph_context
57 #define dout_subsys ceph_subsys_mds
58 #undef dout_prefix
59 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
60
61
62 // -----------------------
63 // LogSegment
64
65 void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
66 {
67 set<CDir*> commit;
68
69 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl;
70
71 ceph_assert(g_conf()->mds_kill_journal_expire_at != 1);
72
73 // commit dirs
74 for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) {
75 dout(20) << " new_dirfrag " << **p << dendl;
76 ceph_assert((*p)->is_auth());
77 commit.insert(*p);
78 }
79 for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) {
80 dout(20) << " dirty_dirfrag " << **p << dendl;
81 ceph_assert((*p)->is_auth());
82 commit.insert(*p);
83 }
84 for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) {
85 dout(20) << " dirty_dentry " << **p << dendl;
86 ceph_assert((*p)->is_auth());
87 commit.insert((*p)->get_dir());
88 }
89 for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) {
90 dout(20) << " dirty_inode " << **p << dendl;
91 ceph_assert((*p)->is_auth());
92 if ((*p)->is_base()) {
93 (*p)->store(gather_bld.new_sub());
94 } else
95 commit.insert((*p)->get_parent_dn()->get_dir());
96 }
97
98 if (!commit.empty()) {
99 for (set<CDir*>::iterator p = commit.begin();
100 p != commit.end();
101 ++p) {
102 CDir *dir = *p;
103 ceph_assert(dir->is_auth());
104 if (dir->can_auth_pin()) {
105 dout(15) << "try_to_expire committing " << *dir << dendl;
106 dir->commit(0, gather_bld.new_sub(), false, op_prio);
107 } else {
108 dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
109 dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
110 }
111 }
112 }
113
114 // master ops with possibly uncommitted slaves
115 for (set<metareqid_t>::iterator p = uncommitted_masters.begin();
116 p != uncommitted_masters.end();
117 ++p) {
118 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p << dendl;
119 mds->mdcache->wait_for_uncommitted_master(*p, gather_bld.new_sub());
120 }
121
122 // slave ops that haven't been committed
123 for (set<metareqid_t>::iterator p = uncommitted_slaves.begin();
124 p != uncommitted_slaves.end();
125 ++p) {
126 dout(10) << "try_to_expire waiting for master to ack OP_FINISH on " << *p << dendl;
127 mds->mdcache->wait_for_uncommitted_slave(*p, gather_bld.new_sub());
128 }
129
130 // uncommitted fragments
131 for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin();
132 p != uncommitted_fragments.end();
133 ++p) {
134 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl;
135 mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub());
136 }
137
138 // nudge scatterlocks
139 for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
140 CInode *in = *p;
141 dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
142 mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub());
143 }
144 for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) {
145 CInode *in = *p;
146 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl;
147 mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub());
148 }
149 for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) {
150 CInode *in = *p;
151 dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl;
152 mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub());
153 }
154
155 ceph_assert(g_conf()->mds_kill_journal_expire_at != 2);
156
157 // open files and snap inodes
158 if (!open_files.empty()) {
159 ceph_assert(!mds->mdlog->is_capped()); // hmm FIXME
160 EOpen *le = 0;
161 LogSegment *ls = mds->mdlog->get_current_segment();
162 ceph_assert(ls != this);
163 elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file));
164 while (!p.end()) {
165 CInode *in = *p;
166 ++p;
167 if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) {
168 // journal snap inodes that need flush. This simplify the mds failover hanlding
169 dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
170 if (!le) {
171 le = new EOpen(mds->mdlog);
172 mds->mdlog->start_entry(le);
173 }
174 le->add_clean_inode(in);
175 ls->open_files.push_back(&in->item_open_file);
176 } else {
177 // open files are tracked by open file table, no need to journal them again
178 in->item_open_file.remove_myself();
179 }
180 }
181 if (le) {
182 mds->mdlog->submit_entry(le);
183 mds->mdlog->wait_for_safe(gather_bld.new_sub());
184 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
185 }
186 }
187
188 ceph_assert(g_conf()->mds_kill_journal_expire_at != 3);
189
190 // backtraces to be stored/updated
191 for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
192 CInode *in = *p;
193 ceph_assert(in->is_auth());
194 if (in->can_auth_pin()) {
195 dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
196 in->store_backtrace(gather_bld.new_sub(), op_prio);
197 } else {
198 dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
199 in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
200 }
201 }
202
203 ceph_assert(g_conf()->mds_kill_journal_expire_at != 4);
204
205 // idalloc
206 if (inotablev > mds->inotable->get_committed_version()) {
207 dout(10) << "try_to_expire saving inotable table, need " << inotablev
208 << ", committed is " << mds->inotable->get_committed_version()
209 << " (" << mds->inotable->get_committing_version() << ")"
210 << dendl;
211 mds->inotable->save(gather_bld.new_sub(), inotablev);
212 }
213
214 // sessionmap
215 if (sessionmapv > mds->sessionmap.get_committed()) {
216 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
217 << ", committed is " << mds->sessionmap.get_committed()
218 << " (" << mds->sessionmap.get_committing() << ")"
219 << dendl;
220 mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
221 }
222
223 // updates to sessions for completed_requests
224 mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
225 touched_sessions.clear();
226
227 // pending commit atids
228 for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
229 p != pending_commit_tids.end();
230 ++p) {
231 MDSTableClient *client = mds->get_table_client(p->first);
232 ceph_assert(client);
233 for (ceph::unordered_set<version_t>::iterator q = p->second.begin();
234 q != p->second.end();
235 ++q) {
236 dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q
237 << " pending commit (not yet acked), waiting" << dendl;
238 ceph_assert(!client->has_committed(*q));
239 client->wait_for_ack(*q, gather_bld.new_sub());
240 }
241 }
242
243 // table servers
244 for (map<int, version_t>::iterator p = tablev.begin();
245 p != tablev.end();
246 ++p) {
247 MDSTableServer *server = mds->get_table_server(p->first);
248 ceph_assert(server);
249 if (p->second > server->get_committed_version()) {
250 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first)
251 << " to save, need " << p->second << dendl;
252 server->save(gather_bld.new_sub());
253 }
254 }
255
256 // truncating
257 for (set<CInode*>::iterator p = truncating_inodes.begin();
258 p != truncating_inodes.end();
259 ++p) {
260 dout(10) << "try_to_expire waiting for truncate of " << **p << dendl;
261 (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub());
262 }
263 // purge inodes
264 dout(10) << "try_to_expire waiting for purge of " << purge_inodes << dendl;
265 if (purge_inodes.size())
266 set_purged_cb(gather_bld.new_sub());
267
268 if (gather_bld.has_subs()) {
269 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl;
270 mds->mdlog->flush();
271 } else {
272 ceph_assert(g_conf()->mds_kill_journal_expire_at != 5);
273 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl;
274 }
275 }
276
277 // -----------------------
278 // EMetaBlob
279
280 void EMetaBlob::add_dir_context(CDir *dir, int mode)
281 {
282 MDSRank *mds = dir->cache->mds;
283
284 list<CDentry*> parents;
285
286 // it may be okay not to include the maybe items, if
287 // - we journaled the maybe child inode in this segment
288 // - that subtree turns out to be unambiguously auth
289 list<CDentry*> maybe;
290 bool maybenot = false;
291
292 while (true) {
293 // already have this dir? (we must always add in order)
294 if (lump_map.count(dir->dirfrag())) {
295 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl;
296 break;
297 }
298
299 // stop at root/stray
300 CInode *diri = dir->get_inode();
301 CDentry *parent = diri->get_projected_parent_dn();
302
303 if (mode == TO_AUTH_SUBTREE_ROOT) {
304 // subtree root?
305 if (dir->is_subtree_root()) {
306 // match logic in MDCache::create_subtree_map()
307 if (dir->get_dir_auth().first == mds->get_nodeid()) {
308 mds_authority_t parent_auth = parent ? parent->authority() : CDIR_AUTH_UNDEF;
309 if (parent_auth.first == dir->get_dir_auth().first) {
310 if (parent_auth.second == CDIR_AUTH_UNKNOWN &&
311 !dir->is_ambiguous_dir_auth() &&
312 !dir->state_test(CDir::STATE_EXPORTBOUND) &&
313 !dir->state_test(CDir::STATE_AUXSUBTREE) &&
314 !diri->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
315 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir << dendl;
316 ceph_abort();
317 }
318 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") ambiguous or transient subtree " << dendl;
319 } else {
320 // it's an auth subtree, we don't need maybe (if any), and we're done.
321 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
322 << " at " << *dir << dendl;
323 maybe.clear();
324 break;
325 }
326 } else {
327 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
328 << " at " << *dir << dendl;
329 // we need the maybe list after all!
330 parents.splice(parents.begin(), maybe);
331 maybenot = false;
332 }
333 }
334
335 // was the inode journaled in this blob?
336 if (event_seq && diri->last_journaled == event_seq) {
337 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl;
338 break;
339 }
340
341 // have we journaled this inode since the last subtree map?
342 if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) {
343 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment ("
344 << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag "
345 << *diri << dendl;
346 maybenot = true;
347 }
348 }
349
350 if (!parent)
351 break;
352
353 if (maybenot) {
354 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl;
355 maybe.push_front(parent);
356 } else {
357 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl;
358 parents.push_front(parent);
359 }
360
361 dir = parent->get_dir();
362 }
363
364 parents.splice(parents.begin(), maybe);
365
366 dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
367 for (const auto& dentry : parents) {
368 ceph_assert(dentry->get_projected_linkage()->is_primary());
369 add_dentry(dentry, false);
370 }
371 }
372
373 void EMetaBlob::update_segment(LogSegment *ls)
374 {
375 // dirty inode mtimes
376 // -> handled directly by Server.cc, replay()
377
378 // alloc table update?
379 if (inotablev)
380 ls->inotablev = inotablev;
381 if (sessionmapv)
382 ls->sessionmapv = sessionmapv;
383
384 // truncated inodes
385 // -> handled directly by Server.cc
386
387 // client requests
388 // note the newest request per client
389 //if (!client_reqs.empty())
390 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
391 }
392
393 // EMetaBlob::fullbit
394
395 void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const {
396 ENCODE_START(8, 5, bl);
397 encode(dn, bl);
398 encode(dnfirst, bl);
399 encode(dnlast, bl);
400 encode(dnv, bl);
401 encode(inode, bl, features);
402 encode(xattrs, bl);
403 if (inode.is_symlink())
404 encode(symlink, bl);
405 if (inode.is_dir()) {
406 encode(dirfragtree, bl);
407 encode(snapbl, bl);
408 }
409 encode(state, bl);
410 if (old_inodes.empty()) {
411 encode(false, bl);
412 } else {
413 encode(true, bl);
414 encode(old_inodes, bl, features);
415 }
416 if (!inode.is_dir())
417 encode(snapbl, bl);
418 encode(oldest_snap, bl);
419 ENCODE_FINISH(bl);
420 }
421
422 void EMetaBlob::fullbit::decode(bufferlist::const_iterator &bl) {
423 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
424 decode(dn, bl);
425 decode(dnfirst, bl);
426 decode(dnlast, bl);
427 decode(dnv, bl);
428 decode(inode, bl);
429 decode_noshare(xattrs, bl);
430 if (inode.is_symlink())
431 decode(symlink, bl);
432 if (inode.is_dir()) {
433 decode(dirfragtree, bl);
434 decode(snapbl, bl);
435 if ((struct_v == 2) || (struct_v == 3)) {
436 bool dir_layout_exists;
437 decode(dir_layout_exists, bl);
438 if (dir_layout_exists) {
439 __u8 dir_struct_v;
440 decode(dir_struct_v, bl); // default_file_layout version
441 decode(inode.layout, bl); // and actual layout, that we care about
442 }
443 }
444 }
445 if (struct_v >= 6) {
446 decode(state, bl);
447 } else {
448 bool dirty;
449 decode(dirty, bl);
450 state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0;
451 }
452
453 if (struct_v >= 3) {
454 bool old_inodes_present;
455 decode(old_inodes_present, bl);
456 if (old_inodes_present) {
457 decode(old_inodes, bl);
458 }
459 }
460 if (!inode.is_dir()) {
461 if (struct_v >= 7)
462 decode(snapbl, bl);
463 }
464 if (struct_v >= 8)
465 decode(oldest_snap, bl);
466 else
467 oldest_snap = CEPH_NOSNAP;
468
469 DECODE_FINISH(bl);
470 }
471
472 void EMetaBlob::fullbit::dump(Formatter *f) const
473 {
474 f->dump_string("dentry", dn);
475 f->dump_stream("snapid.first") << dnfirst;
476 f->dump_stream("snapid.last") << dnlast;
477 f->dump_int("dentry version", dnv);
478 f->open_object_section("inode");
479 inode.dump(f);
480 f->close_section(); // inode
481 f->open_object_section("xattrs");
482 for (const auto &p : xattrs) {
483 std::string s(p.second.c_str(), p.second.length());
484 f->dump_string(p.first.c_str(), s);
485 }
486 f->close_section(); // xattrs
487 if (inode.is_symlink()) {
488 f->dump_string("symlink", symlink);
489 }
490 if (inode.is_dir()) {
491 f->dump_stream("frag tree") << dirfragtree;
492 f->dump_string("has_snapbl", snapbl.length() ? "true" : "false");
493 if (inode.has_layout()) {
494 f->open_object_section("file layout policy");
495 // FIXME
496 f->dump_string("layout", "the layout exists");
497 f->close_section(); // file layout policy
498 }
499 }
500 f->dump_string("state", state_string());
501 if (!old_inodes.empty()) {
502 f->open_array_section("old inodes");
503 for (const auto &p : old_inodes) {
504 f->open_object_section("inode");
505 f->dump_int("snapid", p.first);
506 p.second.dump(f);
507 f->close_section(); // inode
508 }
509 f->close_section(); // old inodes
510 }
511 }
512
513 void EMetaBlob::fullbit::generate_test_instances(std::list<EMetaBlob::fullbit*>& ls)
514 {
515 CInode::mempool_inode inode;
516 fragtree_t fragtree;
517 CInode::mempool_xattr_map empty_xattrs;
518 bufferlist empty_snapbl;
519 fullbit *sample = new fullbit("/testdn", 0, 0, 0,
520 inode, fragtree, empty_xattrs, "", 0, empty_snapbl,
521 false, NULL);
522 ls.push_back(sample);
523 }
524
525 void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
526 {
527 in->inode = inode;
528 in->xattrs = xattrs;
529 in->maybe_export_pin();
530 if (in->inode.is_dir()) {
531 if (!(in->dirfragtree == dirfragtree)) {
532 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
533 << dirfragtree << " on " << *in << dendl;
534 in->dirfragtree = dirfragtree;
535 in->force_dirfrags();
536 if (in->get_num_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) {
537 auto&& ls = in->get_nested_dirfrags();
538 for (const auto& dir : ls) {
539 if (dir->get_num_any() == 0 &&
540 mds->mdcache->can_trim_non_auth_dirfrag(dir)) {
541 dout(10) << " closing empty non-auth dirfrag " << *dir << dendl;
542 in->close_dirfrag(dir->get_frag());
543 }
544 }
545 }
546 }
547 } else if (in->inode.is_symlink()) {
548 in->symlink = symlink;
549 }
550 in->old_inodes = old_inodes;
551 if (!in->old_inodes.empty()) {
552 snapid_t min_first = in->old_inodes.rbegin()->first + 1;
553 if (min_first > in->first)
554 in->first = min_first;
555 }
556
557 /*
558 * we can do this before linking hte inode bc the split_at would
559 * be a no-op.. we have no children (namely open snaprealms) to
560 * divy up
561 */
562 in->oldest_snap = oldest_snap;
563 in->decode_snap_blob(snapbl);
564
565 /*
566 * In case there was anything malformed in the journal that we are
567 * replaying, do sanity checks on the inodes we're replaying and
568 * go damaged instead of letting any trash into a live cache
569 */
570 if (in->is_file()) {
571 // Files must have valid layouts with a pool set
572 if (in->inode.layout.pool_id == -1 || !in->inode.layout.is_valid()) {
573 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
574 << ": " << in->inode.layout << dendl;
575 std::ostringstream oss;
576 oss << "Invalid layout for inode " << in->ino() << " in journal";
577 mds->clog->error() << oss.str();
578 mds->damaged();
579 ceph_abort(); // Should be unreachable because damaged() calls respawn()
580 }
581 }
582 }
583
584 // EMetaBlob::remotebit
585
586 void EMetaBlob::remotebit::encode(bufferlist& bl) const
587 {
588 ENCODE_START(2, 2, bl);
589 encode(dn, bl);
590 encode(dnfirst, bl);
591 encode(dnlast, bl);
592 encode(dnv, bl);
593 encode(ino, bl);
594 encode(d_type, bl);
595 encode(dirty, bl);
596 ENCODE_FINISH(bl);
597 }
598
599 void EMetaBlob::remotebit::decode(bufferlist::const_iterator &bl)
600 {
601 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
602 decode(dn, bl);
603 decode(dnfirst, bl);
604 decode(dnlast, bl);
605 decode(dnv, bl);
606 decode(ino, bl);
607 decode(d_type, bl);
608 decode(dirty, bl);
609 DECODE_FINISH(bl);
610 }
611
612 void EMetaBlob::remotebit::dump(Formatter *f) const
613 {
614 f->dump_string("dentry", dn);
615 f->dump_int("snapid.first", dnfirst);
616 f->dump_int("snapid.last", dnlast);
617 f->dump_int("dentry version", dnv);
618 f->dump_int("inodeno", ino);
619 uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries
620 string type_string;
621 switch(type) {
622 case S_IFREG:
623 type_string = "file"; break;
624 case S_IFLNK:
625 type_string = "symlink"; break;
626 case S_IFDIR:
627 type_string = "directory"; break;
628 case S_IFIFO:
629 type_string = "fifo"; break;
630 case S_IFCHR:
631 type_string = "chr"; break;
632 case S_IFBLK:
633 type_string = "blk"; break;
634 case S_IFSOCK:
635 type_string = "sock"; break;
636 default:
637 assert (0 == "unknown d_type!");
638 }
639 f->dump_string("d_type", type_string);
640 f->dump_string("dirty", dirty ? "true" : "false");
641 }
642
643 void EMetaBlob::remotebit::
644 generate_test_instances(std::list<EMetaBlob::remotebit*>& ls)
645 {
646 remotebit *remote = new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG), false);
647 ls.push_back(remote);
648 }
649
650 // EMetaBlob::nullbit
651
652 void EMetaBlob::nullbit::encode(bufferlist& bl) const
653 {
654 ENCODE_START(2, 2, bl);
655 encode(dn, bl);
656 encode(dnfirst, bl);
657 encode(dnlast, bl);
658 encode(dnv, bl);
659 encode(dirty, bl);
660 ENCODE_FINISH(bl);
661 }
662
663 void EMetaBlob::nullbit::decode(bufferlist::const_iterator &bl)
664 {
665 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
666 decode(dn, bl);
667 decode(dnfirst, bl);
668 decode(dnlast, bl);
669 decode(dnv, bl);
670 decode(dirty, bl);
671 DECODE_FINISH(bl);
672 }
673
674 void EMetaBlob::nullbit::dump(Formatter *f) const
675 {
676 f->dump_string("dentry", dn);
677 f->dump_int("snapid.first", dnfirst);
678 f->dump_int("snapid.last", dnlast);
679 f->dump_int("dentry version", dnv);
680 f->dump_string("dirty", dirty ? "true" : "false");
681 }
682
683 void EMetaBlob::nullbit::generate_test_instances(std::list<nullbit*>& ls)
684 {
685 nullbit *sample = new nullbit("/test/dentry", 0, 10, 15, false);
686 nullbit *sample2 = new nullbit("/test/dirty", 10, 20, 25, true);
687 ls.push_back(sample);
688 ls.push_back(sample2);
689 }
690
691 // EMetaBlob::dirlump
692
693 void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const
694 {
695 ENCODE_START(2, 2, bl);
696 encode(fnode, bl);
697 encode(state, bl);
698 encode(nfull, bl);
699 encode(nremote, bl);
700 encode(nnull, bl);
701 _encode_bits(features);
702 encode(dnbl, bl);
703 ENCODE_FINISH(bl);
704 }
705
706 void EMetaBlob::dirlump::decode(bufferlist::const_iterator &bl)
707 {
708 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
709 decode(fnode, bl);
710 decode(state, bl);
711 decode(nfull, bl);
712 decode(nremote, bl);
713 decode(nnull, bl);
714 decode(dnbl, bl);
715 dn_decoded = false; // don't decode bits unless we need them.
716 DECODE_FINISH(bl);
717 }
718
719 void EMetaBlob::dirlump::dump(Formatter *f) const
720 {
721 if (!dn_decoded) {
722 dirlump *me = const_cast<dirlump*>(this);
723 me->_decode_bits();
724 }
725 f->open_object_section("fnode");
726 fnode.dump(f);
727 f->close_section(); // fnode
728 f->dump_string("state", state_string());
729 f->dump_int("nfull", nfull);
730 f->dump_int("nremote", nremote);
731 f->dump_int("nnull", nnull);
732
733 f->open_array_section("full bits");
734 for (const auto& iter : dfull) {
735 f->open_object_section("fullbit");
736 iter.dump(f);
737 f->close_section(); // fullbit
738 }
739 f->close_section(); // full bits
740 f->open_array_section("remote bits");
741 for (const auto& iter : dremote) {
742 f->open_object_section("remotebit");
743 iter.dump(f);
744 f->close_section(); // remotebit
745 }
746 f->close_section(); // remote bits
747 f->open_array_section("null bits");
748 for (const auto& iter : dnull) {
749 f->open_object_section("null bit");
750 iter.dump(f);
751 f->close_section(); // null bit
752 }
753 f->close_section(); // null bits
754 }
755
756 void EMetaBlob::dirlump::generate_test_instances(std::list<dirlump*>& ls)
757 {
758 ls.push_back(new dirlump());
759 }
760
761 /**
762 * EMetaBlob proper
763 */
764 void EMetaBlob::encode(bufferlist& bl, uint64_t features) const
765 {
766 ENCODE_START(8, 5, bl);
767 encode(lump_order, bl);
768 encode(lump_map, bl, features);
769 encode(roots, bl, features);
770 encode(table_tids, bl);
771 encode(opened_ino, bl);
772 encode(allocated_ino, bl);
773 encode(used_preallocated_ino, bl);
774 encode(preallocated_inos, bl);
775 encode(client_name, bl);
776 encode(inotablev, bl);
777 encode(sessionmapv, bl);
778 encode(truncate_start, bl);
779 encode(truncate_finish, bl);
780 encode(destroyed_inodes, bl);
781 encode(client_reqs, bl);
782 encode(renamed_dirino, bl);
783 encode(renamed_dir_frags, bl);
784 {
785 // make MDSRank use v6 format happy
786 int64_t i = -1;
787 bool b = false;
788 encode(i, bl);
789 encode(b, bl);
790 }
791 encode(client_flushes, bl);
792 ENCODE_FINISH(bl);
793 }
794 void EMetaBlob::decode(bufferlist::const_iterator &bl)
795 {
796 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
797 decode(lump_order, bl);
798 decode(lump_map, bl);
799 if (struct_v >= 4) {
800 decode(roots, bl);
801 } else {
802 bufferlist rootbl;
803 decode(rootbl, bl);
804 if (rootbl.length()) {
805 auto p = rootbl.cbegin();
806 roots.emplace_back(p);
807 }
808 }
809 decode(table_tids, bl);
810 decode(opened_ino, bl);
811 decode(allocated_ino, bl);
812 decode(used_preallocated_ino, bl);
813 decode(preallocated_inos, bl);
814 decode(client_name, bl);
815 decode(inotablev, bl);
816 decode(sessionmapv, bl);
817 decode(truncate_start, bl);
818 decode(truncate_finish, bl);
819 decode(destroyed_inodes, bl);
820 if (struct_v >= 2) {
821 decode(client_reqs, bl);
822 } else {
823 list<metareqid_t> r;
824 decode(r, bl);
825 while (!r.empty()) {
826 client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0));
827 r.pop_front();
828 }
829 }
830 if (struct_v >= 3) {
831 decode(renamed_dirino, bl);
832 decode(renamed_dir_frags, bl);
833 }
834 if (struct_v >= 6) {
835 // ignore
836 int64_t i;
837 bool b;
838 decode(i, bl);
839 decode(b, bl);
840 }
841 if (struct_v >= 8) {
842 decode(client_flushes, bl);
843 }
844 DECODE_FINISH(bl);
845 }
846
847
848 /**
849 * Get all inodes touched by this metablob. Includes the 'bits' within
850 * dirlumps, and the inodes of the dirs themselves.
851 */
852 void EMetaBlob::get_inodes(
853 std::set<inodeno_t> &inodes) const
854 {
855 // For all dirlumps in this metablob
856 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
857 // Record inode of dirlump
858 inodeno_t const dir_ino = i->first.ino;
859 inodes.insert(dir_ino);
860
861 // Decode dirlump bits
862 dirlump const &dl = i->second;
863 dl._decode_bits();
864
865 // Record inodes of fullbits
866 for (const auto& iter : dl.get_dfull()) {
867 inodes.insert(iter.inode.ino);
868 }
869
870 // Record inodes of remotebits
871 for (const auto& iter : dl.get_dremote()) {
872 inodes.insert(iter.ino);
873 }
874 }
875 }
876
877
878 /**
879 * Get a map of dirfrag to set of dentries in that dirfrag which are
880 * touched in this operation.
881 */
882 void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const
883 {
884 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
885 dirlump const &dl = i->second;
886 dirfrag_t const &df = i->first;
887
888 // Get all bits
889 dl._decode_bits();
890
891 // For all bits, store dentry
892 for (const auto& iter : dl.get_dfull()) {
893 dentries[df].insert(iter.dn);
894 }
895 for (const auto& iter : dl.get_dremote()) {
896 dentries[df].insert(iter.dn);
897 }
898 for (const auto& iter : dl.get_dnull()) {
899 dentries[df].insert(iter.dn);
900 }
901 }
902 }
903
904
905
906 /**
907 * Calculate all paths that we can infer are touched by this metablob. Only uses
908 * information local to this metablob so it may only be the path within the
909 * subtree.
910 */
911 void EMetaBlob::get_paths(
912 std::vector<std::string> &paths) const
913 {
914 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
915 typedef std::pair<inodeno_t, std::string> Location;
916
917 // Whenever we see a dentry within a dirlump, we remember it as a child of
918 // the dirlump's inode
919 std::map<inodeno_t, std::vector<std::string> > children;
920
921 // Whenever we see a location for an inode, remember it: this allows us to
922 // build a path given an inode
923 std::map<inodeno_t, Location> ino_locations;
924
925 // Special case: operations on root inode populate roots but not dirlumps
926 if (lump_map.empty() && !roots.empty()) {
927 paths.push_back("/");
928 return;
929 }
930
931 // First pass
932 // ==========
933 // Build a tiny local metadata cache for the path structure in this metablob
934 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
935 inodeno_t const dir_ino = i->first.ino;
936 dirlump const &dl = i->second;
937 dl._decode_bits();
938
939 for (const auto& iter : dl.get_dfull()) {
940 std::string_view dentry = iter.dn;
941 children[dir_ino].emplace_back(dentry);
942 ino_locations[iter.inode.ino] = Location(dir_ino, dentry);
943 }
944
945 for (const auto& iter : dl.get_dremote()) {
946 std::string_view dentry = iter.dn;
947 children[dir_ino].emplace_back(dentry);
948 }
949
950 for (const auto& iter : dl.get_dnull()) {
951 std::string_view dentry = iter.dn;
952 children[dir_ino].emplace_back(dentry);
953 }
954 }
955
956 std::vector<Location> leaf_locations;
957
958 // Second pass
959 // ===========
960 // Output paths for all childless nodes in the metablob
961 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
962 inodeno_t const dir_ino = i->first.ino;
963 dirlump const &dl = i->second;
964 dl._decode_bits();
965
966 for (const auto& iter : dl.get_dfull()) {
967 std::string_view dentry = iter.dn;
968 if (children.find(iter.inode.ino) == children.end()) {
969 leaf_locations.push_back(Location(dir_ino, dentry));
970 }
971 }
972
973 for (const auto& iter : dl.get_dremote()) {
974 std::string_view dentry = iter.dn;
975 leaf_locations.push_back(Location(dir_ino, dentry));
976 }
977
978 for (const auto& iter : dl.get_dnull()) {
979 std::string_view dentry = iter.dn;
980 leaf_locations.push_back(Location(dir_ino, dentry));
981 }
982 }
983
984 // For all the leaf locations identified, generate paths
985 for (std::vector<Location>::iterator i = leaf_locations.begin(); i != leaf_locations.end(); ++i) {
986 Location const &loc = *i;
987 std::string path = loc.second;
988 inodeno_t ino = loc.first;
989 std::map<inodeno_t, Location>::iterator iter = ino_locations.find(ino);
990 while(iter != ino_locations.end()) {
991 Location const &loc = iter->second;
992 if (!path.empty()) {
993 path = loc.second + "/" + path;
994 } else {
995 path = loc.second + path;
996 }
997 iter = ino_locations.find(loc.first);
998 }
999
1000 paths.push_back(path);
1001 }
1002 }
1003
1004
1005 void EMetaBlob::dump(Formatter *f) const
1006 {
1007 f->open_array_section("lumps");
1008 for (const auto& d : lump_order) {
1009 f->open_object_section("lump");
1010 f->open_object_section("dirfrag");
1011 f->dump_stream("dirfrag") << d;
1012 f->close_section(); // dirfrag
1013 f->open_object_section("dirlump");
1014 lump_map.at(d).dump(f);
1015 f->close_section(); // dirlump
1016 f->close_section(); // lump
1017 }
1018 f->close_section(); // lumps
1019
1020 f->open_array_section("roots");
1021 for (const auto& iter : roots) {
1022 f->open_object_section("root");
1023 iter.dump(f);
1024 f->close_section(); // root
1025 }
1026 f->close_section(); // roots
1027
1028 f->open_array_section("tableclient tranactions");
1029 for (const auto& p : table_tids) {
1030 f->open_object_section("transaction");
1031 f->dump_int("tid", p.first);
1032 f->dump_int("version", p.second);
1033 f->close_section(); // transaction
1034 }
1035 f->close_section(); // tableclient transactions
1036
1037 f->dump_int("renamed directory inodeno", renamed_dirino);
1038
1039 f->open_array_section("renamed directory fragments");
1040 for (const auto& p : renamed_dir_frags) {
1041 f->dump_int("frag", p);
1042 }
1043 f->close_section(); // renamed directory fragments
1044
1045 f->dump_int("inotable version", inotablev);
1046 f->dump_int("SessionMap version", sessionmapv);
1047 f->dump_int("allocated ino", allocated_ino);
1048
1049 f->dump_stream("preallocated inos") << preallocated_inos;
1050 f->dump_int("used preallocated ino", used_preallocated_ino);
1051
1052 f->open_object_section("client name");
1053 client_name.dump(f);
1054 f->close_section(); // client name
1055
1056 f->open_array_section("inodes starting a truncate");
1057 for(const auto& ino : truncate_start) {
1058 f->dump_int("inodeno", ino);
1059 }
1060 f->close_section(); // truncate inodes
1061 f->open_array_section("inodes finishing a truncated");
1062 for(const auto& p : truncate_finish) {
1063 f->open_object_section("inode+segment");
1064 f->dump_int("inodeno", p.first);
1065 f->dump_int("truncate starting segment", p.second);
1066 f->close_section(); // truncated inode
1067 }
1068 f->close_section(); // truncate finish inodes
1069
1070 f->open_array_section("destroyed inodes");
1071 for(vector<inodeno_t>::const_iterator i = destroyed_inodes.begin();
1072 i != destroyed_inodes.end(); ++i) {
1073 f->dump_int("inodeno", *i);
1074 }
1075 f->close_section(); // destroyed inodes
1076
1077 f->open_array_section("client requests");
1078 for(const auto& p : client_reqs) {
1079 f->open_object_section("Client request");
1080 f->dump_stream("request ID") << p.first;
1081 f->dump_int("oldest request on client", p.second);
1082 f->close_section(); // request
1083 }
1084 f->close_section(); // client requests
1085 }
1086
1087 void EMetaBlob::generate_test_instances(std::list<EMetaBlob*>& ls)
1088 {
1089 ls.push_back(new EMetaBlob());
1090 }
1091
// Apply this metablob to the MDS in-memory state during journal replay.
//
//   mds     - rank whose mdcache, inotable, sessionmap, mdlog and table
//             clients are updated.
//   logseg  - log segment that replayed dirty items are attached to; must
//             be non-null (asserted).
//   slaveup - may be null.  When set, unlinked inodes and old directories
//             are recorded on it instead of being trimmed/removed here.
//
// Order of work: root inodes, dirlumps (full / remote / null dentries),
// rename subtree fix-ups, disposal of inodes left unlinked, table client
// transactions, opened ino, inotable and sessionmap versions, truncates,
// destroyed inodes, completed client requests and flushes, and finally
// update_segment(logseg).
//
// The mds_kill_journal_replay_at asserts (values 1..4) fire at fixed points
// in this sequence when that config option holds the matching value.
1092 void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
1093 {
1094 dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
1095
1096 ceph_assert(logseg);
1097
1098 ceph_assert(g_conf()->mds_kill_journal_replay_at != 1);
1099
// root inodes: create any that are not in cache yet, update all from the blob
1100 for (auto& p : roots) {
1101 CInode *in = mds->mdcache->get_inode(p.inode.ino);
1102 bool isnew = in ? false:true;
1103 if (!in)
1104 in = new CInode(mds->mdcache, false, 2, CEPH_NOSNAP);
1105 p.update_inode(mds, in);
1106
1107 if (isnew)
1108 mds->mdcache->add_inode(in);
1109 if (p.is_dirty()) in->_mark_dirty(logseg);
1110 dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
1111 }
1112
// directory rename: look up the renamed inode (may be absent from cache) and
// check that the lumps carry at most one null dentry in total
1113 CInode *renamed_diri = 0;
1114 CDir *olddir = 0;
1115 if (renamed_dirino) {
1116 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1117 if (renamed_diri)
1118 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl;
1119 else
1120 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl;
1121
1122 int nnull = 0;
1123 for (const auto& lp : lump_order) {
1124 dirlump &lump = lump_map[lp];
1125 if (lump.nnull) {
1126 dout(10) << "EMetaBlob.replay found null dentry in dir " << lp << dendl;
1127 nnull += lump.nnull;
1128 }
1129 }
1130 ceph_assert(nnull <= 1);
1131 }
1132
1133 // keep track of any inodes we unlink and don't relink elsewhere
// unlinked maps each unlinked inode to the dir it was unlinked from;
// linked holds inodes from unlinked that were linked again later in this blob
1134 map<CInode*, CDir*> unlinked;
1135 set<CInode*> linked;
1136
1137 // walk through my dirs (in order!)
1138 for (const auto& lp : lump_order) {
1139 dout(10) << "EMetaBlob.replay dir " << lp << dendl;
1140 dirlump &lump = lump_map[lp];
1141
1142 // the dir
1143 CDir *dir = mds->mdcache->get_force_dirfrag(lp, true);
1144 if (!dir) {
1145 // hmm. do i have the inode?
1146 CInode *diri = mds->mdcache->get_inode((lp).ino);
1147 if (!diri) {
// only another rank's mdsdir inode may be created here (non-auth);
// any other missing directory inode marks the rank damaged
1148 if (MDS_INO_IS_MDSDIR(lp.ino)) {
1149 ceph_assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp.ino);
1150 diri = mds->mdcache->create_system_inode(lp.ino, S_IFDIR|0755);
1151 diri->state_clear(CInode::STATE_AUTH);
1152 dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
1153 } else {
1154 dout(0) << "EMetaBlob.replay missing dir ino " << lp.ino << dendl;
1155 mds->clog->error() << "failure replaying journal (EMetaBlob)";
1156 mds->damaged();
1157 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1158 }
1159 }
1160
1161 // create the dirfrag
1162 dir = diri->get_or_open_dirfrag(mds->mdcache, lp.frag);
1163
1164 if (MDS_INO_IS_BASE(lp.ino))
1165 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
1166
1167 dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;
1168 }
// overwrite the dirfrag's version and fnode with the journaled copy
1169 dir->set_version( lump.fnode.version );
1170 dir->fnode = lump.fnode;
1171
1172 if (lump.is_importing()) {
1173 dir->state_set(CDir::STATE_AUTH);
1174 dir->state_clear(CDir::STATE_COMPLETE);
1175 }
1176 if (lump.is_dirty()) {
1177 dir->_mark_dirty(logseg);
1178
// rstat / fragstat differing from their accounted copies means the
// corresponding scatterlock (nestlock / filelock) still has updates to push
1179 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
1180 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir << dendl;
1181 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
1182 logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
1183 } else {
1184 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir << dendl;
1185 }
1186 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
1187 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir << dendl;
1188 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
1189 logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
1190 } else {
1191 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir << dendl;
1192 }
1193 }
1194 if (lump.is_dirty_dft()) {
1195 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir << dendl;
1196 dir->state_set(CDir::STATE_DIRTYDFT);
1197 mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock);
1198 logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree);
1199 }
1200 if (lump.is_new())
1201 dir->mark_new(logseg);
1202 if (lump.is_complete())
1203 dir->mark_complete();
1204
1205 dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;
1206
1207 // decode bits
1208 lump._decode_bits();
1209
1210 // full dentry+inode pairs
1211 for (auto& fb : lump._get_dfull()) {
1212 CDentry *dn = dir->lookup_exact_snap(fb.dn, fb.dnlast);
1213 if (!dn) {
1214 dn = dir->add_null_dentry(fb.dn, fb.dnfirst, fb.dnlast);
1215 dn->set_version(fb.dnv);
1216 if (fb.is_dirty()) dn->_mark_dirty(logseg);
1217 dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl;
1218 } else {
1219 dn->set_version(fb.dnv);
1220 if (fb.is_dirty()) dn->_mark_dirty(logseg);
1221 dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *dn << dendl;
1222 dn->first = fb.dnfirst;
1223 ceph_assert(dn->last == fb.dnlast);
1224 }
1225 if (lump.is_importing())
1226 dn->state_set(CDentry::STATE_AUTH);
1227
1228 CInode *in = mds->mdcache->get_inode(fb.inode.ino, fb.dnlast);
1229 if (!in) {
// inode not cached: create it, evict whatever the dentry currently
// points at, then link the new inode as primary
1230 in = new CInode(mds->mdcache, dn->is_auth(), fb.dnfirst, fb.dnlast);
1231 fb.update_inode(mds, in);
1232 mds->mdcache->add_inode(in);
1233 if (!dn->get_linkage()->is_null()) {
1234 if (dn->get_linkage()->is_primary()) {
1235 unlinked[dn->get_linkage()->get_inode()] = dir;
1236 stringstream ss;
1237 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1238 << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino;
1239 dout(0) << ss.str() << dendl;
1240 mds->clog->warn(ss);
1241 }
1242 dir->unlink_inode(dn, false);
1243 }
1244 if (unlinked.count(in))
1245 linked.insert(in);
1246 dir->link_primary_inode(dn, in);
1247 dout(10) << "EMetaBlob.replay added " << *in << dendl;
1248 } else {
// inode already cached: refresh it, and if it hangs off a different
// dentry, detach it from there before linking it to dn
1249 in->first = fb.dnfirst;
1250 fb.update_inode(mds, in);
1251 if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
1252 dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
1253 unlinked[in] = in->get_parent_dir();
1254 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
1255 }
1256 if (dn->get_linkage()->get_inode() != in) {
1257 if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1258 if (dn->get_linkage()->is_primary()) {
1259 unlinked[dn->get_linkage()->get_inode()] = dir;
1260 stringstream ss;
1261 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1262 << " " << *dn->get_linkage()->get_inode() << " should be " << fb.inode.ino;
1263 dout(0) << ss.str() << dendl;
1264 mds->clog->warn(ss);
1265 }
1266 dir->unlink_inode(dn, false);
1267 }
1268 if (unlinked.count(in))
1269 linked.insert(in);
1270 dir->link_primary_inode(dn, in);
1271 dout(10) << "EMetaBlob.replay linked " << *in << dendl;
1272 } else {
1273 dout(10) << "EMetaBlob.replay for [" << fb.dnfirst << "," << fb.dnlast << "] had " << *in << dendl;
1274 }
1275 ceph_assert(in->first == fb.dnfirst ||
1276 (in->is_multiversion() && in->first > fb.dnfirst));
1277 }
1278 if (fb.is_dirty())
1279 in->_mark_dirty(logseg);
1280 if (fb.is_dirty_parent())
1281 in->mark_dirty_parent(logseg, fb.is_dirty_pool());
1282 if (fb.need_snapflush())
1283 logseg->open_files.push_back(&in->item_open_file);
// the inode's auth bit follows the dentry's
1284 if (dn->is_auth())
1285 in->state_set(CInode::STATE_AUTH);
1286 else
1287 in->state_clear(CInode::STATE_AUTH);
1288 ceph_assert(g_conf()->mds_kill_journal_replay_at != 2);
1289 }
1290
1291 // remote dentries
1292 for (const auto& rb : lump.get_dremote()) {
1293 CDentry *dn = dir->lookup_exact_snap(rb.dn, rb.dnlast);
1294 if (!dn) {
1295 dn = dir->add_remote_dentry(rb.dn, rb.ino, rb.d_type, rb.dnfirst, rb.dnlast);
1296 dn->set_version(rb.dnv);
1297 if (rb.dirty) dn->_mark_dirty(logseg);
1298 dout(10) << "EMetaBlob.replay added " << *dn << dendl;
1299 } else {
1300 if (!dn->get_linkage()->is_null()) {
1301 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1302 if (dn->get_linkage()->is_primary()) {
1303 unlinked[dn->get_linkage()->get_inode()] = dir;
// NOTE(review): unlike the two dfull cases above, this message goes to
// dout only and is not passed to mds->clog->warn()
1304 stringstream ss;
1305 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1306 << " " << *dn->get_linkage()->get_inode() << " should be remote " << rb.ino;
1307 dout(0) << ss.str() << dendl;
1308 }
1309 dir->unlink_inode(dn, false);
1310 }
1311 dir->link_remote_inode(dn, rb.ino, rb.d_type);
1312 dn->set_version(rb.dnv);
1313 if (rb.dirty) dn->_mark_dirty(logseg);
1314 dout(10) << "EMetaBlob.replay for [" << rb.dnfirst << "," << rb.dnlast << "] had " << *dn << dendl;
1315 dn->first = rb.dnfirst;
1316 ceph_assert(dn->last == rb.dnlast);
1317 }
1318 if (lump.is_importing())
1319 dn->state_set(CDentry::STATE_AUTH);
1320 }
1321
1322 // null dentries
1323 for (const auto& nb : lump.get_dnull()) {
1324 CDentry *dn = dir->lookup_exact_snap(nb.dn, nb.dnlast);
1325 if (!dn) {
1326 dn = dir->add_null_dentry(nb.dn, nb.dnfirst, nb.dnlast);
1327 dn->set_version(nb.dnv);
1328 if (nb.dirty) dn->_mark_dirty(logseg);
1329 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl;
1330 } else {
1331 dn->first = nb.dnfirst;
1332 if (!dn->get_linkage()->is_null()) {
1333 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1334 CInode *in = dn->get_linkage()->get_inode();
1335 // For renamed inode, We may call CInode::force_dirfrag() later.
1336 // CInode::force_dirfrag() doesn't work well when inode is detached
1337 // from the hierarchy.
1338 if (!renamed_diri || renamed_diri != in) {
1339 if (dn->get_linkage()->is_primary())
1340 unlinked[in] = dir;
1341 dir->unlink_inode(dn);
1342 }
1343 }
1344 dn->set_version(nb.dnv);
1345 if (nb.dirty) dn->_mark_dirty(logseg);
1346 dout(10) << "EMetaBlob.replay had " << *dn << dendl;
1347 ceph_assert(dn->last == nb.dnlast);
1348 }
// remember the dir of the most recent null dentry; used below as the rename
// source dir unless unlinked[renamed_diri] replaces it
1349 olddir = dir;
1350 if (lump.is_importing())
1351 dn->state_set(CDentry::STATE_AUTH);
1352
1353 // Make null dentries the first things we trim
// NOTE(review): only a log line follows; no LRU call is made here
1354 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl;
1355 }
1356 }
1357
1358 ceph_assert(g_conf()->mds_kill_journal_replay_at != 3);
1359
// directory rename: fix up subtree bounds now that all dentries are relinked
1360 if (renamed_dirino) {
1361 if (renamed_diri) {
// we had the inode before replaying: it must have been both unlinked and relinked above
1362 ceph_assert(unlinked.count(renamed_diri));
1363 ceph_assert(linked.count(renamed_diri));
1364 olddir = unlinked[renamed_diri];
1365 } else {
1366 // we imported a diri we haven't seen before
1367 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1368 ceph_assert(renamed_diri); // it was in the metablob
1369 }
1370
1371 if (olddir) {
1372 if (olddir->authority() != CDIR_AUTH_UNDEF &&
1373 renamed_diri->authority() == CDIR_AUTH_UNDEF) {
1374 ceph_assert(slaveup); // auth to non-auth, must be slave prepare
1375 frag_vec_t leaves;
1376 renamed_diri->dirfragtree.get_leaves(leaves);
1377 for (const auto& leaf : leaves) {
1378 CDir *dir = renamed_diri->get_dirfrag(leaf);
1379 ceph_assert(dir);
1380 if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
1381 // preserve subtree bound until slave commit
1382 slaveup->olddirs.insert(dir->inode);
1383 else
1384 dir->state_set(CDir::STATE_AUTH);
1385 }
1386 }
1387
1388 mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
1389
1390 // see if we can discard the subtree we renamed out of
1391 CDir *root = mds->mdcache->get_subtree_root(olddir);
1392 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
1393 if (slaveup) // preserve the old dir until slave commit
1394 slaveup->olddirs.insert(olddir->inode);
1395 else
1396 mds->mdcache->try_trim_non_auth_subtree(root);
1397 }
1398 }
1399
1400 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1401 if (renamed_diri->authority() != CDIR_AUTH_UNDEF) {
1402 for (const auto& p : renamed_dir_frags) {
1403 CDir *dir = renamed_diri->get_dirfrag(p);
1404 if (dir) {
1405 // we already had the inode before, and we already adjusted this subtree accordingly.
1406 dout(10) << " already had+adjusted rename import bound " << *dir << dendl;
1407 ceph_assert(olddir);
1408 continue;
1409 }
1410 dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, p);
1411 dout(10) << " creating new rename import bound " << *dir << dendl;
1412 dir->state_clear(CDir::STATE_AUTH);
1413 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
1414 }
1415 }
1416
1417 // rename may overwrite an empty directory and move it into stray dir.
// any other inode that was unlinked and relinked must be a directory (asserted);
// adjust its subtree as well
1418 unlinked.erase(renamed_diri);
1419 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1420 if (!linked.count(p->first))
1421 continue;
1422 ceph_assert(p->first->is_dir());
1423 mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);
1424 }
1425 }
1426
// inodes unlinked and never relinked: hand them to slaveup if given,
// otherwise remove them from the cache recursively
1427 if (!unlinked.empty()) {
1428 for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p)
1429 unlinked.erase(*p);
1430 dout(10) << " unlinked set contains " << unlinked << dendl;
1431 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1432 CInode *in = p->first;
1433 if (slaveup) { // preserve unlinked inodes until slave commit
1434 slaveup->unlinked.insert(in);
1435 if (in->snaprealm)
1436 in->snaprealm->adjust_parent();
1437 } else
1438 mds->mdcache->remove_inode_recursive(in);
1439 }
1440 }
1441
1442 // table client transactions
1443 for (const auto& p : table_tids) {
1444 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p.first)
1445 << " transaction " << p.second << dendl;
1446 MDSTableClient *client = mds->get_table_client(p.first);
1447 if (client)
1448 client->got_journaled_agree(p.second, logseg);
1449 }
1450
1451 // opened ino?
1452 if (opened_ino) {
1453 CInode *in = mds->mdcache->get_inode(opened_ino);
1454 ceph_assert(in);
1455 dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl;
1456 logseg->open_files.push_back(&in->item_open_file);
1457 }
1458
1459 // allocated_inos
// skipped when the inotable is already at or past this event's version
1460 if (inotablev) {
1461 if (mds->inotable->get_version() >= inotablev) {
1462 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1463 << " <= table " << mds->inotable->get_version() << dendl;
1464 } else {
1465 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1466 << " - 1 == table " << mds->inotable->get_version()
1467 << " allocated+used " << allocated_ino
1468 << " prealloc " << preallocated_inos
1469 << dendl;
1470 if (allocated_ino)
1471 mds->inotable->replay_alloc_id(allocated_ino);
1472 if (preallocated_inos.size())
1473 mds->inotable->replay_alloc_ids(preallocated_inos);
1474
1475 // [repair bad inotable updates]
1476 if (inotablev > mds->inotable->get_version()) {
1477 mds->clog->error() << "journal replay inotablev mismatch "
1478 << mds->inotable->get_version() << " -> " << inotablev;
1479 mds->inotable->force_replay_version(inotablev);
1480 }
1481
1482 ceph_assert(inotablev == mds->inotable->get_version());
1483 }
1484 }
1485 if (sessionmapv) {
// expected sessionmap version step: 2 when this event both consumed a
// preallocated ino and added new preallocated inos, otherwise 1
1486 unsigned diff = (used_preallocated_ino && !preallocated_inos.empty()) ? 2 : 1;
1487 if (mds->sessionmap.get_version() >= sessionmapv) {
1488 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1489 << " <= table " << mds->sessionmap.get_version() << dendl;
1490 } else if (mds->sessionmap.get_version() + diff == sessionmapv) {
1491 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1492 << " - " << diff << " == table " << mds->sessionmap.get_version()
1493 << " prealloc " << preallocated_inos
1494 << " used " << used_preallocated_ino
1495 << dendl;
1496 Session *session = mds->sessionmap.get_session(client_name);
1497 if (session) {
1498 dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl;
1499 if (used_preallocated_ino) {
1500 if (!session->info.prealloc_inos.empty()) {
1501 inodeno_t i = session->take_ino(used_preallocated_ino);
1502 ceph_assert(i == used_preallocated_ino);
1503 session->info.used_inos.clear();
1504 }
1505 mds->sessionmap.replay_dirty_session(session);
1506 }
1507 if (!preallocated_inos.empty()) {
1508 session->info.prealloc_inos.insert(preallocated_inos);
1509 mds->sessionmap.replay_dirty_session(session);
1510 }
1511
1512 } else {
// no session to update: still advance the sessionmap version once per step
1513 dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
1514 if (used_preallocated_ino)
1515 mds->sessionmap.replay_advance_version();
1516
1517 if (!preallocated_inos.empty())
1518 mds->sessionmap.replay_advance_version();
1519 }
1520 ceph_assert(sessionmapv == mds->sessionmap.get_version());
1521 } else {
// version gap: only tolerated when mds_wipe_sessions is set (asserted);
// the sessionmap is then wiped and forced to this event's version
1522 mds->clog->error() << "EMetaBlob.replay sessionmap v " << sessionmapv
1523 << " - " << diff << " > table " << mds->sessionmap.get_version();
1524 ceph_assert(g_conf()->mds_wipe_sessions);
1525 mds->sessionmap.wipe();
1526 mds->sessionmap.set_version(sessionmapv);
1527 }
1528 }
1529
1530 // truncating inodes
1531 for (const auto& ino : truncate_start) {
1532 CInode *in = mds->mdcache->get_inode(ino);
1533 ceph_assert(in);
1534 mds->mdcache->add_recovered_truncate(in, logseg);
1535 }
// truncate_finish entries are (ino, segment) pairs; entries whose segment
// is not returned by mdlog->get_segment() are skipped
1536 for (const auto& p : truncate_finish) {
1537 LogSegment *ls = mds->mdlog->get_segment(p.second);
1538 if (ls) {
1539 CInode *in = mds->mdcache->get_inode(p.first);
1540 ceph_assert(in);
1541 mds->mdcache->remove_recovered_truncate(in, ls);
1542 }
1543 }
1544
1545 // destroyed inodes
1546 if (!destroyed_inodes.empty()) {
1547 for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
1548 p != destroyed_inodes.end();
1549 ++p) {
1550 CInode *in = mds->mdcache->get_inode(*p);
1551 if (in) {
1552 dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
// fetch the parent dentry before removing the inode so it can be checked afterwards
1553 CDentry *parent = in->get_parent_dn();
1554 mds->mdcache->remove_inode(in);
1555 if (parent) {
1556 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
1557 ceph_assert(parent->get_linkage()->is_null());
1558 }
1559 } else {
1560 dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
1561 }
1562 }
1563 mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes);
1564 }
1565
1566 // client requests
1567 for (const auto& p : client_reqs) {
1568 if (p.first.name.is_client()) {
1569 dout(10) << "EMetaBlob.replay request " << p.first << " trim_to " << p.second << dendl;
1570 inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino;
1571 // if we allocated an inode, there should be exactly one client request id.
1572 ceph_assert(created == inodeno_t() || client_reqs.size() == 1);
1573
1574 Session *session = mds->sessionmap.get_session(p.first.name);
1575 if (session) {
1576 session->add_completed_request(p.first.tid, created);
1577 if (p.second)
1578 session->trim_completed_requests(p.second);
1579 }
1580 }
1581 }
1582
1583 // client flushes
1584 for (const auto& p : client_flushes) {
1585 if (p.first.name.is_client()) {
1586 dout(10) << "EMetaBlob.replay flush " << p.first << " trim_to " << p.second << dendl;
1587 Session *session = mds->sessionmap.get_session(p.first.name);
1588 if (session) {
1589 session->add_completed_flush(p.first.tid);
1590 if (p.second)
1591 session->trim_completed_flushes(p.second);
1592 }
1593 }
1594 }
1595
1596 // update segment
1597 update_segment(logseg);
1598
1599 ceph_assert(g_conf()->mds_kill_journal_replay_at != 4);
1600 }
1601
1602 // -----------------------
1603 // EPurged
1604 void EPurged::update_segment()
1605 {
1606 if (inos.size() && inotablev)
1607 get_segment()->inotablev = inotablev;
1608 return;
1609 }
1610
1611 void EPurged::replay(MDSRank *mds)
1612 {
1613 if (inos.size()) {
1614 LogSegment *ls = mds->mdlog->get_segment(seq);
1615 if (ls) {
1616 ls->purge_inodes.subtract(inos);
1617 }
1618 if (mds->inotable->get_version() >= inotablev) {
1619 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1620 << " >= " << inotablev << ", noop" << dendl;
1621 } else {
1622 dout(10) << "EPurged.replay inotable " << mds->inotable->get_version()
1623 << " < " << inotablev << " " << dendl;
1624 mds->inotable->replay_release_ids(inos);
1625 assert(mds->inotable->get_version() == inotablev);
1626 }
1627 }
1628 update_segment();
1629 }
1630
1631 void EPurged::encode(bufferlist& bl, uint64_t features) const
1632 {
1633 ENCODE_START(1, 1, bl);
1634 encode(inos, bl);
1635 encode(inotablev, bl);
1636 encode(seq, bl);
1637 ENCODE_FINISH(bl);
1638 }
1639
1640 void EPurged::decode(bufferlist::const_iterator& bl)
1641 {
1642 DECODE_START(1, bl);
1643 decode(inos, bl);
1644 decode(inotablev, bl);
1645 decode(seq, bl);
1646 DECODE_FINISH(bl);
1647 }
1648
1649 void EPurged::dump(Formatter *f) const
1650 {
1651 f->dump_stream("inos") << inos;
1652 f->dump_int("inotable version", inotablev);
1653 f->dump_int("segment seq", seq);
1654 }
1655
1656 // -----------------------
1657 // ESession
1658
1659 void ESession::update_segment()
1660 {
1661 get_segment()->sessionmapv = cmapv;
1662 if (inos.size() && inotablev)
1663 get_segment()->inotablev = inotablev;
1664 }
1665
1666 void ESession::replay(MDSRank *mds)
1667 {
1668 if (purge_inos.size())
1669 get_segment()->purge_inodes.insert(purge_inos);
1670
1671 if (mds->sessionmap.get_version() >= cmapv) {
1672 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1673 << " >= " << cmapv << ", noop" << dendl;
1674 } else if (mds->sessionmap.get_version() + 1 == cmapv) {
1675 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1676 << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
1677 Session *session;
1678 if (open) {
1679 session = mds->sessionmap.get_or_add_session(client_inst);
1680 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1681 session->set_client_metadata(client_metadata);
1682 dout(10) << " opened session " << session->info.inst << dendl;
1683 } else {
1684 session = mds->sessionmap.get_session(client_inst.name);
1685 if (session) { // there always should be a session, but there's a bug
1686 if (session->get_connection() == NULL) {
1687 dout(10) << " removed session " << session->info.inst << dendl;
1688 mds->sessionmap.remove_session(session);
1689 session = NULL;
1690 } else {
1691 session->clear(); // the client has reconnected; keep the Session, but reset
1692 dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
1693 }
1694 } else {
1695 mds->clog->error() << "replayed stray Session close event for " << client_inst
1696 << " from time " << stamp << ", ignoring";
1697 }
1698 }
1699 if (session) {
1700 mds->sessionmap.replay_dirty_session(session);
1701 } else {
1702 mds->sessionmap.replay_advance_version();
1703 }
1704 ceph_assert(mds->sessionmap.get_version() == cmapv);
1705 } else {
1706 mds->clog->error() << "ESession.replay sessionmap v " << cmapv
1707 << " - 1 > table " << mds->sessionmap.get_version();
1708 ceph_assert(g_conf()->mds_wipe_sessions);
1709 mds->sessionmap.wipe();
1710 mds->sessionmap.set_version(cmapv);
1711 }
1712
1713 if (inos.size() && inotablev) {
1714 if (mds->inotable->get_version() >= inotablev) {
1715 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1716 << " >= " << inotablev << ", noop" << dendl;
1717 } else {
1718 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1719 << " < " << inotablev << " " << (open ? "add":"remove") << dendl;
1720 ceph_assert(!open); // for now
1721 mds->inotable->replay_release_ids(inos);
1722 ceph_assert(mds->inotable->get_version() == inotablev);
1723 }
1724 }
1725
1726 update_segment();
1727 }
1728
1729 void ESession::encode(bufferlist &bl, uint64_t features) const
1730 {
1731 ENCODE_START(6, 5, bl);
1732 encode(stamp, bl);
1733 encode(client_inst, bl, features);
1734 encode(open, bl);
1735 encode(cmapv, bl);
1736 encode(inos, bl);
1737 encode(inotablev, bl);
1738 encode(client_metadata, bl);
1739 encode(purge_inos, bl);
1740 ENCODE_FINISH(bl);
1741 }
1742
1743 void ESession::decode(bufferlist::const_iterator &bl)
1744 {
1745 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
1746 if (struct_v >= 2)
1747 decode(stamp, bl);
1748 decode(client_inst, bl);
1749 decode(open, bl);
1750 decode(cmapv, bl);
1751 decode(inos, bl);
1752 decode(inotablev, bl);
1753 if (struct_v == 4) {
1754 decode(client_metadata.kv_map, bl);
1755 } else if (struct_v >= 5) {
1756 decode(client_metadata, bl);
1757 }
1758 if (struct_v >= 6){
1759 decode(purge_inos, bl);
1760 }
1761
1762 DECODE_FINISH(bl);
1763 }
1764
1765 void ESession::dump(Formatter *f) const
1766 {
1767 f->dump_stream("client instance") << client_inst;
1768 f->dump_string("open", open ? "true" : "false");
1769 f->dump_int("client map version", cmapv);
1770 f->dump_stream("inos") << inos;
1771 f->dump_int("inotable version", inotablev);
1772 f->open_object_section("client_metadata");
1773 client_metadata.dump(f);
1774 f->close_section(); // client_metadata
1775 }
1776
1777 void ESession::generate_test_instances(std::list<ESession*>& ls)
1778 {
1779 ls.push_back(new ESession);
1780 }
1781
1782 // -----------------------
1783 // ESessions
1784
1785 void ESessions::encode(bufferlist &bl, uint64_t features) const
1786 {
1787 ENCODE_START(2, 1, bl);
1788 encode(client_map, bl, features);
1789 encode(cmapv, bl);
1790 encode(stamp, bl);
1791 encode(client_metadata_map, bl);
1792 ENCODE_FINISH(bl);
1793 }
1794
1795 void ESessions::decode_old(bufferlist::const_iterator &bl)
1796 {
1797 using ceph::decode;
1798 decode(client_map, bl);
1799 decode(cmapv, bl);
1800 if (!bl.end())
1801 decode(stamp, bl);
1802 }
1803
1804 void ESessions::decode_new(bufferlist::const_iterator &bl)
1805 {
1806 DECODE_START(2, bl);
1807 decode(client_map, bl);
1808 decode(cmapv, bl);
1809 decode(stamp, bl);
1810 if (struct_v >= 2)
1811 decode(client_metadata_map, bl);
1812 DECODE_FINISH(bl);
1813 }
1814
1815 void ESessions::dump(Formatter *f) const
1816 {
1817 f->dump_int("client map version", cmapv);
1818
1819 f->open_array_section("client map");
1820 for (map<client_t,entity_inst_t>::const_iterator i = client_map.begin();
1821 i != client_map.end(); ++i) {
1822 f->open_object_section("client");
1823 f->dump_int("client id", i->first.v);
1824 f->dump_stream("client entity") << i->second;
1825 f->close_section(); // client
1826 }
1827 f->close_section(); // client map
1828 }
1829
1830 void ESessions::generate_test_instances(std::list<ESessions*>& ls)
1831 {
1832 ls.push_back(new ESessions());
1833 }
1834
1835 void ESessions::update_segment()
1836 {
1837 get_segment()->sessionmapv = cmapv;
1838 }
1839
1840 void ESessions::replay(MDSRank *mds)
1841 {
1842 if (mds->sessionmap.get_version() >= cmapv) {
1843 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1844 << " >= " << cmapv << ", noop" << dendl;
1845 } else {
1846 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1847 << " < " << cmapv << dendl;
1848 mds->sessionmap.replay_open_sessions(cmapv, client_map, client_metadata_map);
1849 }
1850 update_segment();
1851 }
1852
1853
1854 // -----------------------
1855 // ETableServer
1856
1857 void ETableServer::encode(bufferlist& bl, uint64_t features) const
1858 {
1859 ENCODE_START(3, 3, bl);
1860 encode(stamp, bl);
1861 encode(table, bl);
1862 encode(op, bl);
1863 encode(reqid, bl);
1864 encode(bymds, bl);
1865 encode(mutation, bl);
1866 encode(tid, bl);
1867 encode(version, bl);
1868 ENCODE_FINISH(bl);
1869 }
1870
1871 void ETableServer::decode(bufferlist::const_iterator &bl)
1872 {
1873 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
1874 if (struct_v >= 2)
1875 decode(stamp, bl);
1876 decode(table, bl);
1877 decode(op, bl);
1878 decode(reqid, bl);
1879 decode(bymds, bl);
1880 decode(mutation, bl);
1881 decode(tid, bl);
1882 decode(version, bl);
1883 DECODE_FINISH(bl);
1884 }
1885
1886 void ETableServer::dump(Formatter *f) const
1887 {
1888 f->dump_int("table id", table);
1889 f->dump_int("op", op);
1890 f->dump_int("request id", reqid);
1891 f->dump_int("by mds", bymds);
1892 f->dump_int("tid", tid);
1893 f->dump_int("version", version);
1894 }
1895
1896 void ETableServer::generate_test_instances(std::list<ETableServer*>& ls)
1897 {
1898 ls.push_back(new ETableServer());
1899 }
1900
1901
1902 void ETableServer::update_segment()
1903 {
1904 get_segment()->tablev[table] = version;
1905 }
1906
1907 void ETableServer::replay(MDSRank *mds)
1908 {
1909 MDSTableServer *server = mds->get_table_server(table);
1910 if (!server)
1911 return;
1912
1913 if (server->get_version() >= version) {
1914 dout(10) << "ETableServer.replay " << get_mdstable_name(table)
1915 << " " << get_mdstableserver_opname(op)
1916 << " event " << version
1917 << " <= table " << server->get_version() << dendl;
1918 return;
1919 }
1920
1921 dout(10) << " ETableServer.replay " << get_mdstable_name(table)
1922 << " " << get_mdstableserver_opname(op)
1923 << " event " << version << " - 1 == table " << server->get_version() << dendl;
1924 ceph_assert(version-1 == server->get_version());
1925
1926 switch (op) {
1927 case TABLESERVER_OP_PREPARE: {
1928 server->_note_prepare(bymds, reqid, true);
1929 bufferlist out;
1930 server->_prepare(mutation, reqid, bymds, out);
1931 mutation = std::move(out);
1932 break;
1933 }
1934 case TABLESERVER_OP_COMMIT:
1935 server->_commit(tid, ref_t<MMDSTableRequest>());
1936 server->_note_commit(tid, true);
1937 break;
1938 case TABLESERVER_OP_ROLLBACK:
1939 server->_rollback(tid);
1940 server->_note_rollback(tid, true);
1941 break;
1942 case TABLESERVER_OP_SERVER_UPDATE:
1943 server->_server_update(mutation);
1944 server->_note_server_update(mutation, true);
1945 break;
1946 default:
1947 mds->clog->error() << "invalid tableserver op in ETableServer";
1948 mds->damaged();
1949 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1950 }
1951
1952 ceph_assert(version == server->get_version());
1953 update_segment();
1954 }
1955
1956
1957 // ---------------------
1958 // ETableClient
1959
1960 void ETableClient::encode(bufferlist& bl, uint64_t features) const
1961 {
1962 ENCODE_START(3, 3, bl);
1963 encode(stamp, bl);
1964 encode(table, bl);
1965 encode(op, bl);
1966 encode(tid, bl);
1967 ENCODE_FINISH(bl);
1968 }
1969
1970 void ETableClient::decode(bufferlist::const_iterator &bl)
1971 {
1972 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
1973 if (struct_v >= 2)
1974 decode(stamp, bl);
1975 decode(table, bl);
1976 decode(op, bl);
1977 decode(tid, bl);
1978 DECODE_FINISH(bl);
1979 }
1980
1981 void ETableClient::dump(Formatter *f) const
1982 {
1983 f->dump_int("table", table);
1984 f->dump_int("op", op);
1985 f->dump_int("tid", tid);
1986 }
1987
1988 void ETableClient::generate_test_instances(std::list<ETableClient*>& ls)
1989 {
1990 ls.push_back(new ETableClient());
1991 }
1992
1993 void ETableClient::replay(MDSRank *mds)
1994 {
1995 dout(10) << " ETableClient.replay " << get_mdstable_name(table)
1996 << " op " << get_mdstableserver_opname(op)
1997 << " tid " << tid << dendl;
1998
1999 MDSTableClient *client = mds->get_table_client(table);
2000 if (!client)
2001 return;
2002
2003 ceph_assert(op == TABLESERVER_OP_ACK);
2004 client->got_journaled_ack(tid);
2005 }
2006
2007
2008 // -----------------------
2009 // ESnap
2010 /*
2011 void ESnap::update_segment()
2012 {
2013 get_segment()->tablev[TABLE_SNAP] = version;
2014 }
2015
2016 void ESnap::replay(MDSRank *mds)
2017 {
2018 if (mds->snaptable->get_version() >= version) {
2019 dout(10) << "ESnap.replay event " << version
2020 << " <= table " << mds->snaptable->get_version() << dendl;
2021 return;
2022 }
2023
2024 dout(10) << " ESnap.replay event " << version
2025 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2026 ceph_assert(version-1 == mds->snaptable->get_version());
2027
2028 if (create) {
2029 version_t v;
2030 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2031 ceph_assert(s == snap.snapid);
2032 } else {
2033 mds->snaptable->remove(snap.snapid);
2034 }
2035
2036 ceph_assert(version == mds->snaptable->get_version());
2037 }
2038 */
2039
2040
2041
2042 // -----------------------
2043 // EUpdate
2044
2045 void EUpdate::encode(bufferlist &bl, uint64_t features) const
2046 {
2047 ENCODE_START(4, 4, bl);
2048 encode(stamp, bl);
2049 encode(type, bl);
2050 encode(metablob, bl, features);
2051 encode(client_map, bl);
2052 encode(cmapv, bl);
2053 encode(reqid, bl);
2054 encode(had_slaves, bl);
2055 ENCODE_FINISH(bl);
2056 }
2057
2058 void EUpdate::decode(bufferlist::const_iterator &bl)
2059 {
2060 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
2061 if (struct_v >= 2)
2062 decode(stamp, bl);
2063 decode(type, bl);
2064 decode(metablob, bl);
2065 decode(client_map, bl);
2066 if (struct_v >= 3)
2067 decode(cmapv, bl);
2068 decode(reqid, bl);
2069 decode(had_slaves, bl);
2070 DECODE_FINISH(bl);
2071 }
2072
2073 void EUpdate::dump(Formatter *f) const
2074 {
2075 f->open_object_section("metablob");
2076 metablob.dump(f);
2077 f->close_section(); // metablob
2078
2079 f->dump_string("type", type);
2080 f->dump_int("client map length", client_map.length());
2081 f->dump_int("client map version", cmapv);
2082 f->dump_stream("reqid") << reqid;
2083 f->dump_string("had slaves", had_slaves ? "true" : "false");
2084 }
2085
2086 void EUpdate::generate_test_instances(std::list<EUpdate*>& ls)
2087 {
2088 ls.push_back(new EUpdate());
2089 }
2090
2091
2092 void EUpdate::update_segment()
2093 {
2094 auto&& segment = get_segment();
2095 metablob.update_segment(segment);
2096
2097 if (client_map.length())
2098 segment->sessionmapv = cmapv;
2099
2100 if (had_slaves)
2101 segment->uncommitted_masters.insert(reqid);
2102 }
2103
2104 void EUpdate::replay(MDSRank *mds)
2105 {
2106 auto&& segment = get_segment();
2107 metablob.replay(mds, segment);
2108
2109 if (had_slaves) {
2110 dout(10) << "EUpdate.replay " << reqid << " had slaves, expecting a matching ECommitted" << dendl;
2111 segment->uncommitted_masters.insert(reqid);
2112 set<mds_rank_t> slaves;
2113 mds->mdcache->add_uncommitted_master(reqid, segment, slaves, true);
2114 }
2115
2116 if (client_map.length()) {
2117 if (mds->sessionmap.get_version() >= cmapv) {
2118 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2119 << " <= table " << mds->sessionmap.get_version() << dendl;
2120 } else {
2121 dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version()
2122 << " < " << cmapv << dendl;
2123 // open client sessions?
2124 map<client_t,entity_inst_t> cm;
2125 map<client_t,client_metadata_t> cmm;
2126 auto blp = client_map.cbegin();
2127 using ceph::decode;
2128 decode(cm, blp);
2129 if (!blp.end())
2130 decode(cmm, blp);
2131 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
2132 }
2133 }
2134 update_segment();
2135 }
2136
2137
2138 // ------------------------
2139 // EOpen
2140
2141 void EOpen::encode(bufferlist &bl, uint64_t features) const {
2142 ENCODE_START(4, 3, bl);
2143 encode(stamp, bl);
2144 encode(metablob, bl, features);
2145 encode(inos, bl);
2146 encode(snap_inos, bl);
2147 ENCODE_FINISH(bl);
2148 }
2149
2150 void EOpen::decode(bufferlist::const_iterator &bl) {
2151 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2152 if (struct_v >= 2)
2153 decode(stamp, bl);
2154 decode(metablob, bl);
2155 decode(inos, bl);
2156 if (struct_v >= 4)
2157 decode(snap_inos, bl);
2158 DECODE_FINISH(bl);
2159 }
2160
2161 void EOpen::dump(Formatter *f) const
2162 {
2163 f->open_object_section("metablob");
2164 metablob.dump(f);
2165 f->close_section(); // metablob
2166 f->open_array_section("inos involved");
2167 for (vector<inodeno_t>::const_iterator i = inos.begin();
2168 i != inos.end(); ++i) {
2169 f->dump_int("ino", *i);
2170 }
2171 f->close_section(); // inos
2172 }
2173
2174 void EOpen::generate_test_instances(std::list<EOpen*>& ls)
2175 {
2176 ls.push_back(new EOpen());
2177 ls.push_back(new EOpen());
2178 ls.back()->add_ino(0);
2179 }
2180
2181 void EOpen::update_segment()
2182 {
2183 // ??
2184 }
2185
2186 void EOpen::replay(MDSRank *mds)
2187 {
2188 dout(10) << "EOpen.replay " << dendl;
2189 auto&& segment = get_segment();
2190 metablob.replay(mds, segment);
2191
2192 // note which segments inodes belong to, so we don't have to start rejournaling them
2193 for (const auto &ino : inos) {
2194 CInode *in = mds->mdcache->get_inode(ino);
2195 if (!in) {
2196 dout(0) << "EOpen.replay ino " << ino << " not in metablob" << dendl;
2197 ceph_assert(in);
2198 }
2199 segment->open_files.push_back(&in->item_open_file);
2200 }
2201 for (const auto &vino : snap_inos) {
2202 CInode *in = mds->mdcache->get_inode(vino);
2203 if (!in) {
2204 dout(0) << "EOpen.replay ino " << vino << " not in metablob" << dendl;
2205 ceph_assert(in);
2206 }
2207 segment->open_files.push_back(&in->item_open_file);
2208 }
2209 }
2210
2211
2212 // -----------------------
2213 // ECommitted
2214
2215 void ECommitted::replay(MDSRank *mds)
2216 {
2217 if (mds->mdcache->uncommitted_masters.count(reqid)) {
2218 dout(10) << "ECommitted.replay " << reqid << dendl;
2219 mds->mdcache->uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2220 mds->mdcache->uncommitted_masters.erase(reqid);
2221 } else {
2222 dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl;
2223 }
2224 }
2225
2226 void ECommitted::encode(bufferlist& bl, uint64_t features) const
2227 {
2228 ENCODE_START(3, 3, bl);
2229 encode(stamp, bl);
2230 encode(reqid, bl);
2231 ENCODE_FINISH(bl);
2232 }
2233
2234 void ECommitted::decode(bufferlist::const_iterator& bl)
2235 {
2236 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2237 if (struct_v >= 2)
2238 decode(stamp, bl);
2239 decode(reqid, bl);
2240 DECODE_FINISH(bl);
2241 }
2242
2243 void ECommitted::dump(Formatter *f) const {
2244 f->dump_stream("stamp") << stamp;
2245 f->dump_stream("reqid") << reqid;
2246 }
2247
2248 void ECommitted::generate_test_instances(std::list<ECommitted*>& ls)
2249 {
2250 ls.push_back(new ECommitted);
2251 ls.push_back(new ECommitted);
2252 ls.back()->stamp = utime_t(1, 2);
2253 ls.back()->reqid = metareqid_t(entity_name_t::CLIENT(123), 456);
2254 }
2255
2256 // -----------------------
2257 // ESlaveUpdate
2258
2259 void link_rollback::encode(bufferlist &bl) const
2260 {
2261 ENCODE_START(3, 2, bl);
2262 encode(reqid, bl);
2263 encode(ino, bl);
2264 encode(was_inc, bl);
2265 encode(old_ctime, bl);
2266 encode(old_dir_mtime, bl);
2267 encode(old_dir_rctime, bl);
2268 encode(snapbl, bl);
2269 ENCODE_FINISH(bl);
2270 }
2271
2272 void link_rollback::decode(bufferlist::const_iterator &bl)
2273 {
2274 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
2275 decode(reqid, bl);
2276 decode(ino, bl);
2277 decode(was_inc, bl);
2278 decode(old_ctime, bl);
2279 decode(old_dir_mtime, bl);
2280 decode(old_dir_rctime, bl);
2281 if (struct_v >= 3)
2282 decode(snapbl, bl);
2283 DECODE_FINISH(bl);
2284 }
2285
2286 void link_rollback::dump(Formatter *f) const
2287 {
2288 f->dump_stream("metareqid") << reqid;
2289 f->dump_int("ino", ino);
2290 f->dump_string("was incremented", was_inc ? "true" : "false");
2291 f->dump_stream("old_ctime") << old_ctime;
2292 f->dump_stream("old_dir_mtime") << old_dir_mtime;
2293 f->dump_stream("old_dir_rctime") << old_dir_rctime;
2294 }
2295
2296 void link_rollback::generate_test_instances(std::list<link_rollback*>& ls)
2297 {
2298 ls.push_back(new link_rollback());
2299 }
2300
2301 void rmdir_rollback::encode(bufferlist& bl) const
2302 {
2303 ENCODE_START(3, 2, bl);
2304 encode(reqid, bl);
2305 encode(src_dir, bl);
2306 encode(src_dname, bl);
2307 encode(dest_dir, bl);
2308 encode(dest_dname, bl);
2309 encode(snapbl, bl);
2310 ENCODE_FINISH(bl);
2311 }
2312
2313 void rmdir_rollback::decode(bufferlist::const_iterator& bl)
2314 {
2315 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
2316 decode(reqid, bl);
2317 decode(src_dir, bl);
2318 decode(src_dname, bl);
2319 decode(dest_dir, bl);
2320 decode(dest_dname, bl);
2321 if (struct_v >= 3)
2322 decode(snapbl, bl);
2323 DECODE_FINISH(bl);
2324 }
2325
2326 void rmdir_rollback::dump(Formatter *f) const
2327 {
2328 f->dump_stream("metareqid") << reqid;
2329 f->dump_stream("source directory") << src_dir;
2330 f->dump_string("source dname", src_dname);
2331 f->dump_stream("destination directory") << dest_dir;
2332 f->dump_string("destination dname", dest_dname);
2333 }
2334
2335 void rmdir_rollback::generate_test_instances(std::list<rmdir_rollback*>& ls)
2336 {
2337 ls.push_back(new rmdir_rollback());
2338 }
2339
2340 void rename_rollback::drec::encode(bufferlist &bl) const
2341 {
2342 ENCODE_START(2, 2, bl);
2343 encode(dirfrag, bl);
2344 encode(dirfrag_old_mtime, bl);
2345 encode(dirfrag_old_rctime, bl);
2346 encode(ino, bl);
2347 encode(remote_ino, bl);
2348 encode(dname, bl);
2349 encode(remote_d_type, bl);
2350 encode(old_ctime, bl);
2351 ENCODE_FINISH(bl);
2352 }
2353
2354 void rename_rollback::drec::decode(bufferlist::const_iterator &bl)
2355 {
2356 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2357 decode(dirfrag, bl);
2358 decode(dirfrag_old_mtime, bl);
2359 decode(dirfrag_old_rctime, bl);
2360 decode(ino, bl);
2361 decode(remote_ino, bl);
2362 decode(dname, bl);
2363 decode(remote_d_type, bl);
2364 decode(old_ctime, bl);
2365 DECODE_FINISH(bl);
2366 }
2367
2368 void rename_rollback::drec::dump(Formatter *f) const
2369 {
2370 f->dump_stream("directory fragment") << dirfrag;
2371 f->dump_stream("directory old mtime") << dirfrag_old_mtime;
2372 f->dump_stream("directory old rctime") << dirfrag_old_rctime;
2373 f->dump_int("ino", ino);
2374 f->dump_int("remote ino", remote_ino);
2375 f->dump_string("dname", dname);
2376 uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries
2377 string type_string;
2378 switch(type) {
2379 case S_IFREG:
2380 type_string = "file"; break;
2381 case S_IFLNK:
2382 type_string = "symlink"; break;
2383 case S_IFDIR:
2384 type_string = "directory"; break;
2385 default:
2386 type_string = "UNKNOWN-" + stringify((int)type); break;
2387 }
2388 f->dump_string("remote dtype", type_string);
2389 f->dump_stream("old ctime") << old_ctime;
2390 }
2391
2392 void rename_rollback::drec::generate_test_instances(std::list<drec*>& ls)
2393 {
2394 ls.push_back(new drec());
2395 ls.back()->remote_d_type = IFTODT(S_IFREG);
2396 }
2397
2398 void rename_rollback::encode(bufferlist &bl) const
2399 {
2400 ENCODE_START(3, 2, bl);
2401 encode(reqid, bl);
2402 encode(orig_src, bl);
2403 encode(orig_dest, bl);
2404 encode(stray, bl);
2405 encode(ctime, bl);
2406 encode(srci_snapbl, bl);
2407 encode(desti_snapbl, bl);
2408 ENCODE_FINISH(bl);
2409 }
2410
2411 void rename_rollback::decode(bufferlist::const_iterator &bl)
2412 {
2413 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
2414 decode(reqid, bl);
2415 decode(orig_src, bl);
2416 decode(orig_dest, bl);
2417 decode(stray, bl);
2418 decode(ctime, bl);
2419 if (struct_v >= 3) {
2420 decode(srci_snapbl, bl);
2421 decode(desti_snapbl, bl);
2422 }
2423 DECODE_FINISH(bl);
2424 }
2425
2426 void rename_rollback::dump(Formatter *f) const
2427 {
2428 f->dump_stream("request id") << reqid;
2429 f->open_object_section("original src drec");
2430 orig_src.dump(f);
2431 f->close_section(); // original src drec
2432 f->open_object_section("original dest drec");
2433 orig_dest.dump(f);
2434 f->close_section(); // original dest drec
2435 f->open_object_section("stray drec");
2436 stray.dump(f);
2437 f->close_section(); // stray drec
2438 f->dump_stream("ctime") << ctime;
2439 }
2440
2441 void rename_rollback::generate_test_instances(std::list<rename_rollback*>& ls)
2442 {
2443 ls.push_back(new rename_rollback());
2444 ls.back()->orig_src.remote_d_type = IFTODT(S_IFREG);
2445 ls.back()->orig_dest.remote_d_type = IFTODT(S_IFREG);
2446 ls.back()->stray.remote_d_type = IFTODT(S_IFREG);
2447 }
2448
2449 void ESlaveUpdate::encode(bufferlist &bl, uint64_t features) const
2450 {
2451 ENCODE_START(3, 3, bl);
2452 encode(stamp, bl);
2453 encode(type, bl);
2454 encode(reqid, bl);
2455 encode(master, bl);
2456 encode(op, bl);
2457 encode(origop, bl);
2458 encode(commit, bl, features);
2459 encode(rollback, bl);
2460 ENCODE_FINISH(bl);
2461 }
2462
2463 void ESlaveUpdate::decode(bufferlist::const_iterator &bl)
2464 {
2465 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2466 if (struct_v >= 2)
2467 decode(stamp, bl);
2468 decode(type, bl);
2469 decode(reqid, bl);
2470 decode(master, bl);
2471 decode(op, bl);
2472 decode(origop, bl);
2473 decode(commit, bl);
2474 decode(rollback, bl);
2475 DECODE_FINISH(bl);
2476 }
2477
2478 void ESlaveUpdate::dump(Formatter *f) const
2479 {
2480 f->open_object_section("metablob");
2481 commit.dump(f);
2482 f->close_section(); // metablob
2483
2484 f->dump_int("rollback length", rollback.length());
2485 f->dump_string("type", type);
2486 f->dump_stream("metareqid") << reqid;
2487 f->dump_int("master", master);
2488 f->dump_int("op", op);
2489 f->dump_int("original op", origop);
2490 }
2491
2492 void ESlaveUpdate::generate_test_instances(std::list<ESlaveUpdate*>& ls)
2493 {
2494 ls.push_back(new ESlaveUpdate());
2495 }
2496
2497 void ESlaveUpdate::replay(MDSRank *mds)
2498 {
2499 MDSlaveUpdate *su;
2500 auto&& segment = get_segment();
2501 switch (op) {
2502 case ESlaveUpdate::OP_PREPARE:
2503 dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master
2504 << ": applying commit, saving rollback info" << dendl;
2505 su = new MDSlaveUpdate(origop, rollback);
2506 commit.replay(mds, segment, su);
2507 mds->mdcache->add_uncommitted_slave(reqid, segment, master, su);
2508 break;
2509
2510 case ESlaveUpdate::OP_COMMIT:
2511 dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl;
2512 mds->mdcache->finish_uncommitted_slave(reqid, false);
2513 break;
2514
2515 case ESlaveUpdate::OP_ROLLBACK:
2516 dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
2517 << ": applying rollback commit blob" << dendl;
2518 commit.replay(mds, segment);
2519 mds->mdcache->finish_uncommitted_slave(reqid, false);
2520 break;
2521
2522 default:
2523 mds->clog->error() << "invalid op in ESlaveUpdate";
2524 mds->damaged();
2525 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2526 }
2527 }
2528
2529
2530 // -----------------------
2531 // ESubtreeMap
2532
2533 void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const
2534 {
2535 ENCODE_START(6, 5, bl);
2536 encode(stamp, bl);
2537 encode(metablob, bl, features);
2538 encode(subtrees, bl);
2539 encode(ambiguous_subtrees, bl);
2540 encode(expire_pos, bl);
2541 encode(event_seq, bl);
2542 ENCODE_FINISH(bl);
2543 }
2544
2545 void ESubtreeMap::decode(bufferlist::const_iterator &bl)
2546 {
2547 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2548 if (struct_v >= 2)
2549 decode(stamp, bl);
2550 decode(metablob, bl);
2551 decode(subtrees, bl);
2552 if (struct_v >= 4)
2553 decode(ambiguous_subtrees, bl);
2554 if (struct_v >= 3)
2555 decode(expire_pos, bl);
2556 if (struct_v >= 6)
2557 decode(event_seq, bl);
2558 DECODE_FINISH(bl);
2559 }
2560
2561 void ESubtreeMap::dump(Formatter *f) const
2562 {
2563 f->open_object_section("metablob");
2564 metablob.dump(f);
2565 f->close_section(); // metablob
2566
2567 f->open_array_section("subtrees");
2568 for(map<dirfrag_t,vector<dirfrag_t> >::const_iterator i = subtrees.begin();
2569 i != subtrees.end(); ++i) {
2570 f->open_object_section("tree");
2571 f->dump_stream("root dirfrag") << i->first;
2572 for (vector<dirfrag_t>::const_iterator j = i->second.begin();
2573 j != i->second.end(); ++j) {
2574 f->dump_stream("bound dirfrag") << *j;
2575 }
2576 f->close_section(); // tree
2577 }
2578 f->close_section(); // subtrees
2579
2580 f->open_array_section("ambiguous subtrees");
2581 for(set<dirfrag_t>::const_iterator i = ambiguous_subtrees.begin();
2582 i != ambiguous_subtrees.end(); ++i) {
2583 f->dump_stream("dirfrag") << *i;
2584 }
2585 f->close_section(); // ambiguous subtrees
2586
2587 f->dump_int("expire position", expire_pos);
2588 }
2589
2590 void ESubtreeMap::generate_test_instances(std::list<ESubtreeMap*>& ls)
2591 {
2592 ls.push_back(new ESubtreeMap());
2593 }
2594
2595 void ESubtreeMap::replay(MDSRank *mds)
2596 {
2597 if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos())
2598 mds->mdlog->journaler->set_expire_pos(expire_pos);
2599
2600 // suck up the subtree map?
2601 if (mds->mdcache->is_subtrees()) {
2602 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
2603 int errors = 0;
2604
2605 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2606 p != subtrees.end();
2607 ++p) {
2608 CDir *dir = mds->mdcache->get_dirfrag(p->first);
2609 if (!dir) {
2610 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2611 << " subtree root " << p->first << " not in cache";
2612 ++errors;
2613 continue;
2614 }
2615
2616 if (!mds->mdcache->is_subtree(dir)) {
2617 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2618 << " subtree root " << p->first << " not a subtree in cache";
2619 ++errors;
2620 continue;
2621 }
2622 if (dir->get_dir_auth().first != mds->get_nodeid()) {
2623 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2624 << " subtree root " << p->first
2625 << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
2626 ++errors;
2627 continue;
2628 }
2629
2630 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
2631 mds->mdcache->get_force_dirfrag(*q, true);
2632
2633 set<CDir*> bounds;
2634 mds->mdcache->get_subtree_bounds(dir, bounds);
2635 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2636 CDir *b = mds->mdcache->get_dirfrag(*q);
2637 if (!b) {
2638 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2639 << " subtree " << p->first << " bound " << *q << " not in cache";
2640 ++errors;
2641 continue;
2642 }
2643 if (bounds.count(b) == 0) {
2644 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2645 << " subtree " << p->first << " bound " << *q << " not a bound in cache";
2646 ++errors;
2647 continue;
2648 }
2649 bounds.erase(b);
2650 }
2651 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) {
2652 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2653 << " subtree " << p->first << " has extra bound in cache " << (*q)->dirfrag();
2654 ++errors;
2655 }
2656
2657 if (ambiguous_subtrees.count(p->first)) {
2658 if (!mds->mdcache->have_ambiguous_import(p->first)) {
2659 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2660 << " subtree " << p->first << " is ambiguous but is not in our cache";
2661 ++errors;
2662 }
2663 } else {
2664 if (mds->mdcache->have_ambiguous_import(p->first)) {
2665 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2666 << " subtree " << p->first << " is not ambiguous but is in our cache";
2667 ++errors;
2668 }
2669 }
2670 }
2671
2672 std::vector<CDir*> dirs;
2673 mds->mdcache->get_subtrees(dirs);
2674 for (const auto& dir : dirs) {
2675 if (dir->get_dir_auth().first != mds->get_nodeid())
2676 continue;
2677 if (subtrees.count(dir->dirfrag()) == 0) {
2678 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2679 << " does not include cache subtree " << dir->dirfrag();
2680 ++errors;
2681 }
2682 }
2683
2684 if (errors) {
2685 dout(0) << "journal subtrees: " << subtrees << dendl;
2686 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
2687 mds->mdcache->show_subtrees();
2688 ceph_assert(!g_conf()->mds_debug_subtrees || errors == 0);
2689 }
2690 return;
2691 }
2692
2693 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;
2694
2695 // first, stick the spanning tree in my cache
2696 //metablob.print(*_dout);
2697 metablob.replay(mds, get_segment());
2698
2699 // restore import/export maps
2700 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2701 p != subtrees.end();
2702 ++p) {
2703 CDir *dir = mds->mdcache->get_dirfrag(p->first);
2704 ceph_assert(dir);
2705 if (ambiguous_subtrees.count(p->first)) {
2706 // ambiguous!
2707 mds->mdcache->add_ambiguous_import(p->first, p->second);
2708 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second,
2709 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2710 } else {
2711 // not ambiguous
2712 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
2713 }
2714 }
2715
2716 mds->mdcache->recalc_auth_bits(true);
2717
2718 mds->mdcache->show_subtrees();
2719 }
2720
2721
2722
2723 // -----------------------
2724 // EFragment
2725
2726 void EFragment::replay(MDSRank *mds)
2727 {
2728 dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;
2729
2730 std::vector<CDir*> resultfrags;
2731 MDSContext::vec waiters;
2732
2733 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2734 // it will be once we replay the metablob , but first we need to
2735 // refragment anything we already have in the cache.
2736 CInode *in = mds->mdcache->get_inode(ino);
2737
2738 auto&& segment = get_segment();
2739 switch (op) {
2740 case OP_PREPARE:
2741 mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, segment, &rollback);
2742
2743 if (in)
2744 mds->mdcache->adjust_dir_fragments(in, basefrag, bits, &resultfrags, waiters, true);
2745 break;
2746
2747 case OP_ROLLBACK: {
2748 frag_vec_t old_frags;
2749 if (in) {
2750 in->dirfragtree.get_leaves_under(basefrag, old_frags);
2751 if (orig_frags.empty()) {
2752 // old format EFragment
2753 mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, &resultfrags, waiters, true);
2754 } else {
2755 for (const auto& fg : orig_frags)
2756 mds->mdcache->force_dir_fragment(in, fg);
2757 }
2758 }
2759 mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), std::move(old_frags));
2760 break;
2761 }
2762
2763 case OP_COMMIT:
2764 case OP_FINISH:
2765 mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
2766 break;
2767
2768 default:
2769 ceph_abort();
2770 }
2771
2772 metablob.replay(mds, segment);
2773 if (in && g_conf()->mds_debug_frag)
2774 in->verify_dirfrags();
2775 }
2776
2777 void EFragment::encode(bufferlist &bl, uint64_t features) const {
2778 ENCODE_START(5, 4, bl);
2779 encode(stamp, bl);
2780 encode(op, bl);
2781 encode(ino, bl);
2782 encode(basefrag, bl);
2783 encode(bits, bl);
2784 encode(metablob, bl, features);
2785 encode(orig_frags, bl);
2786 encode(rollback, bl);
2787 ENCODE_FINISH(bl);
2788 }
2789
2790 void EFragment::decode(bufferlist::const_iterator &bl) {
2791 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
2792 if (struct_v >= 2)
2793 decode(stamp, bl);
2794 if (struct_v >= 3)
2795 decode(op, bl);
2796 decode(ino, bl);
2797 decode(basefrag, bl);
2798 decode(bits, bl);
2799 decode(metablob, bl);
2800 if (struct_v >= 5) {
2801 decode(orig_frags, bl);
2802 decode(rollback, bl);
2803 }
2804 DECODE_FINISH(bl);
2805 }
2806
2807 void EFragment::dump(Formatter *f) const
2808 {
2809 /*f->open_object_section("Metablob");
2810 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2811 f->close_section();*/
2812 f->dump_string("op", op_name(op));
2813 f->dump_stream("ino") << ino;
2814 f->dump_stream("base frag") << basefrag;
2815 f->dump_int("bits", bits);
2816 }
2817
2818 void EFragment::generate_test_instances(std::list<EFragment*>& ls)
2819 {
2820 ls.push_back(new EFragment);
2821 ls.push_back(new EFragment);
2822 ls.back()->op = OP_PREPARE;
2823 ls.back()->ino = 1;
2824 ls.back()->bits = 5;
2825 }
2826
2827 void dirfrag_rollback::encode(bufferlist &bl) const
2828 {
2829 ENCODE_START(1, 1, bl);
2830 encode(fnode, bl);
2831 ENCODE_FINISH(bl);
2832 }
2833
2834 void dirfrag_rollback::decode(bufferlist::const_iterator &bl)
2835 {
2836 DECODE_START(1, bl);
2837 decode(fnode, bl);
2838 DECODE_FINISH(bl);
2839 }
2840
2841
2842
2843 // =========================================================================
2844
2845 // -----------------------
2846 // EExport
2847
2848 void EExport::replay(MDSRank *mds)
2849 {
2850 dout(10) << "EExport.replay " << base << dendl;
2851 auto&& segment = get_segment();
2852 metablob.replay(mds, segment);
2853
2854 CDir *dir = mds->mdcache->get_dirfrag(base);
2855 ceph_assert(dir);
2856
2857 set<CDir*> realbounds;
2858 for (set<dirfrag_t>::iterator p = bounds.begin();
2859 p != bounds.end();
2860 ++p) {
2861 CDir *bd = mds->mdcache->get_dirfrag(*p);
2862 ceph_assert(bd);
2863 realbounds.insert(bd);
2864 }
2865
2866 // adjust auth away
2867 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF);
2868
2869 mds->mdcache->try_trim_non_auth_subtree(dir);
2870 }
2871
2872 void EExport::encode(bufferlist& bl, uint64_t features) const
2873 {
2874 ENCODE_START(4, 3, bl);
2875 encode(stamp, bl);
2876 encode(metablob, bl, features);
2877 encode(base, bl);
2878 encode(bounds, bl);
2879 encode(target, bl);
2880 ENCODE_FINISH(bl);
2881 }
2882
2883 void EExport::decode(bufferlist::const_iterator &bl)
2884 {
2885 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2886 if (struct_v >= 2)
2887 decode(stamp, bl);
2888 decode(metablob, bl);
2889 decode(base, bl);
2890 decode(bounds, bl);
2891 if (struct_v >= 4)
2892 decode(target, bl);
2893 DECODE_FINISH(bl);
2894 }
2895
2896 void EExport::dump(Formatter *f) const
2897 {
2898 f->dump_float("stamp", (double)stamp);
2899 /*f->open_object_section("Metablob");
2900 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2901 f->close_section();*/
2902 f->dump_stream("base dirfrag") << base;
2903 f->open_array_section("bounds dirfrags");
2904 for (set<dirfrag_t>::const_iterator i = bounds.begin();
2905 i != bounds.end(); ++i) {
2906 f->dump_stream("dirfrag") << *i;
2907 }
2908 f->close_section(); // bounds dirfrags
2909 }
2910
2911 void EExport::generate_test_instances(std::list<EExport*>& ls)
2912 {
2913 EExport *sample = new EExport();
2914 ls.push_back(sample);
2915 }
2916
2917
2918 // -----------------------
2919 // EImportStart
2920
2921 void EImportStart::update_segment()
2922 {
2923 get_segment()->sessionmapv = cmapv;
2924 }
2925
2926 void EImportStart::replay(MDSRank *mds)
2927 {
2928 dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
2929 //metablob.print(*_dout);
2930 auto&& segment = get_segment();
2931 metablob.replay(mds, segment);
2932
2933 // put in ambiguous import list
2934 mds->mdcache->add_ambiguous_import(base, bounds);
2935
2936 // set auth partially to us so we don't trim it
2937 CDir *dir = mds->mdcache->get_dirfrag(base);
2938 ceph_assert(dir);
2939
2940 set<CDir*> realbounds;
2941 for (vector<dirfrag_t>::iterator p = bounds.begin();
2942 p != bounds.end();
2943 ++p) {
2944 CDir *bd = mds->mdcache->get_dirfrag(*p);
2945 ceph_assert(bd);
2946 if (!bd->is_subtree_root())
2947 bd->state_clear(CDir::STATE_AUTH);
2948 realbounds.insert(bd);
2949 }
2950
2951 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds,
2952 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2953
2954 // open client sessions?
2955 if (mds->sessionmap.get_version() >= cmapv) {
2956 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
2957 << " >= " << cmapv << ", noop" << dendl;
2958 } else {
2959 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
2960 << " < " << cmapv << dendl;
2961 map<client_t,entity_inst_t> cm;
2962 map<client_t,client_metadata_t> cmm;
2963 auto blp = client_map.cbegin();
2964 using ceph::decode;
2965 decode(cm, blp);
2966 if (!blp.end())
2967 decode(cmm, blp);
2968 mds->sessionmap.replay_open_sessions(cmapv, cm, cmm);
2969 }
2970 update_segment();
2971 }
2972
2973 void EImportStart::encode(bufferlist &bl, uint64_t features) const {
2974 ENCODE_START(4, 3, bl);
2975 encode(stamp, bl);
2976 encode(base, bl);
2977 encode(metablob, bl, features);
2978 encode(bounds, bl);
2979 encode(cmapv, bl);
2980 encode(client_map, bl);
2981 encode(from, bl);
2982 ENCODE_FINISH(bl);
2983 }
2984
2985 void EImportStart::decode(bufferlist::const_iterator &bl) {
2986 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2987 if (struct_v >= 2)
2988 decode(stamp, bl);
2989 decode(base, bl);
2990 decode(metablob, bl);
2991 decode(bounds, bl);
2992 decode(cmapv, bl);
2993 decode(client_map, bl);
2994 if (struct_v >= 4)
2995 decode(from, bl);
2996 DECODE_FINISH(bl);
2997 }
2998
2999 void EImportStart::dump(Formatter *f) const
3000 {
3001 f->dump_stream("base dirfrag") << base;
3002 f->open_array_section("boundary dirfrags");
3003 for (vector<dirfrag_t>::const_iterator iter = bounds.begin();
3004 iter != bounds.end(); ++iter) {
3005 f->dump_stream("frag") << *iter;
3006 }
3007 f->close_section();
3008 }
3009
3010 void EImportStart::generate_test_instances(std::list<EImportStart*>& ls)
3011 {
3012 ls.push_back(new EImportStart);
3013 }
3014
3015 // -----------------------
3016 // EImportFinish
3017
3018 void EImportFinish::replay(MDSRank *mds)
3019 {
3020 if (mds->mdcache->have_ambiguous_import(base)) {
3021 dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
3022 if (success) {
3023 mds->mdcache->finish_ambiguous_import(base);
3024 } else {
3025 CDir *dir = mds->mdcache->get_dirfrag(base);
3026 ceph_assert(dir);
3027 vector<dirfrag_t> bounds;
3028 mds->mdcache->get_ambiguous_import_bounds(base, bounds);
3029 mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF);
3030 mds->mdcache->cancel_ambiguous_import(dir);
3031 mds->mdcache->try_trim_non_auth_subtree(dir);
3032 }
3033 } else {
3034 // this shouldn't happen unless this is an old journal
3035 dout(10) << "EImportFinish.replay " << base << " success=" << success
3036 << " on subtree not marked as ambiguous"
3037 << dendl;
3038 mds->clog->error() << "failure replaying journal (EImportFinish)";
3039 mds->damaged();
3040 ceph_abort(); // Should be unreachable because damaged() calls respawn()
3041 }
3042 }
3043
3044 void EImportFinish::encode(bufferlist& bl, uint64_t features) const
3045 {
3046 ENCODE_START(3, 3, bl);
3047 encode(stamp, bl);
3048 encode(base, bl);
3049 encode(success, bl);
3050 ENCODE_FINISH(bl);
3051 }
3052
3053 void EImportFinish::decode(bufferlist::const_iterator &bl)
3054 {
3055 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3056 if (struct_v >= 2)
3057 decode(stamp, bl);
3058 decode(base, bl);
3059 decode(success, bl);
3060 DECODE_FINISH(bl);
3061 }
3062
3063 void EImportFinish::dump(Formatter *f) const
3064 {
3065 f->dump_stream("base dirfrag") << base;
3066 f->dump_string("success", success ? "true" : "false");
3067 }
3068 void EImportFinish::generate_test_instances(std::list<EImportFinish*>& ls)
3069 {
3070 ls.push_back(new EImportFinish);
3071 ls.push_back(new EImportFinish);
3072 ls.back()->success = true;
3073 }
3074
3075
3076 // ------------------------
3077 // EResetJournal
3078
3079 void EResetJournal::encode(bufferlist& bl, uint64_t features) const
3080 {
3081 ENCODE_START(2, 2, bl);
3082 encode(stamp, bl);
3083 ENCODE_FINISH(bl);
3084 }
3085
3086 void EResetJournal::decode(bufferlist::const_iterator &bl)
3087 {
3088 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
3089 decode(stamp, bl);
3090 DECODE_FINISH(bl);
3091 }
3092
3093 void EResetJournal::dump(Formatter *f) const
3094 {
3095 f->dump_stream("timestamp") << stamp;
3096 }
3097
3098 void EResetJournal::generate_test_instances(std::list<EResetJournal*>& ls)
3099 {
3100 ls.push_back(new EResetJournal());
3101 }
3102
3103 void EResetJournal::replay(MDSRank *mds)
3104 {
3105 dout(1) << "EResetJournal" << dendl;
3106
3107 mds->sessionmap.wipe();
3108 mds->inotable->replay_reset();
3109
3110 if (mds->mdsmap->get_root() == mds->get_nodeid()) {
3111 CDir *rootdir = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t());
3112 mds->mdcache->adjust_subtree_auth(rootdir, mds->get_nodeid());
3113 }
3114
3115 CDir *mydir = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t());
3116 mds->mdcache->adjust_subtree_auth(mydir, mds->get_nodeid());
3117
3118 mds->mdcache->recalc_auth_bits(true);
3119
3120 mds->mdcache->show_subtrees();
3121 }
3122
3123
3124 void ENoOp::encode(bufferlist &bl, uint64_t features) const
3125 {
3126 ENCODE_START(2, 2, bl);
3127 encode(pad_size, bl);
3128 uint8_t const pad = 0xff;
3129 for (unsigned int i = 0; i < pad_size; ++i) {
3130 encode(pad, bl);
3131 }
3132 ENCODE_FINISH(bl);
3133 }
3134
3135
3136 void ENoOp::decode(bufferlist::const_iterator &bl)
3137 {
3138 DECODE_START(2, bl);
3139 decode(pad_size, bl);
3140 if (bl.get_remaining() != pad_size) {
3141 // This is spiritually an assertion, but expressing in a way that will let
3142 // journal debug tools catch it and recognise a malformed entry.
3143 throw buffer::end_of_buffer();
3144 } else {
3145 bl += pad_size;
3146 }
3147 DECODE_FINISH(bl);
3148 }
3149
3150
3151 void ENoOp::replay(MDSRank *mds)
3152 {
3153 dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
3154 }
3155
3156 /**
3157 * If re-formatting an old journal that used absolute log position
3158 * references as segment sequence numbers, use this function to update
3159 * it.
3160 *
3161 * @param mds
3162 * MDSRank instance, just used for logging
3163 * @param old_to_new
3164 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3165 *
3166 * @return
3167 * True if the event was modified.
3168 */
3169 bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
3170 std::map<LogSegment::seq_t, LogSegment::seq_t> const &old_to_new)
3171 {
3172 bool modified = false;
3173 map<inodeno_t, LogSegment::seq_t> new_trunc_finish;
3174 for (const auto& p : truncate_finish) {
3175 auto q = old_to_new.find(p.second);
3176 if (q != old_to_new.end()) {
3177 dout(20) << __func__ << " applying segment seq mapping "
3178 << p.second << " -> " << q->second << dendl;
3179 new_trunc_finish.emplace(p.first, q->second);
3180 modified = true;
3181 } else {
3182 dout(20) << __func__ << " no segment seq mapping found for "
3183 << p.second << dendl;
3184 new_trunc_finish.insert(p);
3185 }
3186 }
3187 truncate_finish.swap(new_trunc_finish);
3188
3189 return modified;
3190 }