]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/journal.cc
bump version to 12.2.5-pve1
[ceph.git] / ceph / src / mds / journal.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
20
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
24
25 #include "events/EUpdate.h"
26 #include "events/ESlaveUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
29
30 #include "events/EExport.h"
31 #include "events/EImportStart.h"
32 #include "events/EImportFinish.h"
33 #include "events/EFragment.h"
34
35 #include "events/ETableClient.h"
36 #include "events/ETableServer.h"
37
38 #include "include/stringify.h"
39
40 #include "LogSegment.h"
41
42 #include "MDSRank.h"
43 #include "MDLog.h"
44 #include "MDCache.h"
45 #include "Server.h"
46 #include "Migrator.h"
47 #include "Mutation.h"
48
49 #include "InoTable.h"
50 #include "MDSTableClient.h"
51 #include "MDSTableServer.h"
52
53 #include "Locker.h"
54
55 #define dout_context g_ceph_context
56 #define dout_subsys ceph_subsys_mds
57 #undef dout_prefix
58 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
59
60
61 // -----------------------
62 // LogSegment
63
64 void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
65 {
66 set<CDir*> commit;
67
68 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl;
69
70 assert(g_conf->mds_kill_journal_expire_at != 1);
71
72 // commit dirs
73 for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) {
74 dout(20) << " new_dirfrag " << **p << dendl;
75 assert((*p)->is_auth());
76 commit.insert(*p);
77 }
78 for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) {
79 dout(20) << " dirty_dirfrag " << **p << dendl;
80 assert((*p)->is_auth());
81 commit.insert(*p);
82 }
83 for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) {
84 dout(20) << " dirty_dentry " << **p << dendl;
85 assert((*p)->is_auth());
86 commit.insert((*p)->get_dir());
87 }
88 for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) {
89 dout(20) << " dirty_inode " << **p << dendl;
90 assert((*p)->is_auth());
91 if ((*p)->is_base()) {
92 (*p)->store(gather_bld.new_sub());
93 } else
94 commit.insert((*p)->get_parent_dn()->get_dir());
95 }
96
97 if (!commit.empty()) {
98 for (set<CDir*>::iterator p = commit.begin();
99 p != commit.end();
100 ++p) {
101 CDir *dir = *p;
102 assert(dir->is_auth());
103 if (dir->can_auth_pin()) {
104 dout(15) << "try_to_expire committing " << *dir << dendl;
105 dir->commit(0, gather_bld.new_sub(), false, op_prio);
106 } else {
107 dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
108 dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
109 }
110 }
111 }
112
113 // master ops with possibly uncommitted slaves
114 for (set<metareqid_t>::iterator p = uncommitted_masters.begin();
115 p != uncommitted_masters.end();
116 ++p) {
117 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p << dendl;
118 mds->mdcache->wait_for_uncommitted_master(*p, gather_bld.new_sub());
119 }
120
121 // uncommitted fragments
122 for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin();
123 p != uncommitted_fragments.end();
124 ++p) {
125 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl;
126 mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub());
127 }
128
129 // nudge scatterlocks
130 for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
131 CInode *in = *p;
132 dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
133 mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub());
134 }
135 for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) {
136 CInode *in = *p;
137 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl;
138 mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub());
139 }
140 for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) {
141 CInode *in = *p;
142 dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl;
143 mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub());
144 }
145
146 assert(g_conf->mds_kill_journal_expire_at != 2);
147
148 // open files and snap inodes
149 if (!open_files.empty()) {
150 assert(!mds->mdlog->is_capped()); // hmm FIXME
151 EOpen *le = 0;
152 LogSegment *ls = mds->mdlog->get_current_segment();
153 assert(ls != this);
154 elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file));
155 while (!p.end()) {
156 CInode *in = *p;
157 ++p;
158 if (in->last == CEPH_NOSNAP && in->is_auth() &&
159 !in->is_ambiguous_auth() && in->is_any_caps()) {
160 if (in->is_any_caps_wanted()) {
161 dout(20) << "try_to_expire requeueing open file " << *in << dendl;
162 if (!le) {
163 le = new EOpen(mds->mdlog);
164 mds->mdlog->start_entry(le);
165 }
166 le->add_clean_inode(in);
167 ls->open_files.push_back(&in->item_open_file);
168 } else {
169 // drop inodes that aren't wanted
170 dout(20) << "try_to_expire not requeueing and delisting unwanted file " << *in << dendl;
171 in->item_open_file.remove_myself();
172 }
173 } else if (in->last != CEPH_NOSNAP && !in->client_snap_caps.empty()) {
174 // journal snap inodes that need flush. This simplify the mds failover hanlding
175 dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
176 if (!le) {
177 le = new EOpen(mds->mdlog);
178 mds->mdlog->start_entry(le);
179 }
180 le->add_clean_inode(in);
181 ls->open_files.push_back(&in->item_open_file);
182 } else {
183 /*
184 * we can get a capless inode here if we replay an open file, the client fails to
185 * reconnect it, but does REPLAY an open request (that adds it to the logseg). AFAICS
186 * it's ok for the client to replay an open on a file it doesn't have in it's cache
187 * anymore.
188 *
189 * this makes the mds less sensitive to strict open_file consistency, although it does
190 * make it easier to miss subtle problems.
191 */
192 dout(20) << "try_to_expire not requeueing and delisting capless file " << *in << dendl;
193 in->item_open_file.remove_myself();
194 }
195 }
196 if (le) {
197 mds->mdlog->submit_entry(le);
198 mds->mdlog->wait_for_safe(gather_bld.new_sub());
199 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
200 }
201 }
202
203 assert(g_conf->mds_kill_journal_expire_at != 3);
204
205 // backtraces to be stored/updated
206 for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
207 CInode *in = *p;
208 assert(in->is_auth());
209 if (in->can_auth_pin()) {
210 dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
211 in->store_backtrace(gather_bld.new_sub(), op_prio);
212 } else {
213 dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
214 in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
215 }
216 }
217
218 assert(g_conf->mds_kill_journal_expire_at != 4);
219
220 // slave updates
221 for (elist<MDSlaveUpdate*>::iterator p = slave_updates.begin(member_offset(MDSlaveUpdate,
222 item));
223 !p.end(); ++p) {
224 MDSlaveUpdate *su = *p;
225 dout(10) << "try_to_expire waiting on slave update " << su << dendl;
226 assert(su->waiter == 0);
227 su->waiter = gather_bld.new_sub();
228 }
229
230 // idalloc
231 if (inotablev > mds->inotable->get_committed_version()) {
232 dout(10) << "try_to_expire saving inotable table, need " << inotablev
233 << ", committed is " << mds->inotable->get_committed_version()
234 << " (" << mds->inotable->get_committing_version() << ")"
235 << dendl;
236 mds->inotable->save(gather_bld.new_sub(), inotablev);
237 }
238
239 // sessionmap
240 if (sessionmapv > mds->sessionmap.get_committed()) {
241 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
242 << ", committed is " << mds->sessionmap.get_committed()
243 << " (" << mds->sessionmap.get_committing() << ")"
244 << dendl;
245 mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
246 }
247
248 // updates to sessions for completed_requests
249 mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
250 touched_sessions.clear();
251
252 // pending commit atids
253 for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
254 p != pending_commit_tids.end();
255 ++p) {
256 MDSTableClient *client = mds->get_table_client(p->first);
257 assert(client);
258 for (ceph::unordered_set<version_t>::iterator q = p->second.begin();
259 q != p->second.end();
260 ++q) {
261 dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q
262 << " pending commit (not yet acked), waiting" << dendl;
263 assert(!client->has_committed(*q));
264 client->wait_for_ack(*q, gather_bld.new_sub());
265 }
266 }
267
268 // table servers
269 for (map<int, version_t>::iterator p = tablev.begin();
270 p != tablev.end();
271 ++p) {
272 MDSTableServer *server = mds->get_table_server(p->first);
273 assert(server);
274 if (p->second > server->get_committed_version()) {
275 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first)
276 << " to save, need " << p->second << dendl;
277 server->save(gather_bld.new_sub());
278 }
279 }
280
281 // truncating
282 for (set<CInode*>::iterator p = truncating_inodes.begin();
283 p != truncating_inodes.end();
284 ++p) {
285 dout(10) << "try_to_expire waiting for truncate of " << **p << dendl;
286 (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub());
287 }
288
289 if (gather_bld.has_subs()) {
290 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl;
291 mds->mdlog->flush();
292 } else {
293 assert(g_conf->mds_kill_journal_expire_at != 5);
294 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl;
295 }
296 }
297
298
299 // -----------------------
300 // EMetaBlob
301
302 EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0),
303 inotablev(0), sessionmapv(0), allocated_ino(0),
304 last_subtree_map(0), event_seq(0)
305 { }
306
307 void EMetaBlob::add_dir_context(CDir *dir, int mode)
308 {
309 MDSRank *mds = dir->cache->mds;
310
311 list<CDentry*> parents;
312
313 // it may be okay not to include the maybe items, if
314 // - we journaled the maybe child inode in this segment
315 // - that subtree turns out to be unambiguously auth
316 list<CDentry*> maybe;
317 bool maybenot = false;
318
319 while (true) {
320 // already have this dir? (we must always add in order)
321 if (lump_map.count(dir->dirfrag())) {
322 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl;
323 break;
324 }
325
326 // stop at root/stray
327 CInode *diri = dir->get_inode();
328 CDentry *parent = diri->get_projected_parent_dn();
329
330 if (mode == TO_AUTH_SUBTREE_ROOT) {
331 // subtree root?
332 if (dir->is_subtree_root()) {
333 // match logic in MDCache::create_subtree_map()
334 if (dir->get_dir_auth().first == mds->get_nodeid()) {
335 mds_authority_t parent_auth = parent ? parent->authority() : CDIR_AUTH_UNDEF;
336 if (parent_auth.first == dir->get_dir_auth().first) {
337 if (parent_auth.second == CDIR_AUTH_UNKNOWN &&
338 !dir->is_ambiguous_dir_auth() &&
339 !dir->state_test(CDir::STATE_EXPORTBOUND) &&
340 !dir->state_test(CDir::STATE_AUXSUBTREE) &&
341 !diri->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
342 dout(0) << "EMetaBlob::add_dir_context unexpected subtree " << *dir << dendl;
343 assert(0);
344 }
345 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") ambiguous or transient subtree " << dendl;
346 } else {
347 // it's an auth subtree, we don't need maybe (if any), and we're done.
348 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
349 << " at " << *dir << dendl;
350 maybe.clear();
351 break;
352 }
353 } else {
354 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
355 << " at " << *dir << dendl;
356 // we need the maybe list after all!
357 parents.splice(parents.begin(), maybe);
358 maybenot = false;
359 }
360 }
361
362 // was the inode journaled in this blob?
363 if (event_seq && diri->last_journaled == event_seq) {
364 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl;
365 break;
366 }
367
368 // have we journaled this inode since the last subtree map?
369 if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) {
370 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment ("
371 << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag "
372 << *diri << dendl;
373 maybenot = true;
374 }
375 }
376
377 if (!parent)
378 break;
379
380 if (maybenot) {
381 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl;
382 maybe.push_front(parent);
383 } else {
384 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl;
385 parents.push_front(parent);
386 }
387
388 dir = parent->get_dir();
389 }
390
391 parents.splice(parents.begin(), maybe);
392
393 dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
394 for (list<CDentry*>::iterator p = parents.begin(); p != parents.end(); ++p) {
395 assert((*p)->get_projected_linkage()->is_primary());
396 add_dentry(*p, false);
397 }
398 }
399
400 void EMetaBlob::update_segment(LogSegment *ls)
401 {
402 // dirty inode mtimes
403 // -> handled directly by Server.cc, replay()
404
405 // alloc table update?
406 if (inotablev)
407 ls->inotablev = inotablev;
408 if (sessionmapv)
409 ls->sessionmapv = sessionmapv;
410
411 // truncated inodes
412 // -> handled directly by Server.cc
413
414 // client requests
415 // note the newest request per client
416 //if (!client_reqs.empty())
417 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
418 }
419
420 // EMetaBlob::fullbit
421
422 void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const {
423 ENCODE_START(8, 5, bl);
424 ::encode(dn, bl);
425 ::encode(dnfirst, bl);
426 ::encode(dnlast, bl);
427 ::encode(dnv, bl);
428 ::encode(inode, bl, features);
429 ::encode(xattrs, bl);
430 if (inode.is_symlink())
431 ::encode(symlink, bl);
432 if (inode.is_dir()) {
433 ::encode(dirfragtree, bl);
434 ::encode(snapbl, bl);
435 }
436 ::encode(state, bl);
437 if (old_inodes.empty()) {
438 ::encode(false, bl);
439 } else {
440 ::encode(true, bl);
441 ::encode(old_inodes, bl, features);
442 }
443 if (!inode.is_dir())
444 ::encode(snapbl, bl);
445 ::encode(oldest_snap, bl);
446 ENCODE_FINISH(bl);
447 }
448
449 void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) {
450 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
451 ::decode(dn, bl);
452 ::decode(dnfirst, bl);
453 ::decode(dnlast, bl);
454 ::decode(dnv, bl);
455 ::decode(inode, bl);
456 ::decode(xattrs, bl);
457 if (inode.is_symlink())
458 ::decode(symlink, bl);
459 if (inode.is_dir()) {
460 ::decode(dirfragtree, bl);
461 ::decode(snapbl, bl);
462 if ((struct_v == 2) || (struct_v == 3)) {
463 bool dir_layout_exists;
464 ::decode(dir_layout_exists, bl);
465 if (dir_layout_exists) {
466 __u8 dir_struct_v;
467 ::decode(dir_struct_v, bl); // default_file_layout version
468 ::decode(inode.layout, bl); // and actual layout, that we care about
469 }
470 }
471 }
472 if (struct_v >= 6) {
473 ::decode(state, bl);
474 } else {
475 bool dirty;
476 ::decode(dirty, bl);
477 state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0;
478 }
479
480 if (struct_v >= 3) {
481 bool old_inodes_present;
482 ::decode(old_inodes_present, bl);
483 if (old_inodes_present) {
484 ::decode(old_inodes, bl);
485 }
486 }
487 if (!inode.is_dir()) {
488 if (struct_v >= 7)
489 ::decode(snapbl, bl);
490 }
491 if (struct_v >= 8)
492 ::decode(oldest_snap, bl);
493 else
494 oldest_snap = CEPH_NOSNAP;
495
496 DECODE_FINISH(bl);
497 }
498
499 void EMetaBlob::fullbit::dump(Formatter *f) const
500 {
501 f->dump_string("dentry", dn);
502 f->dump_stream("snapid.first") << dnfirst;
503 f->dump_stream("snapid.last") << dnlast;
504 f->dump_int("dentry version", dnv);
505 f->open_object_section("inode");
506 inode.dump(f);
507 f->close_section(); // inode
508 f->open_object_section("xattrs");
509 for (const auto &p : xattrs) {
510 std::string s(p.second.c_str(), p.second.length());
511 f->dump_string(p.first.c_str(), s);
512 }
513 f->close_section(); // xattrs
514 if (inode.is_symlink()) {
515 f->dump_string("symlink", symlink);
516 }
517 if (inode.is_dir()) {
518 f->dump_stream("frag tree") << dirfragtree;
519 f->dump_string("has_snapbl", snapbl.length() ? "true" : "false");
520 if (inode.has_layout()) {
521 f->open_object_section("file layout policy");
522 // FIXME
523 f->dump_string("layout", "the layout exists");
524 f->close_section(); // file layout policy
525 }
526 }
527 f->dump_string("state", state_string());
528 if (!old_inodes.empty()) {
529 f->open_array_section("old inodes");
530 for (const auto &p : old_inodes) {
531 f->open_object_section("inode");
532 f->dump_int("snapid", p.first);
533 p.second.dump(f);
534 f->close_section(); // inode
535 }
536 f->close_section(); // old inodes
537 }
538 }
539
540 void EMetaBlob::fullbit::generate_test_instances(list<EMetaBlob::fullbit*>& ls)
541 {
542 CInode::mempool_inode inode;
543 fragtree_t fragtree;
544 CInode::mempool_xattr_map empty_xattrs;
545 bufferlist empty_snapbl;
546 fullbit *sample = new fullbit("/testdn", 0, 0, 0,
547 inode, fragtree, empty_xattrs, "", 0, empty_snapbl,
548 false, NULL);
549 ls.push_back(sample);
550 }
551
552 void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
553 {
554 in->inode = inode;
555 in->xattrs = xattrs;
556 in->maybe_export_pin();
557 if (in->inode.is_dir()) {
558 if (!(in->dirfragtree == dirfragtree)) {
559 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
560 << dirfragtree << " on " << *in << dendl;
561 in->dirfragtree = dirfragtree;
562 in->force_dirfrags();
563 if (in->has_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) {
564 list<CDir*> ls;
565 in->get_nested_dirfrags(ls);
566 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
567 CDir *dir = *p;
568 if (dir->get_num_any() == 0 &&
569 mds->mdcache->can_trim_non_auth_dirfrag(dir)) {
570 dout(10) << " closing empty non-auth dirfrag " << *dir << dendl;
571 in->close_dirfrag(dir->get_frag());
572 }
573 }
574 }
575 }
576 } else if (in->inode.is_symlink()) {
577 in->symlink = mempool::mds_co::string(boost::string_view(symlink));
578 }
579 in->old_inodes = old_inodes;
580 if (!in->old_inodes.empty()) {
581 snapid_t min_first = in->old_inodes.rbegin()->first + 1;
582 if (min_first > in->first)
583 in->first = min_first;
584 }
585
586 /*
587 * we can do this before linking hte inode bc the split_at would
588 * be a no-op.. we have no children (namely open snaprealms) to
589 * divy up
590 */
591 in->oldest_snap = oldest_snap;
592 in->decode_snap_blob(snapbl);
593
594 /*
595 * In case there was anything malformed in the journal that we are
596 * replaying, do sanity checks on the inodes we're replaying and
597 * go damaged instead of letting any trash into a live cache
598 */
599 if (in->is_file()) {
600 // Files must have valid layouts with a pool set
601 if (in->inode.layout.pool_id == -1 || !in->inode.layout.is_valid()) {
602 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
603 << ": " << in->inode.layout << dendl;
604 std::ostringstream oss;
605 oss << "Invalid layout for inode 0x" << std::hex << in->inode.ino
606 << std::dec << " in journal";
607 mds->clog->error() << oss.str();
608 mds->damaged();
609 ceph_abort(); // Should be unreachable because damaged() calls respawn()
610 }
611 }
612 }
613
614 // EMetaBlob::remotebit
615
616 void EMetaBlob::remotebit::encode(bufferlist& bl) const
617 {
618 ENCODE_START(2, 2, bl);
619 ::encode(dn, bl);
620 ::encode(dnfirst, bl);
621 ::encode(dnlast, bl);
622 ::encode(dnv, bl);
623 ::encode(ino, bl);
624 ::encode(d_type, bl);
625 ::encode(dirty, bl);
626 ENCODE_FINISH(bl);
627 }
628
629 void EMetaBlob::remotebit::decode(bufferlist::iterator &bl)
630 {
631 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
632 ::decode(dn, bl);
633 ::decode(dnfirst, bl);
634 ::decode(dnlast, bl);
635 ::decode(dnv, bl);
636 ::decode(ino, bl);
637 ::decode(d_type, bl);
638 ::decode(dirty, bl);
639 DECODE_FINISH(bl);
640 }
641
642 void EMetaBlob::remotebit::dump(Formatter *f) const
643 {
644 f->dump_string("dentry", dn);
645 f->dump_int("snapid.first", dnfirst);
646 f->dump_int("snapid.last", dnlast);
647 f->dump_int("dentry version", dnv);
648 f->dump_int("inodeno", ino);
649 uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries
650 string type_string;
651 switch(type) {
652 case S_IFREG:
653 type_string = "file"; break;
654 case S_IFLNK:
655 type_string = "symlink"; break;
656 case S_IFDIR:
657 type_string = "directory"; break;
658 case S_IFIFO:
659 type_string = "fifo"; break;
660 case S_IFCHR:
661 type_string = "chr"; break;
662 case S_IFBLK:
663 type_string = "blk"; break;
664 case S_IFSOCK:
665 type_string = "sock"; break;
666 default:
667 assert (0 == "unknown d_type!");
668 }
669 f->dump_string("d_type", type_string);
670 f->dump_string("dirty", dirty ? "true" : "false");
671 }
672
673 void EMetaBlob::remotebit::
674 generate_test_instances(list<EMetaBlob::remotebit*>& ls)
675 {
676 remotebit *remote = new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG), false);
677 ls.push_back(remote);
678 }
679
680 // EMetaBlob::nullbit
681
682 void EMetaBlob::nullbit::encode(bufferlist& bl) const
683 {
684 ENCODE_START(2, 2, bl);
685 ::encode(dn, bl);
686 ::encode(dnfirst, bl);
687 ::encode(dnlast, bl);
688 ::encode(dnv, bl);
689 ::encode(dirty, bl);
690 ENCODE_FINISH(bl);
691 }
692
693 void EMetaBlob::nullbit::decode(bufferlist::iterator &bl)
694 {
695 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
696 ::decode(dn, bl);
697 ::decode(dnfirst, bl);
698 ::decode(dnlast, bl);
699 ::decode(dnv, bl);
700 ::decode(dirty, bl);
701 DECODE_FINISH(bl);
702 }
703
704 void EMetaBlob::nullbit::dump(Formatter *f) const
705 {
706 f->dump_string("dentry", dn);
707 f->dump_int("snapid.first", dnfirst);
708 f->dump_int("snapid.last", dnlast);
709 f->dump_int("dentry version", dnv);
710 f->dump_string("dirty", dirty ? "true" : "false");
711 }
712
713 void EMetaBlob::nullbit::generate_test_instances(list<nullbit*>& ls)
714 {
715 nullbit *sample = new nullbit("/test/dentry", 0, 10, 15, false);
716 nullbit *sample2 = new nullbit("/test/dirty", 10, 20, 25, true);
717 ls.push_back(sample);
718 ls.push_back(sample2);
719 }
720
721 // EMetaBlob::dirlump
722
723 void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const
724 {
725 ENCODE_START(2, 2, bl);
726 ::encode(fnode, bl);
727 ::encode(state, bl);
728 ::encode(nfull, bl);
729 ::encode(nremote, bl);
730 ::encode(nnull, bl);
731 _encode_bits(features);
732 ::encode(dnbl, bl);
733 ENCODE_FINISH(bl);
734 }
735
736 void EMetaBlob::dirlump::decode(bufferlist::iterator &bl)
737 {
738 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
739 ::decode(fnode, bl);
740 ::decode(state, bl);
741 ::decode(nfull, bl);
742 ::decode(nremote, bl);
743 ::decode(nnull, bl);
744 ::decode(dnbl, bl);
745 dn_decoded = false; // don't decode bits unless we need them.
746 DECODE_FINISH(bl);
747 }
748
749 void EMetaBlob::dirlump::dump(Formatter *f) const
750 {
751 if (!dn_decoded) {
752 dirlump *me = const_cast<dirlump*>(this);
753 me->_decode_bits();
754 }
755 f->open_object_section("fnode");
756 fnode.dump(f);
757 f->close_section(); // fnode
758 f->dump_string("state", state_string());
759 f->dump_int("nfull", nfull);
760 f->dump_int("nremote", nremote);
761 f->dump_int("nnull", nnull);
762
763 f->open_array_section("full bits");
764 for (list<ceph::shared_ptr<fullbit> >::const_iterator
765 iter = dfull.begin(); iter != dfull.end(); ++iter) {
766 f->open_object_section("fullbit");
767 (*iter)->dump(f);
768 f->close_section(); // fullbit
769 }
770 f->close_section(); // full bits
771 f->open_array_section("remote bits");
772 for (list<remotebit>::const_iterator
773 iter = dremote.begin(); iter != dremote.end(); ++iter) {
774 f->open_object_section("remotebit");
775 (*iter).dump(f);
776 f->close_section(); // remotebit
777 }
778 f->close_section(); // remote bits
779 f->open_array_section("null bits");
780 for (list<nullbit>::const_iterator
781 iter = dnull.begin(); iter != dnull.end(); ++iter) {
782 f->open_object_section("null bit");
783 (*iter).dump(f);
784 f->close_section(); // null bit
785 }
786 f->close_section(); // null bits
787 }
788
789 void EMetaBlob::dirlump::generate_test_instances(list<dirlump*>& ls)
790 {
791 ls.push_back(new dirlump());
792 }
793
794 /**
795 * EMetaBlob proper
796 */
797 void EMetaBlob::encode(bufferlist& bl, uint64_t features) const
798 {
799 ENCODE_START(8, 5, bl);
800 ::encode(lump_order, bl);
801 ::encode(lump_map, bl, features);
802 ::encode(roots, bl, features);
803 ::encode(table_tids, bl);
804 ::encode(opened_ino, bl);
805 ::encode(allocated_ino, bl);
806 ::encode(used_preallocated_ino, bl);
807 ::encode(preallocated_inos, bl);
808 ::encode(client_name, bl);
809 ::encode(inotablev, bl);
810 ::encode(sessionmapv, bl);
811 ::encode(truncate_start, bl);
812 ::encode(truncate_finish, bl);
813 ::encode(destroyed_inodes, bl);
814 ::encode(client_reqs, bl);
815 ::encode(renamed_dirino, bl);
816 ::encode(renamed_dir_frags, bl);
817 {
818 // make MDSRank use v6 format happy
819 int64_t i = -1;
820 bool b = false;
821 ::encode(i, bl);
822 ::encode(b, bl);
823 }
824 ::encode(client_flushes, bl);
825 ENCODE_FINISH(bl);
826 }
827 void EMetaBlob::decode(bufferlist::iterator &bl)
828 {
829 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
830 ::decode(lump_order, bl);
831 ::decode(lump_map, bl);
832 if (struct_v >= 4) {
833 ::decode(roots, bl);
834 } else {
835 bufferlist rootbl;
836 ::decode(rootbl, bl);
837 if (rootbl.length()) {
838 bufferlist::iterator p = rootbl.begin();
839 roots.push_back(ceph::shared_ptr<fullbit>(new fullbit(p)));
840 }
841 }
842 ::decode(table_tids, bl);
843 ::decode(opened_ino, bl);
844 ::decode(allocated_ino, bl);
845 ::decode(used_preallocated_ino, bl);
846 ::decode(preallocated_inos, bl);
847 ::decode(client_name, bl);
848 ::decode(inotablev, bl);
849 ::decode(sessionmapv, bl);
850 ::decode(truncate_start, bl);
851 ::decode(truncate_finish, bl);
852 ::decode(destroyed_inodes, bl);
853 if (struct_v >= 2) {
854 ::decode(client_reqs, bl);
855 } else {
856 list<metareqid_t> r;
857 ::decode(r, bl);
858 while (!r.empty()) {
859 client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0));
860 r.pop_front();
861 }
862 }
863 if (struct_v >= 3) {
864 ::decode(renamed_dirino, bl);
865 ::decode(renamed_dir_frags, bl);
866 }
867 if (struct_v >= 6) {
868 // ignore
869 int64_t i;
870 bool b;
871 ::decode(i, bl);
872 ::decode(b, bl);
873 }
874 if (struct_v >= 8) {
875 ::decode(client_flushes, bl);
876 }
877 DECODE_FINISH(bl);
878 }
879
880
881 /**
882 * Get all inodes touched by this metablob. Includes the 'bits' within
883 * dirlumps, and the inodes of the dirs themselves.
884 */
885 void EMetaBlob::get_inodes(
886 std::set<inodeno_t> &inodes) const
887 {
888 // For all dirlumps in this metablob
889 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
890 // Record inode of dirlump
891 inodeno_t const dir_ino = i->first.ino;
892 inodes.insert(dir_ino);
893
894 // Decode dirlump bits
895 dirlump const &dl = i->second;
896 dl._decode_bits();
897
898 // Record inodes of fullbits
899 list<ceph::shared_ptr<fullbit> > const &fb_list = dl.get_dfull();
900 for (list<ceph::shared_ptr<fullbit> >::const_iterator
901 iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
902 inodes.insert((*iter)->inode.ino);
903 }
904
905 // Record inodes of remotebits
906 list<remotebit> const &rb_list = dl.get_dremote();
907 for (list<remotebit>::const_iterator
908 iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
909 inodes.insert(iter->ino);
910 }
911 }
912 }
913
914
915 /**
916 * Get a map of dirfrag to set of dentries in that dirfrag which are
917 * touched in this operation.
918 */
919 void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const
920 {
921 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
922 dirlump const &dl = i->second;
923 dirfrag_t const &df = i->first;
924
925 // Get all bits
926 dl._decode_bits();
927 list<ceph::shared_ptr<fullbit> > const &fb_list = dl.get_dfull();
928 list<nullbit> const &nb_list = dl.get_dnull();
929 list<remotebit> const &rb_list = dl.get_dremote();
930
931 // For all bits, store dentry
932 for (list<ceph::shared_ptr<fullbit> >::const_iterator
933 iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
934 dentries[df].insert((*iter)->dn);
935
936 }
937 for (list<nullbit>::const_iterator
938 iter = nb_list.begin(); iter != nb_list.end(); ++iter) {
939 dentries[df].insert(iter->dn);
940 }
941 for (list<remotebit>::const_iterator
942 iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
943 dentries[df].insert(iter->dn);
944 }
945 }
946 }
947
948
949
950 /**
951 * Calculate all paths that we can infer are touched by this metablob. Only uses
952 * information local to this metablob so it may only be the path within the
953 * subtree.
954 */
955 void EMetaBlob::get_paths(
956 std::vector<std::string> &paths) const
957 {
958 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
959 typedef std::pair<inodeno_t, std::string> Location;
960
961 // Whenever we see a dentry within a dirlump, we remember it as a child of
962 // the dirlump's inode
963 std::map<inodeno_t, std::list<std::string> > children;
964
965 // Whenever we see a location for an inode, remember it: this allows us to
966 // build a path given an inode
967 std::map<inodeno_t, Location> ino_locations;
968
969 // Special case: operations on root inode populate roots but not dirlumps
970 if (lump_map.empty() && !roots.empty()) {
971 paths.push_back("/");
972 return;
973 }
974
975 // First pass
976 // ==========
977 // Build a tiny local metadata cache for the path structure in this metablob
978 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
979 inodeno_t const dir_ino = i->first.ino;
980 dirlump const &dl = i->second;
981 dl._decode_bits();
982
983 list<ceph::shared_ptr<fullbit> > const &fb_list = dl.get_dfull();
984 list<nullbit> const &nb_list = dl.get_dnull();
985 list<remotebit> const &rb_list = dl.get_dremote();
986
987 for (list<ceph::shared_ptr<fullbit> >::const_iterator
988 iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
989 boost::string_view dentry = (*iter)->dn;
990 children[dir_ino].emplace_back(dentry);
991 ino_locations[(*iter)->inode.ino] = Location(dir_ino, std::string(dentry));
992 }
993
994 for (list<nullbit>::const_iterator
995 iter = nb_list.begin(); iter != nb_list.end(); ++iter) {
996 boost::string_view dentry = iter->dn;
997 children[dir_ino].emplace_back(dentry);
998 }
999
1000 for (list<remotebit>::const_iterator
1001 iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
1002 boost::string_view dentry = iter->dn;
1003 children[dir_ino].emplace_back(dentry);
1004 }
1005 }
1006
1007 std::vector<Location> leaf_locations;
1008
1009 // Second pass
1010 // ===========
1011 // Output paths for all childless nodes in the metablob
1012 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
1013 inodeno_t const dir_ino = i->first.ino;
1014 dirlump const &dl = i->second;
1015 dl._decode_bits();
1016
1017 list<ceph::shared_ptr<fullbit> > const &fb_list = dl.get_dfull();
1018 for (list<ceph::shared_ptr<fullbit> >::const_iterator
1019 iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
1020 std::string dentry((*iter)->dn);
1021 children[dir_ino].push_back(dentry);
1022 ino_locations[(*iter)->inode.ino] = Location(dir_ino, std::string(dentry));
1023 if (children.find((*iter)->inode.ino) == children.end()) {
1024 leaf_locations.push_back(Location(dir_ino, std::string(dentry)));
1025
1026 }
1027 }
1028
1029 list<nullbit> const &nb_list = dl.get_dnull();
1030 for (list<nullbit>::const_iterator
1031 iter = nb_list.begin(); iter != nb_list.end(); ++iter) {
1032 boost::string_view dentry = iter->dn;
1033 leaf_locations.push_back(Location(dir_ino, std::string(dentry)));
1034 }
1035
1036 list<remotebit> const &rb_list = dl.get_dremote();
1037 for (list<remotebit>::const_iterator
1038 iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
1039 boost::string_view dentry = iter->dn;
1040 leaf_locations.push_back(Location(dir_ino, std::string(dentry)));
1041 }
1042 }
1043
1044 // For all the leaf locations identified, generate paths
1045 for (std::vector<Location>::iterator i = leaf_locations.begin(); i != leaf_locations.end(); ++i) {
1046 Location const &loc = *i;
1047 std::string path = loc.second;
1048 inodeno_t ino = loc.first;
1049 while(ino_locations.find(ino) != ino_locations.end()) {
1050 Location const &loc = ino_locations[ino];
1051 if (!path.empty()) {
1052 path = loc.second + "/" + path;
1053 } else {
1054 path = loc.second + path;
1055 }
1056 ino = loc.first;
1057 }
1058
1059 paths.push_back(path);
1060 }
1061 }
1062
1063
1064 void EMetaBlob::dump(Formatter *f) const
1065 {
1066 f->open_array_section("lumps");
1067 for (list<dirfrag_t>::const_iterator i = lump_order.begin();
1068 i != lump_order.end(); ++i) {
1069 f->open_object_section("lump");
1070 f->open_object_section("dirfrag");
1071 f->dump_stream("dirfrag") << *i;
1072 f->close_section(); // dirfrag
1073 f->open_object_section("dirlump");
1074 lump_map.at(*i).dump(f);
1075 f->close_section(); // dirlump
1076 f->close_section(); // lump
1077 }
1078 f->close_section(); // lumps
1079
1080 f->open_array_section("roots");
1081 for (list<ceph::shared_ptr<fullbit> >::const_iterator i = roots.begin();
1082 i != roots.end(); ++i) {
1083 f->open_object_section("root");
1084 (*i)->dump(f);
1085 f->close_section(); // root
1086 }
1087 f->close_section(); // roots
1088
1089 f->open_array_section("tableclient tranactions");
1090 for (list<pair<__u8,version_t> >::const_iterator i = table_tids.begin();
1091 i != table_tids.end(); ++i) {
1092 f->open_object_section("transaction");
1093 f->dump_int("tid", i->first);
1094 f->dump_int("version", i->second);
1095 f->close_section(); // transaction
1096 }
1097 f->close_section(); // tableclient transactions
1098
1099 f->dump_int("renamed directory inodeno", renamed_dirino);
1100
1101 f->open_array_section("renamed directory fragments");
1102 for (list<frag_t>::const_iterator i = renamed_dir_frags.begin();
1103 i != renamed_dir_frags.end(); ++i) {
1104 f->dump_int("frag", *i);
1105 }
1106 f->close_section(); // renamed directory fragments
1107
1108 f->dump_int("inotable version", inotablev);
1109 f->dump_int("SessionMap version", sessionmapv);
1110 f->dump_int("allocated ino", allocated_ino);
1111
1112 f->dump_stream("preallocated inos") << preallocated_inos;
1113 f->dump_int("used preallocated ino", used_preallocated_ino);
1114
1115 f->open_object_section("client name");
1116 client_name.dump(f);
1117 f->close_section(); // client name
1118
1119 f->open_array_section("inodes starting a truncate");
1120 for(list<inodeno_t>::const_iterator i = truncate_start.begin();
1121 i != truncate_start.end(); ++i) {
1122 f->dump_int("inodeno", *i);
1123 }
1124 f->close_section(); // truncate inodes
1125 f->open_array_section("inodes finishing a truncated");
1126 for(map<inodeno_t,uint64_t>::const_iterator i = truncate_finish.begin();
1127 i != truncate_finish.end(); ++i) {
1128 f->open_object_section("inode+segment");
1129 f->dump_int("inodeno", i->first);
1130 f->dump_int("truncate starting segment", i->second);
1131 f->close_section(); // truncated inode
1132 }
1133 f->close_section(); // truncate finish inodes
1134
1135 f->open_array_section("destroyed inodes");
1136 for(vector<inodeno_t>::const_iterator i = destroyed_inodes.begin();
1137 i != destroyed_inodes.end(); ++i) {
1138 f->dump_int("inodeno", *i);
1139 }
1140 f->close_section(); // destroyed inodes
1141
1142 f->open_array_section("client requests");
1143 for(list<pair<metareqid_t,uint64_t> >::const_iterator i = client_reqs.begin();
1144 i != client_reqs.end(); ++i) {
1145 f->open_object_section("Client request");
1146 f->dump_stream("request ID") << i->first;
1147 f->dump_int("oldest request on client", i->second);
1148 f->close_section(); // request
1149 }
1150 f->close_section(); // client requests
1151 }
1152
1153 void EMetaBlob::generate_test_instances(list<EMetaBlob*>& ls)
1154 {
1155 ls.push_back(new EMetaBlob());
1156 }
1157
/**
 * Replay this metablob into the MDS cache and tables.
 *
 * Steps, in the order performed below:
 *  - create or update each inode listed in `roots`;
 *  - for every dirfrag in `lump_order`: find or create the CDir, copy the
 *    journaled fnode onto it, then apply its full, remote and null dentries;
 *  - if `renamed_dirino` is set, adjust subtree bookkeeping for the renamed
 *    directory;
 *  - remove (or, when `slaveup` is given, record on it) inodes that were
 *    unlinked and never relinked;
 *  - replay table-client transactions, opened_ino, inotable and sessionmap
 *    changes, truncates, destroyed inodes, completed client requests and
 *    flushes;
 *  - finally call update_segment(logseg).
 *
 * @param mds     rank whose cache, tables and session map are updated
 * @param logseg  segment that dirtied items are attached to; must be
 *                non-null (asserted)
 * @param slaveup may be null; when set, unlinked inodes and old dirs are
 *                recorded on it instead of being removed/trimmed here
 */
1158 void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
1159 {
1160 dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
1161
1162 assert(logseg);
1163
1164 assert(g_conf->mds_kill_journal_replay_at != 1);
1165
     // Root inodes: create any that are not cached yet, then apply the
     // journaled state to them.
1166 for (list<ceph::shared_ptr<fullbit> >::iterator p = roots.begin(); p != roots.end(); ++p) {
1167 CInode *in = mds->mdcache->get_inode((*p)->inode.ino);
1168 bool isnew = in ? false:true;
1169 if (!in)
1170 in = new CInode(mds->mdcache, false);
1171 (*p)->update_inode(mds, in);
1172
1173 if (isnew)
1174 mds->mdcache->add_inode(in);
1175 if ((*p)->is_dirty()) in->_mark_dirty(logseg);
1176 dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
1177 }
1178
     // If a directory rename is recorded, look the renamed inode up now (it
     // may not be in cache yet) and check that all lumps together carry at
     // most one null dentry.
1179 CInode *renamed_diri = 0;
1180 CDir *olddir = 0;
1181 if (renamed_dirino) {
1182 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1183 if (renamed_diri)
1184 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl;
1185 else
1186 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl;
1187
1188 int nnull = 0;
1189 for (list<dirfrag_t>::iterator lp = lump_order.begin(); lp != lump_order.end(); ++lp) {
1190 dirlump &lump = lump_map[*lp];
1191 if (lump.nnull) {
1192 dout(10) << "EMetaBlob.replay found null dentry in dir " << *lp << dendl;
1193 nnull += lump.nnull;
1194 }
1195 }
1196 assert(nnull <= 1);
1197 }
1198
1199 // keep track of any inodes we unlink and don't relink elsewhere
     // `unlinked` maps each unlinked inode to the dir it was removed from;
     // `linked` holds inodes that were in `unlinked` when they got linked again.
1200 map<CInode*, CDir*> unlinked;
1201 set<CInode*> linked;
1202
1203 // walk through my dirs (in order!)
1204 for (list<dirfrag_t>::iterator lp = lump_order.begin();
1205 lp != lump_order.end();
1206 ++lp) {
1207 dout(10) << "EMetaBlob.replay dir " << *lp << dendl;
1208 dirlump &lump = lump_map[*lp];
1209
1210 // the dir
1211 CDir *dir = mds->mdcache->get_force_dirfrag(*lp, true);
1212 if (!dir) {
1213 // hmm. do i have the inode?
1214 CInode *diri = mds->mdcache->get_inode((*lp).ino);
1215 if (!diri) {
     // A missing directory inode is only synthesized when it is an mdsdir
     // inode (asserted not to be this rank's own); anything else marks the
     // rank damaged.
1216 if (MDS_INO_IS_MDSDIR(lp->ino)) {
1217 assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp->ino);
1218 diri = mds->mdcache->create_system_inode(lp->ino, S_IFDIR|0755);
1219 diri->state_clear(CInode::STATE_AUTH);
1220 dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
1221 } else {
1222 dout(0) << "EMetaBlob.replay missing dir ino " << (*lp).ino << dendl;
1223 mds->clog->error() << "failure replaying journal (EMetaBlob)";
1224 mds->damaged();
1225 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1226 }
1227 }
1228
1229 // create the dirfrag
1230 dir = diri->get_or_open_dirfrag(mds->mdcache, (*lp).frag);
1231
1232 if (MDS_INO_IS_BASE(lp->ino))
1233 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
1234
1235 dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;
1236 }
     // Overwrite the dirfrag's version and fnode with the journaled copy.
1237 dir->set_version( lump.fnode.version );
1238 dir->fnode = lump.fnode;
1239
1240 if (lump.is_importing()) {
1241 dir->state_set(CDir::STATE_AUTH);
1242 dir->state_clear(CDir::STATE_COMPLETE);
1243 }
1244 if (lump.is_dirty()) {
1245 dir->_mark_dirty(logseg);
1246
     // rstat / fragstat that differ from their "accounted" counterparts
     // cause the matching scatterlock to be marked updated and the inode to
     // be queued on this segment's dirty-dirfrag lists.
1247 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
1248 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir << dendl;
1249 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
1250 logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
1251 } else {
1252 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir << dendl;
1253 }
1254 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
1255 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir << dendl;
1256 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
1257 logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
1258 } else {
1259 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir << dendl;
1260 }
1261 }
1262 if (lump.is_dirty_dft()) {
1263 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir << dendl;
1264 dir->state_set(CDir::STATE_DIRTYDFT);
1265 mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock);
1266 logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree);
1267 }
1268 if (lump.is_new())
1269 dir->mark_new(logseg);
1270 if (lump.is_complete())
1271 dir->mark_complete();
1272
1273 dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;
1274
1275 // decode bits
1276 lump._decode_bits();
1277
1278 // full dentry+inode pairs
1279 for (list<ceph::shared_ptr<fullbit> >::const_iterator pp = lump.get_dfull().begin();
1280 pp != lump.get_dfull().end();
1281 ++pp) {
1282 ceph::shared_ptr<fullbit> p = *pp;
     // Find the dentry for this exact (name, last snap), or add a null one.
1283 CDentry *dn = dir->lookup_exact_snap(p->dn, p->dnlast);
1284 if (!dn) {
1285 dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast);
1286 dn->set_version(p->dnv);
1287 if (p->is_dirty()) dn->_mark_dirty(logseg);
1288 dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl;
1289 } else {
1290 dn->set_version(p->dnv);
1291 if (p->is_dirty()) dn->_mark_dirty(logseg);
1292 dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *dn << dendl;
1293 dn->first = p->dnfirst;
1294 assert(dn->last == p->dnlast);
1295 }
1296 if (lump.is_importing())
1297 dn->state_set(CDentry::STATE_AUTH);
1298
1299 CInode *in = mds->mdcache->get_inode(p->inode.ino, p->dnlast);
1300 if (!in) {
     // Inode not in cache: build it from the journaled fullbit and link it
     // under dn, unlinking whatever dn pointed at before.
1301 in = new CInode(mds->mdcache, dn->is_auth(), p->dnfirst, p->dnlast);
1302 p->update_inode(mds, in);
1303 mds->mdcache->add_inode(in);
1304 if (!dn->get_linkage()->is_null()) {
1305 if (dn->get_linkage()->is_primary()) {
1306 unlinked[dn->get_linkage()->get_inode()] = dir;
1307 stringstream ss;
1308 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1309 << " " << *dn->get_linkage()->get_inode() << " should be " << p->inode.ino;
1310 dout(0) << ss.str() << dendl;
1311 mds->clog->warn(ss);
1312 }
1313 dir->unlink_inode(dn, false);
1314 }
1315 if (unlinked.count(in))
1316 linked.insert(in);
1317 dir->link_primary_inode(dn, in);
1318 dout(10) << "EMetaBlob.replay added " << *in << dendl;
1319 } else {
     // Inode already cached: refresh it and, if dn does not already point
     // at it, detach it from its current parent and link it under dn.
1320 in->first = p->dnfirst;
1321 p->update_inode(mds, in);
1322 if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
1323 dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
1324 unlinked[in] = in->get_parent_dir();
1325 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
1326 }
1327 if (dn->get_linkage()->get_inode() != in) {
1328 if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1329 if (dn->get_linkage()->is_primary()) {
1330 unlinked[dn->get_linkage()->get_inode()] = dir;
1331 stringstream ss;
1332 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1333 << " " << *dn->get_linkage()->get_inode() << " should be " << p->inode.ino;
1334 dout(0) << ss.str() << dendl;
1335 mds->clog->warn(ss);
1336 }
1337 dir->unlink_inode(dn, false);
1338 }
1339 if (unlinked.count(in))
1340 linked.insert(in);
1341 dir->link_primary_inode(dn, in);
1342 dout(10) << "EMetaBlob.replay linked " << *in << dendl;
1343 } else {
1344 dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *in << dendl;
1345 }
1346 assert(in->first == p->dnfirst ||
1347 (in->is_multiversion() && in->first > p->dnfirst));
1348 }
1349 if (p->is_dirty())
1350 in->_mark_dirty(logseg);
1351 if (p->is_dirty_parent())
1352 in->_mark_dirty_parent(logseg, p->is_dirty_pool());
1353 if (p->need_snapflush())
1354 logseg->open_files.push_back(&in->item_open_file);
     // The inode's auth state is made to follow its dentry's.
1355 if (dn->is_auth())
1356 in->state_set(CInode::STATE_AUTH);
1357 else
1358 in->state_clear(CInode::STATE_AUTH);
1359 assert(g_conf->mds_kill_journal_replay_at != 2);
1360 }
1361
1362 // remote dentries
1363 for (list<remotebit>::const_iterator p = lump.get_dremote().begin();
1364 p != lump.get_dremote().end();
1365 ++p) {
1366 CDentry *dn = dir->lookup_exact_snap(p->dn, p->dnlast);
1367 if (!dn) {
1368 dn = dir->add_remote_dentry(p->dn, p->ino, p->d_type, p->dnfirst, p->dnlast);
1369 dn->set_version(p->dnv);
1370 if (p->dirty) dn->_mark_dirty(logseg);
1371 dout(10) << "EMetaBlob.replay added " << *dn << dendl;
1372 } else {
1373 if (!dn->get_linkage()->is_null()) {
1374 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1375 if (dn->get_linkage()->is_primary()) {
1376 unlinked[dn->get_linkage()->get_inode()] = dir;
     // NOTE(review): unlike the two full-dentry cases above, this message
     // is only written via dout and is not sent to mds->clog.
1377 stringstream ss;
1378 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1379 << " " << *dn->get_linkage()->get_inode() << " should be remote " << p->ino;
1380 dout(0) << ss.str() << dendl;
1381 }
1382 dir->unlink_inode(dn, false);
1383 }
1384 dir->link_remote_inode(dn, p->ino, p->d_type);
1385 dn->set_version(p->dnv);
1386 if (p->dirty) dn->_mark_dirty(logseg);
1387 dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *dn << dendl;
1388 dn->first = p->dnfirst;
1389 assert(dn->last == p->dnlast);
1390 }
1391 if (lump.is_importing())
1392 dn->state_set(CDentry::STATE_AUTH);
1393 }
1394
1395 // null dentries
1396 for (list<nullbit>::const_iterator p = lump.get_dnull().begin();
1397 p != lump.get_dnull().end();
1398 ++p) {
1399 CDentry *dn = dir->lookup_exact_snap(p->dn, p->dnlast);
1400 if (!dn) {
1401 dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast);
1402 dn->set_version(p->dnv);
1403 if (p->dirty) dn->_mark_dirty(logseg);
1404 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl;
1405 } else {
1406 dn->first = p->dnfirst;
1407 if (!dn->get_linkage()->is_null()) {
1408 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1409 CInode *in = dn->get_linkage()->get_inode();
1410 // For renamed inode, We may call CInode::force_dirfrag() later.
1411 // CInode::force_dirfrag() doesn't work well when inode is detached
1412 // from the hierarchy.
1413 if (!renamed_diri || renamed_diri != in) {
1414 if (dn->get_linkage()->is_primary())
1415 unlinked[in] = dir;
1416 dir->unlink_inode(dn);
1417 }
1418 }
1419 dn->set_version(p->dnv);
1420 if (p->dirty) dn->_mark_dirty(logseg);
1421 dout(10) << "EMetaBlob.replay had " << *dn << dendl;
1422 assert(dn->last == p->dnlast);
1423 }
     // Remember the dir holding the most recent null dentry.  This value is
     // only used below when the renamed inode was not in cache before
     // replay; otherwise olddir is overwritten from `unlinked`.
1424 olddir = dir;
1425 if (lump.is_importing())
1426 dn->state_set(CDentry::STATE_AUTH);
1427
1428 // Make null dentries the first things we trim
     // NOTE(review): only the log line is present here; no LRU operation
     // is visible in this function.
1429 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl;
1430 }
1431 }
1432
1433 assert(g_conf->mds_kill_journal_replay_at != 3);
1434
     // Directory rename fix-ups.
1435 if (renamed_dirino) {
1436 if (renamed_diri) {
     // The inode was cached before replay, so the dentry loops above must
     // have both unlinked and relinked it.
1437 assert(unlinked.count(renamed_diri));
1438 assert(linked.count(renamed_diri));
1439 olddir = unlinked[renamed_diri];
1440 } else {
1441 // we imported a diri we haven't seen before
1442 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1443 assert(renamed_diri); // it was in the metablob
1444 }
1445
1446 if (olddir) {
1447 if (olddir->authority() != CDIR_AUTH_UNDEF &&
1448 renamed_diri->authority() == CDIR_AUTH_UNDEF) {
1449 assert(slaveup); // auth to non-auth, must be slave prepare
1450 list<frag_t> leaves;
1451 renamed_diri->dirfragtree.get_leaves(leaves);
1452 for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) {
1453 CDir *dir = renamed_diri->get_dirfrag(*p);
1454 assert(dir);
1455 if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
1456 // preserve subtree bound until slave commit
1457 slaveup->olddirs.insert(dir->inode);
1458 else
1459 dir->state_set(CDir::STATE_AUTH);
1460 }
1461 }
1462
1463 mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
1464
1465 // see if we can discard the subtree we renamed out of
1466 CDir *root = mds->mdcache->get_subtree_root(olddir);
1467 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
1468 if (slaveup) // preserve the old dir until slave commit
1469 slaveup->olddirs.insert(olddir->inode);
1470 else
1471 mds->mdcache->try_trim_non_auth_subtree(root);
1472 }
1473 }
1474
1475 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1476 if (renamed_diri->authority() != CDIR_AUTH_UNDEF) {
1477 for (list<frag_t>::iterator p = renamed_dir_frags.begin(); p != renamed_dir_frags.end(); ++p) {
1478 CDir *dir = renamed_diri->get_dirfrag(*p);
1479 if (dir) {
1480 // we already had the inode before, and we already adjusted this subtree accordingly.
1481 dout(10) << " already had+adjusted rename import bound " << *dir << dendl;
1482 assert(olddir);
1483 continue;
1484 }
1485 dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
1486 dout(10) << " creating new rename import bound " << *dir << dendl;
1487 dir->state_clear(CDir::STATE_AUTH);
1488 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
1489 }
1490 }
1491
1492 // rename may overwrite an empty directory and move it into stray dir.
     // Every other inode that was unlinked and then relinked is asserted to
     // be a directory and gets the same subtree adjustment.
1493 unlinked.erase(renamed_diri);
1494 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1495 if (!linked.count(p->first))
1496 continue;
1497 assert(p->first->is_dir());
1498 mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);
1499 }
1500 }
1501
     // Whatever was unlinked and never relinked is either recorded on
     // slaveup or removed from the cache.
1502 if (!unlinked.empty()) {
1503 for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p)
1504 unlinked.erase(*p);
1505 dout(10) << " unlinked set contains " << unlinked << dendl;
1506 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1507 if (slaveup) // preserve unlinked inodes until slave commit
1508 slaveup->unlinked.insert(p->first);
1509 else
1510 mds->mdcache->remove_inode_recursive(p->first);
1511 }
1512 }
1513
1514 // table client transactions
1515 for (list<pair<__u8,version_t> >::iterator p = table_tids.begin();
1516 p != table_tids.end();
1517 ++p) {
1518 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p->first)
1519 << " transaction " << p->second << dendl;
1520 MDSTableClient *client = mds->get_table_client(p->first);
1521 if (client)
1522 client->got_journaled_agree(p->second, logseg);
1523 }
1524
1525 // opened ino?
1526 if (opened_ino) {
1527 CInode *in = mds->mdcache->get_inode(opened_ino);
1528 assert(in);
1529 dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl;
1530 logseg->open_files.push_back(&in->item_open_file);
1531 }
1532
1533 // allocated_inos
     // Applied only when the in-memory inotable is behind the journaled
     // version; afterwards the two versions are asserted equal.
1534 if (inotablev) {
1535 if (mds->inotable->get_version() >= inotablev) {
1536 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1537 << " <= table " << mds->inotable->get_version() << dendl;
1538 } else {
1539 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1540 << " - 1 == table " << mds->inotable->get_version()
1541 << " allocated+used " << allocated_ino
1542 << " prealloc " << preallocated_inos
1543 << dendl;
1544 if (allocated_ino)
1545 mds->inotable->replay_alloc_id(allocated_ino);
1546 if (preallocated_inos.size())
1547 mds->inotable->replay_alloc_ids(preallocated_inos);
1548
1549 // [repair bad inotable updates]
1550 if (inotablev > mds->inotable->get_version()) {
1551 mds->clog->error() << "journal replay inotablev mismatch "
1552 << mds->inotable->get_version() << " -> " << inotablev;
1553 mds->inotable->force_replay_version(inotablev);
1554 }
1555
1556 assert(inotablev == mds->inotable->get_version());
1557 }
1558 }
     // Session map: nothing to do if already at or past sessionmapv.  A gap
     // of one or two versions is replayed (the no-session branch advances
     // the version once for a used preallocated ino and once for newly
     // preallocated inos).  A larger gap is only tolerated when
     // mds_wipe_sessions is set (asserted).
1559 if (sessionmapv) {
1560 if (mds->sessionmap.get_version() >= sessionmapv) {
1561 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1562 << " <= table " << mds->sessionmap.get_version() << dendl;
1563 } else if (mds->sessionmap.get_version() + 2 >= sessionmapv) {
1564 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1565 << " -(1|2) == table " << mds->sessionmap.get_version()
1566 << " prealloc " << preallocated_inos
1567 << " used " << used_preallocated_ino
1568 << dendl;
1569 Session *session = mds->sessionmap.get_session(client_name);
1570 if (session) {
1571 dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl;
1572 if (used_preallocated_ino) {
1573 if (!session->info.prealloc_inos.empty()) {
1574 inodeno_t next = session->next_ino();
1575 inodeno_t i = session->take_ino(used_preallocated_ino);
1576 if (next != i)
1577 mds->clog->warn() << " replayed op " << client_reqs << " used ino " << i
1578 << " but session next is " << next;
1579 assert(i == used_preallocated_ino);
1580 session->info.used_inos.clear();
1581 }
1582 mds->sessionmap.replay_dirty_session(session);
1583 }
1584 if (!preallocated_inos.empty()) {
1585 session->info.prealloc_inos.insert(preallocated_inos);
1586 mds->sessionmap.replay_dirty_session(session);
1587 }
1588
1589 } else {
1590 dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
1591 if (used_preallocated_ino) {
1592 mds->sessionmap.replay_advance_version();
1593 }
1594 if (!preallocated_inos.empty())
1595 mds->sessionmap.replay_advance_version();
1596 }
1597 assert(sessionmapv == mds->sessionmap.get_version());
1598 } else {
1599 mds->clog->error() << "journal replay sessionmap v " << sessionmapv
1600 << " -(1|2) > table " << mds->sessionmap.get_version();
1601 assert(g_conf->mds_wipe_sessions);
1602 mds->sessionmap.wipe();
1603 mds->sessionmap.set_version(sessionmapv);
1604 }
1605 }
1606
1607 // truncating inodes
1608 for (list<inodeno_t>::iterator p = truncate_start.begin();
1609 p != truncate_start.end();
1610 ++p) {
1611 CInode *in = mds->mdcache->get_inode(*p);
1612 assert(in);
1613 mds->mdcache->add_recovered_truncate(in, logseg);
1614 }
     // A finished truncate is only removed if the segment it started in
     // (p->second) is still known to the log.
1615 for (map<inodeno_t,uint64_t>::iterator p = truncate_finish.begin();
1616 p != truncate_finish.end();
1617 ++p) {
1618 LogSegment *ls = mds->mdlog->get_segment(p->second);
1619 if (ls) {
1620 CInode *in = mds->mdcache->get_inode(p->first);
1621 assert(in);
1622 mds->mdcache->remove_recovered_truncate(in, ls);
1623 }
1624 }
1625
1626 // destroyed inodes
1627 for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
1628 p != destroyed_inodes.end();
1629 ++p) {
1630 CInode *in = mds->mdcache->get_inode(*p);
1631 if (in) {
1632 dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
     // Grab the parent dentry before removal so it can be checked afterwards.
1633 CDentry *parent = in->get_parent_dn();
1634 mds->mdcache->remove_inode(in);
1635 if (parent) {
1636 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
1637 assert(parent->get_linkage()->is_null());
1638 }
1639 } else {
1640 dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
1641 }
1642 }
1643
1644 // client requests
     // Only requests whose reqid names a client are recorded as completed
     // on the session, together with the inode created (if any).
1645 for (list<pair<metareqid_t, uint64_t> >::iterator p = client_reqs.begin();
1646 p != client_reqs.end();
1647 ++p) {
1648 if (p->first.name.is_client()) {
1649 dout(10) << "EMetaBlob.replay request " << p->first << " trim_to " << p->second << dendl;
1650 inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino;
1651 // if we allocated an inode, there should be exactly one client request id.
1652 assert(created == inodeno_t() || client_reqs.size() == 1);
1653
1654 Session *session = mds->sessionmap.get_session(p->first.name);
1655 if (session) {
1656 session->add_completed_request(p->first.tid, created);
1657 if (p->second)
1658 session->trim_completed_requests(p->second);
1659 }
1660 }
1661 }
1662
1663 // client flushes
1664 for (list<pair<metareqid_t, uint64_t> >::iterator p = client_flushes.begin();
1665 p != client_flushes.end();
1666 ++p) {
1667 if (p->first.name.is_client()) {
1668 dout(10) << "EMetaBlob.replay flush " << p->first << " trim_to " << p->second << dendl;
1669 Session *session = mds->sessionmap.get_session(p->first.name);
1670 if (session) {
1671 session->add_completed_flush(p->first.tid);
1672 if (p->second)
1673 session->trim_completed_flushes(p->second);
1674 }
1675 }
1676 }
1677
1678 // update segment
1679 update_segment(logseg);
1680
1681 assert(g_conf->mds_kill_journal_replay_at != 4);
1682 }
1683
1684 // -----------------------
1685 // ESession
1686
1687 void ESession::update_segment()
1688 {
1689 _segment->sessionmapv = cmapv;
1690 if (inos.size() && inotablev)
1691 _segment->inotablev = inotablev;
1692 }
1693
1694 void ESession::replay(MDSRank *mds)
1695 {
1696 if (mds->sessionmap.get_version() >= cmapv) {
1697 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1698 << " >= " << cmapv << ", noop" << dendl;
1699 } else {
1700 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1701 << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
1702 Session *session;
1703 if (open) {
1704 session = mds->sessionmap.get_or_add_session(client_inst);
1705 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1706 session->set_client_metadata(client_metadata);
1707 dout(10) << " opened session " << session->info.inst << dendl;
1708 } else {
1709 session = mds->sessionmap.get_session(client_inst.name);
1710 if (session) { // there always should be a session, but there's a bug
1711 if (session->connection == NULL) {
1712 dout(10) << " removed session " << session->info.inst << dendl;
1713 mds->sessionmap.remove_session(session);
1714 session = NULL;
1715 } else {
1716 session->clear(); // the client has reconnected; keep the Session, but reset
1717 dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
1718 }
1719 } else {
1720 mds->clog->error() << "replayed stray Session close event for " << client_inst
1721 << " from time " << stamp << ", ignoring";
1722 }
1723 }
1724 if (session) {
1725 mds->sessionmap.replay_dirty_session(session);
1726 } else {
1727 mds->sessionmap.replay_advance_version();
1728 }
1729 assert(mds->sessionmap.get_version() == cmapv);
1730 }
1731
1732 if (inos.size() && inotablev) {
1733 if (mds->inotable->get_version() >= inotablev) {
1734 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1735 << " >= " << inotablev << ", noop" << dendl;
1736 } else {
1737 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1738 << " < " << inotablev << " " << (open ? "add":"remove") << dendl;
1739 assert(!open); // for now
1740 mds->inotable->replay_release_ids(inos);
1741 assert(mds->inotable->get_version() == inotablev);
1742 }
1743 }
1744
1745 update_segment();
1746 }
1747
1748 void ESession::encode(bufferlist &bl, uint64_t features) const
1749 {
1750 ENCODE_START(4, 3, bl);
1751 ::encode(stamp, bl);
1752 ::encode(client_inst, bl, features);
1753 ::encode(open, bl);
1754 ::encode(cmapv, bl);
1755 ::encode(inos, bl);
1756 ::encode(inotablev, bl);
1757 ::encode(client_metadata, bl);
1758 ENCODE_FINISH(bl);
1759 }
1760
1761 void ESession::decode(bufferlist::iterator &bl)
1762 {
1763 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
1764 if (struct_v >= 2)
1765 ::decode(stamp, bl);
1766 ::decode(client_inst, bl);
1767 ::decode(open, bl);
1768 ::decode(cmapv, bl);
1769 ::decode(inos, bl);
1770 ::decode(inotablev, bl);
1771 if (struct_v >= 4) {
1772 ::decode(client_metadata, bl);
1773 }
1774 DECODE_FINISH(bl);
1775 }
1776
1777 void ESession::dump(Formatter *f) const
1778 {
1779 f->dump_stream("client instance") << client_inst;
1780 f->dump_string("open", open ? "true" : "false");
1781 f->dump_int("client map version", cmapv);
1782 f->dump_stream("inos") << inos;
1783 f->dump_int("inotable version", inotablev);
1784 f->open_object_section("client_metadata");
1785 for (map<string, string>::const_iterator i = client_metadata.begin();
1786 i != client_metadata.end(); ++i) {
1787 f->dump_string(i->first.c_str(), i->second);
1788 }
1789 f->close_section(); // client_metadata
1790 }
1791
1792 void ESession::generate_test_instances(list<ESession*>& ls)
1793 {
1794 ls.push_back(new ESession);
1795 }
1796
1797 // -----------------------
1798 // ESessions
1799
1800 void ESessions::encode(bufferlist &bl, uint64_t features) const
1801 {
1802 ENCODE_START(1, 1, bl);
1803 ::encode(client_map, bl, features);
1804 ::encode(cmapv, bl);
1805 ::encode(stamp, bl);
1806 ENCODE_FINISH(bl);
1807 }
1808
1809 void ESessions::decode_old(bufferlist::iterator &bl)
1810 {
1811 ::decode(client_map, bl);
1812 ::decode(cmapv, bl);
1813 if (!bl.end())
1814 ::decode(stamp, bl);
1815 }
1816
1817 void ESessions::decode_new(bufferlist::iterator &bl)
1818 {
1819 DECODE_START(1, bl);
1820 ::decode(client_map, bl);
1821 ::decode(cmapv, bl);
1822 if (!bl.end())
1823 ::decode(stamp, bl);
1824 DECODE_FINISH(bl);
1825 }
1826
1827 void ESessions::dump(Formatter *f) const
1828 {
1829 f->dump_int("client map version", cmapv);
1830
1831 f->open_array_section("client map");
1832 for (map<client_t,entity_inst_t>::const_iterator i = client_map.begin();
1833 i != client_map.end(); ++i) {
1834 f->open_object_section("client");
1835 f->dump_int("client id", i->first.v);
1836 f->dump_stream("client entity") << i->second;
1837 f->close_section(); // client
1838 }
1839 f->close_section(); // client map
1840 }
1841
1842 void ESessions::generate_test_instances(list<ESessions*>& ls)
1843 {
1844 ls.push_back(new ESessions());
1845 }
1846
1847 void ESessions::update_segment()
1848 {
1849 _segment->sessionmapv = cmapv;
1850 }
1851
1852 void ESessions::replay(MDSRank *mds)
1853 {
1854 if (mds->sessionmap.get_version() >= cmapv) {
1855 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1856 << " >= " << cmapv << ", noop" << dendl;
1857 } else {
1858 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1859 << " < " << cmapv << dendl;
1860 mds->sessionmap.open_sessions(client_map);
1861 assert(mds->sessionmap.get_version() == cmapv);
1862 mds->sessionmap.set_projected(mds->sessionmap.get_version());
1863 }
1864 update_segment();
1865 }
1866
1867
1868 // -----------------------
1869 // ETableServer
1870
1871 void ETableServer::encode(bufferlist& bl, uint64_t features) const
1872 {
1873 ENCODE_START(3, 3, bl);
1874 ::encode(stamp, bl);
1875 ::encode(table, bl);
1876 ::encode(op, bl);
1877 ::encode(reqid, bl);
1878 ::encode(bymds, bl);
1879 ::encode(mutation, bl);
1880 ::encode(tid, bl);
1881 ::encode(version, bl);
1882 ENCODE_FINISH(bl);
1883 }
1884
1885 void ETableServer::decode(bufferlist::iterator &bl)
1886 {
1887 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
1888 if (struct_v >= 2)
1889 ::decode(stamp, bl);
1890 ::decode(table, bl);
1891 ::decode(op, bl);
1892 ::decode(reqid, bl);
1893 ::decode(bymds, bl);
1894 ::decode(mutation, bl);
1895 ::decode(tid, bl);
1896 ::decode(version, bl);
1897 DECODE_FINISH(bl);
1898 }
1899
1900 void ETableServer::dump(Formatter *f) const
1901 {
1902 f->dump_int("table id", table);
1903 f->dump_int("op", op);
1904 f->dump_int("request id", reqid);
1905 f->dump_int("by mds", bymds);
1906 f->dump_int("tid", tid);
1907 f->dump_int("version", version);
1908 }
1909
1910 void ETableServer::generate_test_instances(list<ETableServer*>& ls)
1911 {
1912 ls.push_back(new ETableServer());
1913 }
1914
1915
1916 void ETableServer::update_segment()
1917 {
1918 _segment->tablev[table] = version;
1919 }
1920
1921 void ETableServer::replay(MDSRank *mds)
1922 {
1923 MDSTableServer *server = mds->get_table_server(table);
1924 if (!server)
1925 return;
1926
1927 if (server->get_version() >= version) {
1928 dout(10) << "ETableServer.replay " << get_mdstable_name(table)
1929 << " " << get_mdstableserver_opname(op)
1930 << " event " << version
1931 << " <= table " << server->get_version() << dendl;
1932 return;
1933 }
1934
1935 dout(10) << " ETableServer.replay " << get_mdstable_name(table)
1936 << " " << get_mdstableserver_opname(op)
1937 << " event " << version << " - 1 == table " << server->get_version() << dendl;
1938 assert(version-1 == server->get_version());
1939
1940 switch (op) {
1941 case TABLESERVER_OP_PREPARE:
1942 server->_prepare(mutation, reqid, bymds);
1943 server->_note_prepare(bymds, reqid);
1944 break;
1945 case TABLESERVER_OP_COMMIT:
1946 server->_commit(tid);
1947 server->_note_commit(tid);
1948 break;
1949 case TABLESERVER_OP_ROLLBACK:
1950 server->_rollback(tid);
1951 server->_note_rollback(tid);
1952 break;
1953 case TABLESERVER_OP_SERVER_UPDATE:
1954 server->_server_update(mutation);
1955 break;
1956 default:
1957 mds->clog->error() << "invalid tableserver op in ETableServer";
1958 mds->damaged();
1959 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1960 }
1961
1962 assert(version == server->get_version());
1963 update_segment();
1964 }
1965
1966
1967 // ---------------------
1968 // ETableClient
1969
1970 void ETableClient::encode(bufferlist& bl, uint64_t features) const
1971 {
1972 ENCODE_START(3, 3, bl);
1973 ::encode(stamp, bl);
1974 ::encode(table, bl);
1975 ::encode(op, bl);
1976 ::encode(tid, bl);
1977 ENCODE_FINISH(bl);
1978 }
1979
1980 void ETableClient::decode(bufferlist::iterator &bl)
1981 {
1982 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
1983 if (struct_v >= 2)
1984 ::decode(stamp, bl);
1985 ::decode(table, bl);
1986 ::decode(op, bl);
1987 ::decode(tid, bl);
1988 DECODE_FINISH(bl);
1989 }
1990
1991 void ETableClient::dump(Formatter *f) const
1992 {
1993 f->dump_int("table", table);
1994 f->dump_int("op", op);
1995 f->dump_int("tid", tid);
1996 }
1997
1998 void ETableClient::generate_test_instances(list<ETableClient*>& ls)
1999 {
2000 ls.push_back(new ETableClient());
2001 }
2002
2003 void ETableClient::replay(MDSRank *mds)
2004 {
2005 dout(10) << " ETableClient.replay " << get_mdstable_name(table)
2006 << " op " << get_mdstableserver_opname(op)
2007 << " tid " << tid << dendl;
2008
2009 MDSTableClient *client = mds->get_table_client(table);
2010 if (!client)
2011 return;
2012
2013 assert(op == TABLESERVER_OP_ACK);
2014 client->got_journaled_ack(tid);
2015 }
2016
2017
2018 // -----------------------
2019 // ESnap
2020 /*
2021 void ESnap::update_segment()
2022 {
2023 _segment->tablev[TABLE_SNAP] = version;
2024 }
2025
2026 void ESnap::replay(MDSRank *mds)
2027 {
2028 if (mds->snaptable->get_version() >= version) {
2029 dout(10) << "ESnap.replay event " << version
2030 << " <= table " << mds->snaptable->get_version() << dendl;
2031 return;
2032 }
2033
2034 dout(10) << " ESnap.replay event " << version
2035 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2036 assert(version-1 == mds->snaptable->get_version());
2037
2038 if (create) {
2039 version_t v;
2040 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2041 assert(s == snap.snapid);
2042 } else {
2043 mds->snaptable->remove(snap.snapid);
2044 }
2045
2046 assert(version == mds->snaptable->get_version());
2047 }
2048 */
2049
2050
2051
2052 // -----------------------
2053 // EUpdate
2054
2055 void EUpdate::encode(bufferlist &bl, uint64_t features) const
2056 {
2057 ENCODE_START(4, 4, bl);
2058 ::encode(stamp, bl);
2059 ::encode(type, bl);
2060 ::encode(metablob, bl, features);
2061 ::encode(client_map, bl);
2062 ::encode(cmapv, bl);
2063 ::encode(reqid, bl);
2064 ::encode(had_slaves, bl);
2065 ENCODE_FINISH(bl);
2066 }
2067
2068 void EUpdate::decode(bufferlist::iterator &bl)
2069 {
2070 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
2071 if (struct_v >= 2)
2072 ::decode(stamp, bl);
2073 ::decode(type, bl);
2074 ::decode(metablob, bl);
2075 ::decode(client_map, bl);
2076 if (struct_v >= 3)
2077 ::decode(cmapv, bl);
2078 ::decode(reqid, bl);
2079 ::decode(had_slaves, bl);
2080 DECODE_FINISH(bl);
2081 }
2082
2083 void EUpdate::dump(Formatter *f) const
2084 {
2085 f->open_object_section("metablob");
2086 metablob.dump(f);
2087 f->close_section(); // metablob
2088
2089 f->dump_string("type", type);
2090 f->dump_int("client map length", client_map.length());
2091 f->dump_int("client map version", cmapv);
2092 f->dump_stream("reqid") << reqid;
2093 f->dump_string("had slaves", had_slaves ? "true" : "false");
2094 }
2095
2096 void EUpdate::generate_test_instances(list<EUpdate*>& ls)
2097 {
2098 ls.push_back(new EUpdate());
2099 }
2100
2101
2102 void EUpdate::update_segment()
2103 {
2104 metablob.update_segment(_segment);
2105
2106 if (client_map.length())
2107 _segment->sessionmapv = cmapv;
2108
2109 if (had_slaves)
2110 _segment->uncommitted_masters.insert(reqid);
2111 }
2112
2113 void EUpdate::replay(MDSRank *mds)
2114 {
2115 metablob.replay(mds, _segment);
2116
2117 if (had_slaves) {
2118 dout(10) << "EUpdate.replay " << reqid << " had slaves, expecting a matching ECommitted" << dendl;
2119 _segment->uncommitted_masters.insert(reqid);
2120 set<mds_rank_t> slaves;
2121 mds->mdcache->add_uncommitted_master(reqid, _segment, slaves, true);
2122 }
2123
2124 if (client_map.length()) {
2125 if (mds->sessionmap.get_version() >= cmapv) {
2126 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2127 << " <= table " << mds->sessionmap.get_version() << dendl;
2128 } else {
2129 dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version()
2130 << " < " << cmapv << dendl;
2131 // open client sessions?
2132 map<client_t,entity_inst_t> cm;
2133 bufferlist::iterator blp = client_map.begin();
2134 ::decode(cm, blp);
2135 mds->sessionmap.open_sessions(cm);
2136
2137 assert(mds->sessionmap.get_version() == cmapv);
2138 mds->sessionmap.set_projected(mds->sessionmap.get_version());
2139 }
2140 }
2141 update_segment();
2142 }
2143
2144
2145 // ------------------------
2146 // EOpen
2147
2148 void EOpen::encode(bufferlist &bl, uint64_t features) const {
2149 ENCODE_START(4, 3, bl);
2150 ::encode(stamp, bl);
2151 ::encode(metablob, bl, features);
2152 ::encode(inos, bl);
2153 ::encode(snap_inos, bl);
2154 ENCODE_FINISH(bl);
2155 }
2156
2157 void EOpen::decode(bufferlist::iterator &bl) {
2158 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2159 if (struct_v >= 2)
2160 ::decode(stamp, bl);
2161 ::decode(metablob, bl);
2162 ::decode(inos, bl);
2163 if (struct_v >= 4)
2164 ::decode(snap_inos, bl);
2165 DECODE_FINISH(bl);
2166 }
2167
2168 void EOpen::dump(Formatter *f) const
2169 {
2170 f->open_object_section("metablob");
2171 metablob.dump(f);
2172 f->close_section(); // metablob
2173 f->open_array_section("inos involved");
2174 for (vector<inodeno_t>::const_iterator i = inos.begin();
2175 i != inos.end(); ++i) {
2176 f->dump_int("ino", *i);
2177 }
2178 f->close_section(); // inos
2179 }
2180
2181 void EOpen::generate_test_instances(list<EOpen*>& ls)
2182 {
2183 ls.push_back(new EOpen());
2184 ls.push_back(new EOpen());
2185 ls.back()->add_ino(0);
2186 }
2187
2188 void EOpen::update_segment()
2189 {
2190 // ??
2191 }
2192
2193 void EOpen::replay(MDSRank *mds)
2194 {
2195 dout(10) << "EOpen.replay " << dendl;
2196 metablob.replay(mds, _segment);
2197
2198 // note which segments inodes belong to, so we don't have to start rejournaling them
2199 for (const auto &ino : inos) {
2200 CInode *in = mds->mdcache->get_inode(ino);
2201 if (!in) {
2202 dout(0) << "EOpen.replay ino " << ino << " not in metablob" << dendl;
2203 assert(in);
2204 }
2205 _segment->open_files.push_back(&in->item_open_file);
2206 }
2207 for (const auto &vino : snap_inos) {
2208 CInode *in = mds->mdcache->get_inode(vino);
2209 if (!in) {
2210 dout(0) << "EOpen.replay ino " << vino << " not in metablob" << dendl;
2211 assert(in);
2212 }
2213 _segment->open_files.push_back(&in->item_open_file);
2214 }
2215 }
2216
2217
2218 // -----------------------
2219 // ECommitted
2220
2221 void ECommitted::replay(MDSRank *mds)
2222 {
2223 if (mds->mdcache->uncommitted_masters.count(reqid)) {
2224 dout(10) << "ECommitted.replay " << reqid << dendl;
2225 mds->mdcache->uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2226 mds->mdcache->uncommitted_masters.erase(reqid);
2227 } else {
2228 dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl;
2229 }
2230 }
2231
2232 void ECommitted::encode(bufferlist& bl, uint64_t features) const
2233 {
2234 ENCODE_START(3, 3, bl);
2235 ::encode(stamp, bl);
2236 ::encode(reqid, bl);
2237 ENCODE_FINISH(bl);
2238 }
2239
2240 void ECommitted::decode(bufferlist::iterator& bl)
2241 {
2242 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2243 if (struct_v >= 2)
2244 ::decode(stamp, bl);
2245 ::decode(reqid, bl);
2246 DECODE_FINISH(bl);
2247 }
2248
2249 void ECommitted::dump(Formatter *f) const {
2250 f->dump_stream("stamp") << stamp;
2251 f->dump_stream("reqid") << reqid;
2252 }
2253
2254 void ECommitted::generate_test_instances(list<ECommitted*>& ls)
2255 {
2256 ls.push_back(new ECommitted);
2257 ls.push_back(new ECommitted);
2258 ls.back()->stamp = utime_t(1, 2);
2259 ls.back()->reqid = metareqid_t(entity_name_t::CLIENT(123), 456);
2260 }
2261
2262 // -----------------------
2263 // ESlaveUpdate
2264
2265 void link_rollback::encode(bufferlist &bl) const
2266 {
2267 ENCODE_START(2, 2, bl);
2268 ::encode(reqid, bl);
2269 ::encode(ino, bl);
2270 ::encode(was_inc, bl);
2271 ::encode(old_ctime, bl);
2272 ::encode(old_dir_mtime, bl);
2273 ::encode(old_dir_rctime, bl);
2274 ENCODE_FINISH(bl);
2275 }
2276
2277 void link_rollback::decode(bufferlist::iterator &bl)
2278 {
2279 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2280 ::decode(reqid, bl);
2281 ::decode(ino, bl);
2282 ::decode(was_inc, bl);
2283 ::decode(old_ctime, bl);
2284 ::decode(old_dir_mtime, bl);
2285 ::decode(old_dir_rctime, bl);
2286 DECODE_FINISH(bl);
2287 }
2288
2289 void link_rollback::dump(Formatter *f) const
2290 {
2291 f->dump_stream("metareqid") << reqid;
2292 f->dump_int("ino", ino);
2293 f->dump_string("was incremented", was_inc ? "true" : "false");
2294 f->dump_stream("old_ctime") << old_ctime;
2295 f->dump_stream("old_dir_mtime") << old_dir_mtime;
2296 f->dump_stream("old_dir_rctime") << old_dir_rctime;
2297 }
2298
2299 void link_rollback::generate_test_instances(list<link_rollback*>& ls)
2300 {
2301 ls.push_back(new link_rollback());
2302 }
2303
2304 void rmdir_rollback::encode(bufferlist& bl) const
2305 {
2306 ENCODE_START(2, 2, bl);
2307 ::encode(reqid, bl);
2308 ::encode(src_dir, bl);
2309 ::encode(src_dname, bl);
2310 ::encode(dest_dir, bl);
2311 ::encode(dest_dname, bl);
2312 ENCODE_FINISH(bl);
2313 }
2314
2315 void rmdir_rollback::decode(bufferlist::iterator& bl)
2316 {
2317 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2318 ::decode(reqid, bl);
2319 ::decode(src_dir, bl);
2320 ::decode(src_dname, bl);
2321 ::decode(dest_dir, bl);
2322 ::decode(dest_dname, bl);
2323 DECODE_FINISH(bl);
2324 }
2325
2326 void rmdir_rollback::dump(Formatter *f) const
2327 {
2328 f->dump_stream("metareqid") << reqid;
2329 f->dump_stream("source directory") << src_dir;
2330 f->dump_string("source dname", src_dname);
2331 f->dump_stream("destination directory") << dest_dir;
2332 f->dump_string("destination dname", dest_dname);
2333 }
2334
2335 void rmdir_rollback::generate_test_instances(list<rmdir_rollback*>& ls)
2336 {
2337 ls.push_back(new rmdir_rollback());
2338 }
2339
2340 void rename_rollback::drec::encode(bufferlist &bl) const
2341 {
2342 ENCODE_START(2, 2, bl);
2343 ::encode(dirfrag, bl);
2344 ::encode(dirfrag_old_mtime, bl);
2345 ::encode(dirfrag_old_rctime, bl);
2346 ::encode(ino, bl);
2347 ::encode(remote_ino, bl);
2348 ::encode(dname, bl);
2349 ::encode(remote_d_type, bl);
2350 ::encode(old_ctime, bl);
2351 ENCODE_FINISH(bl);
2352 }
2353
2354 void rename_rollback::drec::decode(bufferlist::iterator &bl)
2355 {
2356 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2357 ::decode(dirfrag, bl);
2358 ::decode(dirfrag_old_mtime, bl);
2359 ::decode(dirfrag_old_rctime, bl);
2360 ::decode(ino, bl);
2361 ::decode(remote_ino, bl);
2362 ::decode(dname, bl);
2363 ::decode(remote_d_type, bl);
2364 ::decode(old_ctime, bl);
2365 DECODE_FINISH(bl);
2366 }
2367
2368 void rename_rollback::drec::dump(Formatter *f) const
2369 {
2370 f->dump_stream("directory fragment") << dirfrag;
2371 f->dump_stream("directory old mtime") << dirfrag_old_mtime;
2372 f->dump_stream("directory old rctime") << dirfrag_old_rctime;
2373 f->dump_int("ino", ino);
2374 f->dump_int("remote ino", remote_ino);
2375 f->dump_string("dname", dname);
2376 uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries
2377 string type_string;
2378 switch(type) {
2379 case S_IFREG:
2380 type_string = "file"; break;
2381 case S_IFLNK:
2382 type_string = "symlink"; break;
2383 case S_IFDIR:
2384 type_string = "directory"; break;
2385 default:
2386 type_string = "UNKNOWN-" + stringify((int)type); break;
2387 }
2388 f->dump_string("remote dtype", type_string);
2389 f->dump_stream("old ctime") << old_ctime;
2390 }
2391
2392 void rename_rollback::drec::generate_test_instances(list<drec*>& ls)
2393 {
2394 ls.push_back(new drec());
2395 ls.back()->remote_d_type = IFTODT(S_IFREG);
2396 }
2397
2398 void rename_rollback::encode(bufferlist &bl) const
2399 {
2400 ENCODE_START(2, 2, bl);
2401 ::encode(reqid, bl);
2402 encode(orig_src, bl);
2403 encode(orig_dest, bl);
2404 encode(stray, bl);
2405 ::encode(ctime, bl);
2406 ENCODE_FINISH(bl);
2407 }
2408
2409 void rename_rollback::decode(bufferlist::iterator &bl)
2410 {
2411 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2412 ::decode(reqid, bl);
2413 decode(orig_src, bl);
2414 decode(orig_dest, bl);
2415 decode(stray, bl);
2416 ::decode(ctime, bl);
2417 DECODE_FINISH(bl);
2418 }
2419
2420 void rename_rollback::dump(Formatter *f) const
2421 {
2422 f->dump_stream("request id") << reqid;
2423 f->open_object_section("original src drec");
2424 orig_src.dump(f);
2425 f->close_section(); // original src drec
2426 f->open_object_section("original dest drec");
2427 orig_dest.dump(f);
2428 f->close_section(); // original dest drec
2429 f->open_object_section("stray drec");
2430 stray.dump(f);
2431 f->close_section(); // stray drec
2432 f->dump_stream("ctime") << ctime;
2433 }
2434
2435 void rename_rollback::generate_test_instances(list<rename_rollback*>& ls)
2436 {
2437 ls.push_back(new rename_rollback());
2438 ls.back()->orig_src.remote_d_type = IFTODT(S_IFREG);
2439 ls.back()->orig_dest.remote_d_type = IFTODT(S_IFREG);
2440 ls.back()->stray.remote_d_type = IFTODT(S_IFREG);
2441 }
2442
2443 void ESlaveUpdate::encode(bufferlist &bl, uint64_t features) const
2444 {
2445 ENCODE_START(3, 3, bl);
2446 ::encode(stamp, bl);
2447 ::encode(type, bl);
2448 ::encode(reqid, bl);
2449 ::encode(master, bl);
2450 ::encode(op, bl);
2451 ::encode(origop, bl);
2452 ::encode(commit, bl, features);
2453 ::encode(rollback, bl);
2454 ENCODE_FINISH(bl);
2455 }
2456
2457 void ESlaveUpdate::decode(bufferlist::iterator &bl)
2458 {
2459 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2460 if (struct_v >= 2)
2461 ::decode(stamp, bl);
2462 ::decode(type, bl);
2463 ::decode(reqid, bl);
2464 ::decode(master, bl);
2465 ::decode(op, bl);
2466 ::decode(origop, bl);
2467 ::decode(commit, bl);
2468 ::decode(rollback, bl);
2469 DECODE_FINISH(bl);
2470 }
2471
2472 void ESlaveUpdate::dump(Formatter *f) const
2473 {
2474 f->open_object_section("metablob");
2475 commit.dump(f);
2476 f->close_section(); // metablob
2477
2478 f->dump_int("rollback length", rollback.length());
2479 f->dump_string("type", type);
2480 f->dump_stream("metareqid") << reqid;
2481 f->dump_int("master", master);
2482 f->dump_int("op", op);
2483 f->dump_int("original op", origop);
2484 }
2485
2486 void ESlaveUpdate::generate_test_instances(list<ESlaveUpdate*>& ls)
2487 {
2488 ls.push_back(new ESlaveUpdate());
2489 }
2490
2491
2492 void ESlaveUpdate::replay(MDSRank *mds)
2493 {
2494 MDSlaveUpdate *su;
2495 switch (op) {
2496 case ESlaveUpdate::OP_PREPARE:
2497 dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master
2498 << ": applying commit, saving rollback info" << dendl;
2499 su = new MDSlaveUpdate(origop, rollback, _segment->slave_updates);
2500 commit.replay(mds, _segment, su);
2501 mds->mdcache->add_uncommitted_slave_update(reqid, master, su);
2502 break;
2503
2504 case ESlaveUpdate::OP_COMMIT:
2505 su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
2506 if (su) {
2507 dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl;
2508 mds->mdcache->finish_uncommitted_slave_update(reqid, master);
2509 } else {
2510 dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master
2511 << ": ignoring, no previously saved prepare" << dendl;
2512 }
2513 break;
2514
2515 case ESlaveUpdate::OP_ROLLBACK:
2516 dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
2517 << ": applying rollback commit blob" << dendl;
2518 commit.replay(mds, _segment);
2519 su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
2520 if (su)
2521 mds->mdcache->finish_uncommitted_slave_update(reqid, master);
2522 break;
2523
2524 default:
2525 mds->clog->error() << "invalid op in ESlaveUpdate";
2526 mds->damaged();
2527 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2528 }
2529 }
2530
2531
2532 // -----------------------
2533 // ESubtreeMap
2534
2535 void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const
2536 {
2537 ENCODE_START(6, 5, bl);
2538 ::encode(stamp, bl);
2539 ::encode(metablob, bl, features);
2540 ::encode(subtrees, bl);
2541 ::encode(ambiguous_subtrees, bl);
2542 ::encode(expire_pos, bl);
2543 ::encode(event_seq, bl);
2544 ENCODE_FINISH(bl);
2545 }
2546
2547 void ESubtreeMap::decode(bufferlist::iterator &bl)
2548 {
2549 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2550 if (struct_v >= 2)
2551 ::decode(stamp, bl);
2552 ::decode(metablob, bl);
2553 ::decode(subtrees, bl);
2554 if (struct_v >= 4)
2555 ::decode(ambiguous_subtrees, bl);
2556 if (struct_v >= 3)
2557 ::decode(expire_pos, bl);
2558 if (struct_v >= 6)
2559 ::decode(event_seq, bl);
2560 DECODE_FINISH(bl);
2561 }
2562
2563 void ESubtreeMap::dump(Formatter *f) const
2564 {
2565 f->open_object_section("metablob");
2566 metablob.dump(f);
2567 f->close_section(); // metablob
2568
2569 f->open_array_section("subtrees");
2570 for(map<dirfrag_t,vector<dirfrag_t> >::const_iterator i = subtrees.begin();
2571 i != subtrees.end(); ++i) {
2572 f->open_object_section("tree");
2573 f->dump_stream("root dirfrag") << i->first;
2574 for (vector<dirfrag_t>::const_iterator j = i->second.begin();
2575 j != i->second.end(); ++j) {
2576 f->dump_stream("bound dirfrag") << *j;
2577 }
2578 f->close_section(); // tree
2579 }
2580 f->close_section(); // subtrees
2581
2582 f->open_array_section("ambiguous subtrees");
2583 for(set<dirfrag_t>::const_iterator i = ambiguous_subtrees.begin();
2584 i != ambiguous_subtrees.end(); ++i) {
2585 f->dump_stream("dirfrag") << *i;
2586 }
2587 f->close_section(); // ambiguous subtrees
2588
2589 f->dump_int("expire position", expire_pos);
2590 }
2591
2592 void ESubtreeMap::generate_test_instances(list<ESubtreeMap*>& ls)
2593 {
2594 ls.push_back(new ESubtreeMap());
2595 }
2596
2597 void ESubtreeMap::replay(MDSRank *mds)
2598 {
2599 if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos())
2600 mds->mdlog->journaler->set_expire_pos(expire_pos);
2601
2602 // suck up the subtree map?
2603 if (mds->mdcache->is_subtrees()) {
2604 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
2605 int errors = 0;
2606
2607 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2608 p != subtrees.end();
2609 ++p) {
2610 CDir *dir = mds->mdcache->get_dirfrag(p->first);
2611 if (!dir) {
2612 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2613 << " subtree root " << p->first << " not in cache";
2614 ++errors;
2615 continue;
2616 }
2617
2618 if (!mds->mdcache->is_subtree(dir)) {
2619 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2620 << " subtree root " << p->first << " not a subtree in cache";
2621 ++errors;
2622 continue;
2623 }
2624 if (dir->get_dir_auth().first != mds->get_nodeid()) {
2625 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2626 << " subtree root " << p->first
2627 << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
2628 ++errors;
2629 continue;
2630 }
2631
2632 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
2633 mds->mdcache->get_force_dirfrag(*q, true);
2634
2635 set<CDir*> bounds;
2636 mds->mdcache->get_subtree_bounds(dir, bounds);
2637 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2638 CDir *b = mds->mdcache->get_dirfrag(*q);
2639 if (!b) {
2640 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2641 << " subtree " << p->first << " bound " << *q << " not in cache";
2642 ++errors;
2643 continue;
2644 }
2645 if (bounds.count(b) == 0) {
2646 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2647 << " subtree " << p->first << " bound " << *q << " not a bound in cache";
2648 ++errors;
2649 continue;
2650 }
2651 bounds.erase(b);
2652 }
2653 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) {
2654 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2655 << " subtree " << p->first << " has extra bound in cache " << (*q)->dirfrag();
2656 ++errors;
2657 }
2658
2659 if (ambiguous_subtrees.count(p->first)) {
2660 if (!mds->mdcache->have_ambiguous_import(p->first)) {
2661 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2662 << " subtree " << p->first << " is ambiguous but is not in our cache";
2663 ++errors;
2664 }
2665 } else {
2666 if (mds->mdcache->have_ambiguous_import(p->first)) {
2667 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2668 << " subtree " << p->first << " is not ambiguous but is in our cache";
2669 ++errors;
2670 }
2671 }
2672 }
2673
2674 list<CDir*> subs;
2675 mds->mdcache->list_subtrees(subs);
2676 for (list<CDir*>::iterator p = subs.begin(); p != subs.end(); ++p) {
2677 CDir *dir = *p;
2678 if (dir->get_dir_auth().first != mds->get_nodeid())
2679 continue;
2680 if (subtrees.count(dir->dirfrag()) == 0) {
2681 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2682 << " does not include cache subtree " << dir->dirfrag();
2683 ++errors;
2684 }
2685 }
2686
2687 if (errors) {
2688 dout(0) << "journal subtrees: " << subtrees << dendl;
2689 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
2690 mds->mdcache->show_subtrees();
2691 assert(!g_conf->mds_debug_subtrees || errors == 0);
2692 }
2693 return;
2694 }
2695
2696 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;
2697
2698 // first, stick the spanning tree in my cache
2699 //metablob.print(*_dout);
2700 metablob.replay(mds, _segment);
2701
2702 // restore import/export maps
2703 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2704 p != subtrees.end();
2705 ++p) {
2706 CDir *dir = mds->mdcache->get_dirfrag(p->first);
2707 assert(dir);
2708 if (ambiguous_subtrees.count(p->first)) {
2709 // ambiguous!
2710 mds->mdcache->add_ambiguous_import(p->first, p->second);
2711 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second,
2712 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2713 } else {
2714 // not ambiguous
2715 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
2716 }
2717 }
2718
2719 mds->mdcache->recalc_auth_bits(true);
2720
2721 mds->mdcache->show_subtrees();
2722 }
2723
2724
2725
2726 // -----------------------
2727 // EFragment
2728
2729 void EFragment::replay(MDSRank *mds)
2730 {
2731 dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;
2732
2733 list<CDir*> resultfrags;
2734 list<MDSInternalContextBase*> waiters;
2735 list<frag_t> old_frags;
2736
2737 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2738 // it will be once we replay the metablob , but first we need to
2739 // refragment anything we already have in the cache.
2740 CInode *in = mds->mdcache->get_inode(ino);
2741
2742 switch (op) {
2743 case OP_PREPARE:
2744 mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, _segment, &rollback);
2745
2746 if (in)
2747 mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters, true);
2748 break;
2749
2750 case OP_ROLLBACK:
2751 if (in) {
2752 in->dirfragtree.get_leaves_under(basefrag, old_frags);
2753 if (orig_frags.empty()) {
2754 // old format EFragment
2755 mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true);
2756 } else {
2757 for (list<frag_t>::iterator p = orig_frags.begin(); p != orig_frags.end(); ++p)
2758 mds->mdcache->force_dir_fragment(in, *p);
2759 }
2760 }
2761 mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), old_frags);
2762 break;
2763
2764 case OP_COMMIT:
2765 case OP_FINISH:
2766 mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
2767 break;
2768
2769 default:
2770 ceph_abort();
2771 }
2772
2773 metablob.replay(mds, _segment);
2774 if (in && g_conf->mds_debug_frag)
2775 in->verify_dirfrags();
2776 }
2777
2778 void EFragment::encode(bufferlist &bl, uint64_t features) const {
2779 ENCODE_START(5, 4, bl);
2780 ::encode(stamp, bl);
2781 ::encode(op, bl);
2782 ::encode(ino, bl);
2783 ::encode(basefrag, bl);
2784 ::encode(bits, bl);
2785 ::encode(metablob, bl, features);
2786 ::encode(orig_frags, bl);
2787 ::encode(rollback, bl);
2788 ENCODE_FINISH(bl);
2789 }
2790
2791 void EFragment::decode(bufferlist::iterator &bl) {
2792 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
2793 if (struct_v >= 2)
2794 ::decode(stamp, bl);
2795 if (struct_v >= 3)
2796 ::decode(op, bl);
2797 ::decode(ino, bl);
2798 ::decode(basefrag, bl);
2799 ::decode(bits, bl);
2800 ::decode(metablob, bl);
2801 if (struct_v >= 5) {
2802 ::decode(orig_frags, bl);
2803 ::decode(rollback, bl);
2804 }
2805 DECODE_FINISH(bl);
2806 }
2807
2808 void EFragment::dump(Formatter *f) const
2809 {
2810 /*f->open_object_section("Metablob");
2811 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2812 f->close_section();*/
2813 f->dump_string("op", op_name(op));
2814 f->dump_stream("ino") << ino;
2815 f->dump_stream("base frag") << basefrag;
2816 f->dump_int("bits", bits);
2817 }
2818
2819 void EFragment::generate_test_instances(list<EFragment*>& ls)
2820 {
2821 ls.push_back(new EFragment);
2822 ls.push_back(new EFragment);
2823 ls.back()->op = OP_PREPARE;
2824 ls.back()->ino = 1;
2825 ls.back()->bits = 5;
2826 }
2827
2828 void dirfrag_rollback::encode(bufferlist &bl) const
2829 {
2830 ENCODE_START(1, 1, bl);
2831 ::encode(fnode, bl);
2832 ENCODE_FINISH(bl);
2833 }
2834
2835 void dirfrag_rollback::decode(bufferlist::iterator &bl)
2836 {
2837 DECODE_START(1, bl);
2838 ::decode(fnode, bl);
2839 DECODE_FINISH(bl);
2840 }
2841
2842
2843
2844 // =========================================================================
2845
2846 // -----------------------
2847 // EExport
2848
2849 void EExport::replay(MDSRank *mds)
2850 {
2851 dout(10) << "EExport.replay " << base << dendl;
2852 metablob.replay(mds, _segment);
2853
2854 CDir *dir = mds->mdcache->get_dirfrag(base);
2855 assert(dir);
2856
2857 set<CDir*> realbounds;
2858 for (set<dirfrag_t>::iterator p = bounds.begin();
2859 p != bounds.end();
2860 ++p) {
2861 CDir *bd = mds->mdcache->get_dirfrag(*p);
2862 assert(bd);
2863 realbounds.insert(bd);
2864 }
2865
2866 // adjust auth away
2867 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF);
2868
2869 mds->mdcache->try_trim_non_auth_subtree(dir);
2870 }
2871
2872 void EExport::encode(bufferlist& bl, uint64_t features) const
2873 {
2874 ENCODE_START(4, 3, bl);
2875 ::encode(stamp, bl);
2876 ::encode(metablob, bl, features);
2877 ::encode(base, bl);
2878 ::encode(bounds, bl);
2879 ::encode(target, bl);
2880 ENCODE_FINISH(bl);
2881 }
2882
2883 void EExport::decode(bufferlist::iterator &bl)
2884 {
2885 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2886 if (struct_v >= 2)
2887 ::decode(stamp, bl);
2888 ::decode(metablob, bl);
2889 ::decode(base, bl);
2890 ::decode(bounds, bl);
2891 if (struct_v >= 4)
2892 ::decode(target, bl);
2893 DECODE_FINISH(bl);
2894 }
2895
2896 void EExport::dump(Formatter *f) const
2897 {
2898 f->dump_float("stamp", (double)stamp);
2899 /*f->open_object_section("Metablob");
2900 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2901 f->close_section();*/
2902 f->dump_stream("base dirfrag") << base;
2903 f->open_array_section("bounds dirfrags");
2904 for (set<dirfrag_t>::const_iterator i = bounds.begin();
2905 i != bounds.end(); ++i) {
2906 f->dump_stream("dirfrag") << *i;
2907 }
2908 f->close_section(); // bounds dirfrags
2909 }
2910
2911 void EExport::generate_test_instances(list<EExport*>& ls)
2912 {
2913 EExport *sample = new EExport();
2914 ls.push_back(sample);
2915 }
2916
2917
2918 // -----------------------
2919 // EImportStart
2920
2921 void EImportStart::update_segment()
2922 {
2923 _segment->sessionmapv = cmapv;
2924 }
2925
2926 void EImportStart::replay(MDSRank *mds)
2927 {
2928 dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
2929 //metablob.print(*_dout);
2930 metablob.replay(mds, _segment);
2931
2932 // put in ambiguous import list
2933 mds->mdcache->add_ambiguous_import(base, bounds);
2934
2935 // set auth partially to us so we don't trim it
2936 CDir *dir = mds->mdcache->get_dirfrag(base);
2937 assert(dir);
2938
2939 set<CDir*> realbounds;
2940 for (vector<dirfrag_t>::iterator p = bounds.begin();
2941 p != bounds.end();
2942 ++p) {
2943 CDir *bd = mds->mdcache->get_dirfrag(*p);
2944 assert(bd);
2945 if (!bd->is_subtree_root())
2946 bd->state_clear(CDir::STATE_AUTH);
2947 realbounds.insert(bd);
2948 }
2949
2950 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds,
2951 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2952
2953 // open client sessions?
2954 if (mds->sessionmap.get_version() >= cmapv) {
2955 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
2956 << " >= " << cmapv << ", noop" << dendl;
2957 } else {
2958 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
2959 << " < " << cmapv << dendl;
2960 map<client_t,entity_inst_t> cm;
2961 bufferlist::iterator blp = client_map.begin();
2962 ::decode(cm, blp);
2963 mds->sessionmap.open_sessions(cm);
2964 if (mds->sessionmap.get_version() != cmapv)
2965 {
2966 derr << "sessionmap version " << mds->sessionmap.get_version()
2967 << " != cmapv " << cmapv << dendl;
2968 mds->clog->error() << "failure replaying journal (EImportStart)";
2969 mds->damaged();
2970 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2971 }
2972 mds->sessionmap.set_projected(mds->sessionmap.get_version());
2973 }
2974 update_segment();
2975 }
2976
2977 void EImportStart::encode(bufferlist &bl, uint64_t features) const {
2978 ENCODE_START(4, 3, bl);
2979 ::encode(stamp, bl);
2980 ::encode(base, bl);
2981 ::encode(metablob, bl, features);
2982 ::encode(bounds, bl);
2983 ::encode(cmapv, bl);
2984 ::encode(client_map, bl);
2985 ::encode(from, bl);
2986 ENCODE_FINISH(bl);
2987 }
2988
2989 void EImportStart::decode(bufferlist::iterator &bl) {
2990 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2991 if (struct_v >= 2)
2992 ::decode(stamp, bl);
2993 ::decode(base, bl);
2994 ::decode(metablob, bl);
2995 ::decode(bounds, bl);
2996 ::decode(cmapv, bl);
2997 ::decode(client_map, bl);
2998 if (struct_v >= 4)
2999 ::decode(from, bl);
3000 DECODE_FINISH(bl);
3001 }
3002
3003 void EImportStart::dump(Formatter *f) const
3004 {
3005 f->dump_stream("base dirfrag") << base;
3006 f->open_array_section("boundary dirfrags");
3007 for (vector<dirfrag_t>::const_iterator iter = bounds.begin();
3008 iter != bounds.end(); ++iter) {
3009 f->dump_stream("frag") << *iter;
3010 }
3011 f->close_section();
3012 }
3013
3014 void EImportStart::generate_test_instances(list<EImportStart*>& ls)
3015 {
3016 ls.push_back(new EImportStart);
3017 }
3018
3019 // -----------------------
3020 // EImportFinish
3021
3022 void EImportFinish::replay(MDSRank *mds)
3023 {
3024 if (mds->mdcache->have_ambiguous_import(base)) {
3025 dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
3026 if (success) {
3027 mds->mdcache->finish_ambiguous_import(base);
3028 } else {
3029 CDir *dir = mds->mdcache->get_dirfrag(base);
3030 assert(dir);
3031 vector<dirfrag_t> bounds;
3032 mds->mdcache->get_ambiguous_import_bounds(base, bounds);
3033 mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF);
3034 mds->mdcache->cancel_ambiguous_import(dir);
3035 mds->mdcache->try_trim_non_auth_subtree(dir);
3036 }
3037 } else {
3038 // this shouldn't happen unless this is an old journal
3039 dout(10) << "EImportFinish.replay " << base << " success=" << success
3040 << " on subtree not marked as ambiguous"
3041 << dendl;
3042 mds->clog->error() << "failure replaying journal (EImportFinish)";
3043 mds->damaged();
3044 ceph_abort(); // Should be unreachable because damaged() calls respawn()
3045 }
3046 }
3047
3048 void EImportFinish::encode(bufferlist& bl, uint64_t features) const
3049 {
3050 ENCODE_START(3, 3, bl);
3051 ::encode(stamp, bl);
3052 ::encode(base, bl);
3053 ::encode(success, bl);
3054 ENCODE_FINISH(bl);
3055 }
3056
3057 void EImportFinish::decode(bufferlist::iterator &bl)
3058 {
3059 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3060 if (struct_v >= 2)
3061 ::decode(stamp, bl);
3062 ::decode(base, bl);
3063 ::decode(success, bl);
3064 DECODE_FINISH(bl);
3065 }
3066
3067 void EImportFinish::dump(Formatter *f) const
3068 {
3069 f->dump_stream("base dirfrag") << base;
3070 f->dump_string("success", success ? "true" : "false");
3071 }
3072 void EImportFinish::generate_test_instances(list<EImportFinish*>& ls)
3073 {
3074 ls.push_back(new EImportFinish);
3075 ls.push_back(new EImportFinish);
3076 ls.back()->success = true;
3077 }
3078
3079
3080 // ------------------------
3081 // EResetJournal
3082
3083 void EResetJournal::encode(bufferlist& bl, uint64_t features) const
3084 {
3085 ENCODE_START(2, 2, bl);
3086 ::encode(stamp, bl);
3087 ENCODE_FINISH(bl);
3088 }
3089
3090 void EResetJournal::decode(bufferlist::iterator &bl)
3091 {
3092 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
3093 ::decode(stamp, bl);
3094 DECODE_FINISH(bl);
3095 }
3096
3097 void EResetJournal::dump(Formatter *f) const
3098 {
3099 f->dump_stream("timestamp") << stamp;
3100 }
3101
3102 void EResetJournal::generate_test_instances(list<EResetJournal*>& ls)
3103 {
3104 ls.push_back(new EResetJournal());
3105 }
3106
3107 void EResetJournal::replay(MDSRank *mds)
3108 {
3109 dout(1) << "EResetJournal" << dendl;
3110
3111 mds->sessionmap.wipe();
3112 mds->inotable->replay_reset();
3113
3114 if (mds->mdsmap->get_root() == mds->get_nodeid()) {
3115 CDir *rootdir = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t());
3116 mds->mdcache->adjust_subtree_auth(rootdir, mds->get_nodeid());
3117 }
3118
3119 CDir *mydir = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t());
3120 mds->mdcache->adjust_subtree_auth(mydir, mds->get_nodeid());
3121
3122 mds->mdcache->recalc_auth_bits(true);
3123
3124 mds->mdcache->show_subtrees();
3125 }
3126
3127
3128 void ENoOp::encode(bufferlist &bl, uint64_t features) const
3129 {
3130 ENCODE_START(2, 2, bl);
3131 ::encode(pad_size, bl);
3132 uint8_t const pad = 0xff;
3133 for (unsigned int i = 0; i < pad_size; ++i) {
3134 ::encode(pad, bl);
3135 }
3136 ENCODE_FINISH(bl);
3137 }
3138
3139
3140 void ENoOp::decode(bufferlist::iterator &bl)
3141 {
3142 DECODE_START(2, bl);
3143 ::decode(pad_size, bl);
3144 if (bl.get_remaining() != pad_size) {
3145 // This is spiritually an assertion, but expressing in a way that will let
3146 // journal debug tools catch it and recognise a malformed entry.
3147 throw buffer::end_of_buffer();
3148 } else {
3149 bl.advance(pad_size);
3150 }
3151 DECODE_FINISH(bl);
3152 }
3153
3154
3155 void ENoOp::replay(MDSRank *mds)
3156 {
3157 dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
3158 }
3159
3160 /**
3161 * If re-formatting an old journal that used absolute log position
3162 * references as segment sequence numbers, use this function to update
3163 * it.
3164 *
3165 * @param mds
3166 * MDSRank instance, just used for logging
3167 * @param old_to_new
3168 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3169 *
3170 * @return
3171 * True if the event was modified.
3172 */
3173 bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
3174 std::map<log_segment_seq_t, log_segment_seq_t> const &old_to_new)
3175 {
3176 bool modified = false;
3177 map<inodeno_t, log_segment_seq_t> new_trunc_finish;
3178 for (std::map<inodeno_t, log_segment_seq_t>::iterator i = truncate_finish.begin();
3179 i != truncate_finish.end(); ++i) {
3180 if (old_to_new.count(i->second)) {
3181 dout(20) << __func__ << " applying segment seq mapping "
3182 << i->second << " -> " << old_to_new.find(i->second)->second << dendl;
3183 new_trunc_finish[i->first] = old_to_new.find(i->second)->second;
3184 modified = true;
3185 } else {
3186 dout(20) << __func__ << " no segment seq mapping found for "
3187 << i->second << dendl;
3188 new_trunc_finish[i->first] = i->second;
3189 }
3190 }
3191 truncate_finish = new_trunc_finish;
3192
3193 return modified;
3194 }