]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/journal.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / mds / journal.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "common/config.h"
16 #include "osdc/Journaler.h"
17 #include "events/ESubtreeMap.h"
18 #include "events/ESession.h"
19 #include "events/ESessions.h"
20
21 #include "events/EMetaBlob.h"
22 #include "events/EResetJournal.h"
23 #include "events/ENoOp.h"
24
25 #include "events/EUpdate.h"
26 #include "events/ESlaveUpdate.h"
27 #include "events/EOpen.h"
28 #include "events/ECommitted.h"
29
30 #include "events/EExport.h"
31 #include "events/EImportStart.h"
32 #include "events/EImportFinish.h"
33 #include "events/EFragment.h"
34
35 #include "events/ETableClient.h"
36 #include "events/ETableServer.h"
37
38 #include "include/stringify.h"
39
40 #include "LogSegment.h"
41
42 #include "MDSRank.h"
43 #include "MDLog.h"
44 #include "MDCache.h"
45 #include "Server.h"
46 #include "Migrator.h"
47 #include "Mutation.h"
48
49 #include "InoTable.h"
50 #include "MDSTableClient.h"
51 #include "MDSTableServer.h"
52
53 #include "Locker.h"
54
55 #define dout_context g_ceph_context
56 #define dout_subsys ceph_subsys_mds
57 #undef dout_prefix
58 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
59
60
61 // -----------------------
62 // LogSegment
63
64 void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int op_prio)
65 {
66 set<CDir*> commit;
67
68 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire" << dendl;
69
70 assert(g_conf->mds_kill_journal_expire_at != 1);
71
72 // commit dirs
73 for (elist<CDir*>::iterator p = new_dirfrags.begin(); !p.end(); ++p) {
74 dout(20) << " new_dirfrag " << **p << dendl;
75 assert((*p)->is_auth());
76 commit.insert(*p);
77 }
78 for (elist<CDir*>::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) {
79 dout(20) << " dirty_dirfrag " << **p << dendl;
80 assert((*p)->is_auth());
81 commit.insert(*p);
82 }
83 for (elist<CDentry*>::iterator p = dirty_dentries.begin(); !p.end(); ++p) {
84 dout(20) << " dirty_dentry " << **p << dendl;
85 assert((*p)->is_auth());
86 commit.insert((*p)->get_dir());
87 }
88 for (elist<CInode*>::iterator p = dirty_inodes.begin(); !p.end(); ++p) {
89 dout(20) << " dirty_inode " << **p << dendl;
90 assert((*p)->is_auth());
91 if ((*p)->is_base()) {
92 (*p)->store(gather_bld.new_sub());
93 } else
94 commit.insert((*p)->get_parent_dn()->get_dir());
95 }
96
97 if (!commit.empty()) {
98 for (set<CDir*>::iterator p = commit.begin();
99 p != commit.end();
100 ++p) {
101 CDir *dir = *p;
102 assert(dir->is_auth());
103 if (dir->can_auth_pin()) {
104 dout(15) << "try_to_expire committing " << *dir << dendl;
105 dir->commit(0, gather_bld.new_sub(), false, op_prio);
106 } else {
107 dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl;
108 dir->add_waiter(CDir::WAIT_UNFREEZE, gather_bld.new_sub());
109 }
110 }
111 }
112
113 // master ops with possibly uncommitted slaves
114 for (set<metareqid_t>::iterator p = uncommitted_masters.begin();
115 p != uncommitted_masters.end();
116 ++p) {
117 dout(10) << "try_to_expire waiting for slaves to ack commit on " << *p << dendl;
118 mds->mdcache->wait_for_uncommitted_master(*p, gather_bld.new_sub());
119 }
120
121 // uncommitted fragments
122 for (set<dirfrag_t>::iterator p = uncommitted_fragments.begin();
123 p != uncommitted_fragments.end();
124 ++p) {
125 dout(10) << "try_to_expire waiting for uncommitted fragment " << *p << dendl;
126 mds->mdcache->wait_for_uncommitted_fragment(*p, gather_bld.new_sub());
127 }
128
129 // nudge scatterlocks
130 for (elist<CInode*>::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) {
131 CInode *in = *p;
132 dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl;
133 mds->locker->scatter_nudge(&in->filelock, gather_bld.new_sub());
134 }
135 for (elist<CInode*>::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) {
136 CInode *in = *p;
137 dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl;
138 mds->locker->scatter_nudge(&in->dirfragtreelock, gather_bld.new_sub());
139 }
140 for (elist<CInode*>::iterator p = dirty_dirfrag_nest.begin(); !p.end(); ++p) {
141 CInode *in = *p;
142 dout(10) << "try_to_expire waiting for nest flush on " << *in << dendl;
143 mds->locker->scatter_nudge(&in->nestlock, gather_bld.new_sub());
144 }
145
146 assert(g_conf->mds_kill_journal_expire_at != 2);
147
148 // open files and snap inodes
149 if (!open_files.empty()) {
150 assert(!mds->mdlog->is_capped()); // hmm FIXME
151 EOpen *le = 0;
152 LogSegment *ls = mds->mdlog->get_current_segment();
153 assert(ls != this);
154 elist<CInode*>::iterator p = open_files.begin(member_offset(CInode, item_open_file));
155 while (!p.end()) {
156 CInode *in = *p;
157 ++p;
158 if (in->last == CEPH_NOSNAP && in->is_auth() &&
159 !in->is_ambiguous_auth() && in->is_any_caps()) {
160 if (in->is_any_caps_wanted()) {
161 dout(20) << "try_to_expire requeueing open file " << *in << dendl;
162 if (!le) {
163 le = new EOpen(mds->mdlog);
164 mds->mdlog->start_entry(le);
165 }
166 le->add_clean_inode(in);
167 ls->open_files.push_back(&in->item_open_file);
168 } else {
169 // drop inodes that aren't wanted
170 dout(20) << "try_to_expire not requeueing and delisting unwanted file " << *in << dendl;
171 in->item_open_file.remove_myself();
172 }
173 } else if (in->last != CEPH_NOSNAP && !in->client_snap_caps.empty()) {
174 // journal snap inodes that need flush. This simplify the mds failover hanlding
175 dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
176 if (!le) {
177 le = new EOpen(mds->mdlog);
178 mds->mdlog->start_entry(le);
179 }
180 le->add_clean_inode(in);
181 ls->open_files.push_back(&in->item_open_file);
182 } else {
183 /*
184 * we can get a capless inode here if we replay an open file, the client fails to
185 * reconnect it, but does REPLAY an open request (that adds it to the logseg). AFAICS
186 * it's ok for the client to replay an open on a file it doesn't have in it's cache
187 * anymore.
188 *
189 * this makes the mds less sensitive to strict open_file consistency, although it does
190 * make it easier to miss subtle problems.
191 */
192 dout(20) << "try_to_expire not requeueing and delisting capless file " << *in << dendl;
193 in->item_open_file.remove_myself();
194 }
195 }
196 if (le) {
197 mds->mdlog->submit_entry(le);
198 mds->mdlog->wait_for_safe(gather_bld.new_sub());
199 dout(10) << "try_to_expire waiting for open files to rejournal" << dendl;
200 }
201 }
202
203 assert(g_conf->mds_kill_journal_expire_at != 3);
204
205 // backtraces to be stored/updated
206 for (elist<CInode*>::iterator p = dirty_parent_inodes.begin(); !p.end(); ++p) {
207 CInode *in = *p;
208 assert(in->is_auth());
209 if (in->can_auth_pin()) {
210 dout(15) << "try_to_expire waiting for storing backtrace on " << *in << dendl;
211 in->store_backtrace(gather_bld.new_sub(), op_prio);
212 } else {
213 dout(15) << "try_to_expire waiting for unfreeze on " << *in << dendl;
214 in->add_waiter(CInode::WAIT_UNFREEZE, gather_bld.new_sub());
215 }
216 }
217
218 assert(g_conf->mds_kill_journal_expire_at != 4);
219
220 // slave updates
221 for (elist<MDSlaveUpdate*>::iterator p = slave_updates.begin(member_offset(MDSlaveUpdate,
222 item));
223 !p.end(); ++p) {
224 MDSlaveUpdate *su = *p;
225 dout(10) << "try_to_expire waiting on slave update " << su << dendl;
226 assert(su->waiter == 0);
227 su->waiter = gather_bld.new_sub();
228 }
229
230 // idalloc
231 if (inotablev > mds->inotable->get_committed_version()) {
232 dout(10) << "try_to_expire saving inotable table, need " << inotablev
233 << ", committed is " << mds->inotable->get_committed_version()
234 << " (" << mds->inotable->get_committing_version() << ")"
235 << dendl;
236 mds->inotable->save(gather_bld.new_sub(), inotablev);
237 }
238
239 // sessionmap
240 if (sessionmapv > mds->sessionmap.get_committed()) {
241 dout(10) << "try_to_expire saving sessionmap, need " << sessionmapv
242 << ", committed is " << mds->sessionmap.get_committed()
243 << " (" << mds->sessionmap.get_committing() << ")"
244 << dendl;
245 mds->sessionmap.save(gather_bld.new_sub(), sessionmapv);
246 }
247
248 // updates to sessions for completed_requests
249 mds->sessionmap.save_if_dirty(touched_sessions, &gather_bld);
250 touched_sessions.clear();
251
252 // pending commit atids
253 for (map<int, ceph::unordered_set<version_t> >::iterator p = pending_commit_tids.begin();
254 p != pending_commit_tids.end();
255 ++p) {
256 MDSTableClient *client = mds->get_table_client(p->first);
257 assert(client);
258 for (ceph::unordered_set<version_t>::iterator q = p->second.begin();
259 q != p->second.end();
260 ++q) {
261 dout(10) << "try_to_expire " << get_mdstable_name(p->first) << " transaction " << *q
262 << " pending commit (not yet acked), waiting" << dendl;
263 assert(!client->has_committed(*q));
264 client->wait_for_ack(*q, gather_bld.new_sub());
265 }
266 }
267
268 // table servers
269 for (map<int, version_t>::iterator p = tablev.begin();
270 p != tablev.end();
271 ++p) {
272 MDSTableServer *server = mds->get_table_server(p->first);
273 assert(server);
274 if (p->second > server->get_committed_version()) {
275 dout(10) << "try_to_expire waiting for " << get_mdstable_name(p->first)
276 << " to save, need " << p->second << dendl;
277 server->save(gather_bld.new_sub());
278 }
279 }
280
281 // truncating
282 for (set<CInode*>::iterator p = truncating_inodes.begin();
283 p != truncating_inodes.end();
284 ++p) {
285 dout(10) << "try_to_expire waiting for truncate of " << **p << dendl;
286 (*p)->add_waiter(CInode::WAIT_TRUNC, gather_bld.new_sub());
287 }
288
289 if (gather_bld.has_subs()) {
290 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire waiting" << dendl;
291 mds->mdlog->flush();
292 } else {
293 assert(g_conf->mds_kill_journal_expire_at != 5);
294 dout(6) << "LogSegment(" << seq << "/" << offset << ").try_to_expire success" << dendl;
295 }
296 }
297
298
299 // -----------------------
300 // EMetaBlob
301
302 EMetaBlob::EMetaBlob(MDLog *mdlog) : opened_ino(0), renamed_dirino(0),
303 inotablev(0), sessionmapv(0), allocated_ino(0),
304 last_subtree_map(0), event_seq(0)
305 { }
306
307 void EMetaBlob::add_dir_context(CDir *dir, int mode)
308 {
309 MDSRank *mds = dir->cache->mds;
310
311 list<CDentry*> parents;
312
313 // it may be okay not to include the maybe items, if
314 // - we journaled the maybe child inode in this segment
315 // - that subtree turns out to be unambiguously auth
316 list<CDentry*> maybe;
317 bool maybenot = false;
318
319 while (true) {
320 // already have this dir? (we must always add in order)
321 if (lump_map.count(dir->dirfrag())) {
322 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") have lump " << dir->dirfrag() << dendl;
323 break;
324 }
325
326 // stop at root/stray
327 CInode *diri = dir->get_inode();
328 CDentry *parent = diri->get_projected_parent_dn();
329
330 if (mode == TO_AUTH_SUBTREE_ROOT) {
331 // subtree root?
332 if (dir->is_subtree_root() &&
333 !dir->state_test(CDir::STATE_EXPORTBOUND)) {
334 if (dir->is_auth() && !dir->is_ambiguous_auth() ) {
335 if (dir->state_test(CDir::STATE_AUXSUBTREE) &&
336 dir->get_dir_auth().first == diri->authority().first) {
337 // auxiliary subtree. treat it as normal dirfrag
338 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") auxiliary subtree " << dendl;
339 } else {
340 // it's an auth subtree, we don't need maybe (if any), and we're done.
341 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
342 << " at " << *dir << dendl;
343 maybe.clear();
344 break;
345 }
346 } else {
347 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
348 << " at " << *dir << dendl;
349 // we need the maybe list after all!
350 parents.splice(parents.begin(), maybe);
351 maybenot = false;
352 }
353 }
354
355 // was the inode journaled in this blob?
356 if (event_seq && diri->last_journaled == event_seq) {
357 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri this blob " << *diri << dendl;
358 break;
359 }
360
361 // have we journaled this inode since the last subtree map?
362 if (!maybenot && last_subtree_map && diri->last_journaled >= last_subtree_map) {
363 dout(20) << "EMetaBlob::add_dir_context(" << dir << ") already have diri in this segment ("
364 << diri->last_journaled << " >= " << last_subtree_map << "), setting maybenot flag "
365 << *diri << dendl;
366 maybenot = true;
367 }
368 }
369
370 if (!parent)
371 break;
372
373 if (maybenot) {
374 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") maybe " << *parent << dendl;
375 maybe.push_front(parent);
376 } else {
377 dout(25) << "EMetaBlob::add_dir_context(" << dir << ") definitely " << *parent << dendl;
378 parents.push_front(parent);
379 }
380
381 dir = parent->get_dir();
382 }
383
384 parents.splice(parents.begin(), maybe);
385
386 dout(20) << "EMetaBlob::add_dir_context final: " << parents << dendl;
387 for (list<CDentry*>::iterator p = parents.begin(); p != parents.end(); ++p) {
388 assert((*p)->get_projected_linkage()->is_primary());
389 add_dentry(*p, false);
390 }
391 }
392
393 void EMetaBlob::update_segment(LogSegment *ls)
394 {
395 // dirty inode mtimes
396 // -> handled directly by Server.cc, replay()
397
398 // alloc table update?
399 if (inotablev)
400 ls->inotablev = inotablev;
401 if (sessionmapv)
402 ls->sessionmapv = sessionmapv;
403
404 // truncated inodes
405 // -> handled directly by Server.cc
406
407 // client requests
408 // note the newest request per client
409 //if (!client_reqs.empty())
410 // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid);
411 }
412
413 // EMetaBlob::fullbit
414
415 void EMetaBlob::fullbit::encode(bufferlist& bl, uint64_t features) const {
416 ENCODE_START(8, 5, bl);
417 ::encode(dn, bl);
418 ::encode(dnfirst, bl);
419 ::encode(dnlast, bl);
420 ::encode(dnv, bl);
421 ::encode(inode, bl, features);
422 ::encode(xattrs, bl);
423 if (inode.is_symlink())
424 ::encode(symlink, bl);
425 if (inode.is_dir()) {
426 ::encode(dirfragtree, bl);
427 ::encode(snapbl, bl);
428 }
429 ::encode(state, bl);
430 if (old_inodes.empty()) {
431 ::encode(false, bl);
432 } else {
433 ::encode(true, bl);
434 ::encode(old_inodes, bl, features);
435 }
436 if (!inode.is_dir())
437 ::encode(snapbl, bl);
438 ::encode(oldest_snap, bl);
439 ENCODE_FINISH(bl);
440 }
441
442 void EMetaBlob::fullbit::decode(bufferlist::iterator &bl) {
443 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
444 ::decode(dn, bl);
445 ::decode(dnfirst, bl);
446 ::decode(dnlast, bl);
447 ::decode(dnv, bl);
448 ::decode(inode, bl);
449 ::decode(xattrs, bl);
450 if (inode.is_symlink())
451 ::decode(symlink, bl);
452 if (inode.is_dir()) {
453 ::decode(dirfragtree, bl);
454 ::decode(snapbl, bl);
455 if ((struct_v == 2) || (struct_v == 3)) {
456 bool dir_layout_exists;
457 ::decode(dir_layout_exists, bl);
458 if (dir_layout_exists) {
459 __u8 dir_struct_v;
460 ::decode(dir_struct_v, bl); // default_file_layout version
461 ::decode(inode.layout, bl); // and actual layout, that we care about
462 }
463 }
464 }
465 if (struct_v >= 6) {
466 ::decode(state, bl);
467 } else {
468 bool dirty;
469 ::decode(dirty, bl);
470 state = dirty ? EMetaBlob::fullbit::STATE_DIRTY : 0;
471 }
472
473 if (struct_v >= 3) {
474 bool old_inodes_present;
475 ::decode(old_inodes_present, bl);
476 if (old_inodes_present) {
477 ::decode(old_inodes, bl);
478 }
479 }
480 if (!inode.is_dir()) {
481 if (struct_v >= 7)
482 ::decode(snapbl, bl);
483 }
484 if (struct_v >= 8)
485 ::decode(oldest_snap, bl);
486 else
487 oldest_snap = CEPH_NOSNAP;
488
489 DECODE_FINISH(bl);
490 }
491
492 void EMetaBlob::fullbit::dump(Formatter *f) const
493 {
494 f->dump_string("dentry", dn);
495 f->dump_stream("snapid.first") << dnfirst;
496 f->dump_stream("snapid.last") << dnlast;
497 f->dump_int("dentry version", dnv);
498 f->open_object_section("inode");
499 inode.dump(f);
500 f->close_section(); // inode
501 f->open_object_section("xattrs");
502 for (map<string, bufferptr>::const_iterator iter = xattrs.begin();
503 iter != xattrs.end(); ++iter) {
504 string s(iter->second.c_str(), iter->second.length());
505 f->dump_string(iter->first.c_str(), s);
506 }
507 f->close_section(); // xattrs
508 if (inode.is_symlink()) {
509 f->dump_string("symlink", symlink);
510 }
511 if (inode.is_dir()) {
512 f->dump_stream("frag tree") << dirfragtree;
513 f->dump_string("has_snapbl", snapbl.length() ? "true" : "false");
514 if (inode.has_layout()) {
515 f->open_object_section("file layout policy");
516 // FIXME
517 f->dump_string("layout", "the layout exists");
518 f->close_section(); // file layout policy
519 }
520 }
521 f->dump_string("state", state_string());
522 if (!old_inodes.empty()) {
523 f->open_array_section("old inodes");
524 for (old_inodes_t::const_iterator iter = old_inodes.begin();
525 iter != old_inodes.end();
526 ++iter) {
527 f->open_object_section("inode");
528 f->dump_int("snapid", iter->first);
529 iter->second.dump(f);
530 f->close_section(); // inode
531 }
532 f->close_section(); // old inodes
533 }
534 }
535
536 void EMetaBlob::fullbit::generate_test_instances(list<EMetaBlob::fullbit*>& ls)
537 {
538 inode_t inode;
539 fragtree_t fragtree;
540 map<string,bufferptr> empty_xattrs;
541 bufferlist empty_snapbl;
542 fullbit *sample = new fullbit("/testdn", 0, 0, 0,
543 inode, fragtree, empty_xattrs, "", 0, empty_snapbl,
544 false, NULL);
545 ls.push_back(sample);
546 }
547
548 void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in)
549 {
550 in->inode = inode;
551 in->xattrs = xattrs;
552 if (in->inode.is_dir()) {
553 if (!(in->dirfragtree == dirfragtree)) {
554 dout(10) << "EMetaBlob::fullbit::update_inode dft " << in->dirfragtree << " -> "
555 << dirfragtree << " on " << *in << dendl;
556 in->dirfragtree = dirfragtree;
557 in->force_dirfrags();
558 if (in->has_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) {
559 list<CDir*> ls;
560 in->get_nested_dirfrags(ls);
561 for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
562 CDir *dir = *p;
563 if (dir->get_num_any() == 0 &&
564 mds->mdcache->can_trim_non_auth_dirfrag(dir)) {
565 dout(10) << " closing empty non-auth dirfrag " << *dir << dendl;
566 in->close_dirfrag(dir->get_frag());
567 }
568 }
569 }
570 }
571 } else if (in->inode.is_symlink()) {
572 in->symlink = symlink;
573 }
574 in->old_inodes = old_inodes;
575 if (!in->old_inodes.empty()) {
576 snapid_t min_first = in->old_inodes.rbegin()->first + 1;
577 if (min_first > in->first)
578 in->first = min_first;
579 }
580
581 /*
582 * we can do this before linking hte inode bc the split_at would
583 * be a no-op.. we have no children (namely open snaprealms) to
584 * divy up
585 */
586 in->oldest_snap = oldest_snap;
587 in->decode_snap_blob(snapbl);
588
589 /*
590 * In case there was anything malformed in the journal that we are
591 * replaying, do sanity checks on the inodes we're replaying and
592 * go damaged instead of letting any trash into a live cache
593 */
594 if (in->is_file()) {
595 // Files must have valid layouts with a pool set
596 if (in->inode.layout.pool_id == -1 || !in->inode.layout.is_valid()) {
597 dout(0) << "EMetaBlob.replay invalid layout on ino " << *in
598 << ": " << in->inode.layout << dendl;
599 std::ostringstream oss;
600 oss << "Invalid layout for inode 0x" << std::hex << in->inode.ino
601 << std::dec << " in journal";
602 mds->clog->error() << oss.str();
603 mds->damaged();
604 ceph_abort(); // Should be unreachable because damaged() calls respawn()
605 }
606 }
607 }
608
609 // EMetaBlob::remotebit
610
611 void EMetaBlob::remotebit::encode(bufferlist& bl) const
612 {
613 ENCODE_START(2, 2, bl);
614 ::encode(dn, bl);
615 ::encode(dnfirst, bl);
616 ::encode(dnlast, bl);
617 ::encode(dnv, bl);
618 ::encode(ino, bl);
619 ::encode(d_type, bl);
620 ::encode(dirty, bl);
621 ENCODE_FINISH(bl);
622 }
623
624 void EMetaBlob::remotebit::decode(bufferlist::iterator &bl)
625 {
626 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
627 ::decode(dn, bl);
628 ::decode(dnfirst, bl);
629 ::decode(dnlast, bl);
630 ::decode(dnv, bl);
631 ::decode(ino, bl);
632 ::decode(d_type, bl);
633 ::decode(dirty, bl);
634 DECODE_FINISH(bl);
635 }
636
637 void EMetaBlob::remotebit::dump(Formatter *f) const
638 {
639 f->dump_string("dentry", dn);
640 f->dump_int("snapid.first", dnfirst);
641 f->dump_int("snapid.last", dnlast);
642 f->dump_int("dentry version", dnv);
643 f->dump_int("inodeno", ino);
644 uint32_t type = DTTOIF(d_type) & S_IFMT; // convert to type entries
645 string type_string;
646 switch(type) {
647 case S_IFREG:
648 type_string = "file"; break;
649 case S_IFLNK:
650 type_string = "symlink"; break;
651 case S_IFDIR:
652 type_string = "directory"; break;
653 case S_IFIFO:
654 type_string = "fifo"; break;
655 case S_IFCHR:
656 type_string = "chr"; break;
657 case S_IFBLK:
658 type_string = "blk"; break;
659 case S_IFSOCK:
660 type_string = "sock"; break;
661 default:
662 assert (0 == "unknown d_type!");
663 }
664 f->dump_string("d_type", type_string);
665 f->dump_string("dirty", dirty ? "true" : "false");
666 }
667
668 void EMetaBlob::remotebit::
669 generate_test_instances(list<EMetaBlob::remotebit*>& ls)
670 {
671 remotebit *remote = new remotebit("/test/dn", 0, 10, 15, 1, IFTODT(S_IFREG), false);
672 ls.push_back(remote);
673 }
674
675 // EMetaBlob::nullbit
676
677 void EMetaBlob::nullbit::encode(bufferlist& bl) const
678 {
679 ENCODE_START(2, 2, bl);
680 ::encode(dn, bl);
681 ::encode(dnfirst, bl);
682 ::encode(dnlast, bl);
683 ::encode(dnv, bl);
684 ::encode(dirty, bl);
685 ENCODE_FINISH(bl);
686 }
687
688 void EMetaBlob::nullbit::decode(bufferlist::iterator &bl)
689 {
690 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
691 ::decode(dn, bl);
692 ::decode(dnfirst, bl);
693 ::decode(dnlast, bl);
694 ::decode(dnv, bl);
695 ::decode(dirty, bl);
696 DECODE_FINISH(bl);
697 }
698
699 void EMetaBlob::nullbit::dump(Formatter *f) const
700 {
701 f->dump_string("dentry", dn);
702 f->dump_int("snapid.first", dnfirst);
703 f->dump_int("snapid.last", dnlast);
704 f->dump_int("dentry version", dnv);
705 f->dump_string("dirty", dirty ? "true" : "false");
706 }
707
708 void EMetaBlob::nullbit::generate_test_instances(list<nullbit*>& ls)
709 {
710 nullbit *sample = new nullbit("/test/dentry", 0, 10, 15, false);
711 nullbit *sample2 = new nullbit("/test/dirty", 10, 20, 25, true);
712 ls.push_back(sample);
713 ls.push_back(sample2);
714 }
715
716 // EMetaBlob::dirlump
717
718 void EMetaBlob::dirlump::encode(bufferlist& bl, uint64_t features) const
719 {
720 ENCODE_START(2, 2, bl);
721 ::encode(fnode, bl);
722 ::encode(state, bl);
723 ::encode(nfull, bl);
724 ::encode(nremote, bl);
725 ::encode(nnull, bl);
726 _encode_bits(features);
727 ::encode(dnbl, bl);
728 ENCODE_FINISH(bl);
729 }
730
731 void EMetaBlob::dirlump::decode(bufferlist::iterator &bl)
732 {
733 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)
734 ::decode(fnode, bl);
735 ::decode(state, bl);
736 ::decode(nfull, bl);
737 ::decode(nremote, bl);
738 ::decode(nnull, bl);
739 ::decode(dnbl, bl);
740 dn_decoded = false; // don't decode bits unless we need them.
741 DECODE_FINISH(bl);
742 }
743
744 void EMetaBlob::dirlump::dump(Formatter *f) const
745 {
746 if (!dn_decoded) {
747 dirlump *me = const_cast<dirlump*>(this);
748 me->_decode_bits();
749 }
750 f->open_object_section("fnode");
751 fnode.dump(f);
752 f->close_section(); // fnode
753 f->dump_string("state", state_string());
754 f->dump_int("nfull", nfull);
755 f->dump_int("nremote", nremote);
756 f->dump_int("nnull", nnull);
757
758 f->open_array_section("full bits");
759 for (list<ceph::shared_ptr<fullbit> >::const_iterator
760 iter = dfull.begin(); iter != dfull.end(); ++iter) {
761 f->open_object_section("fullbit");
762 (*iter)->dump(f);
763 f->close_section(); // fullbit
764 }
765 f->close_section(); // full bits
766 f->open_array_section("remote bits");
767 for (list<remotebit>::const_iterator
768 iter = dremote.begin(); iter != dremote.end(); ++iter) {
769 f->open_object_section("remotebit");
770 (*iter).dump(f);
771 f->close_section(); // remotebit
772 }
773 f->close_section(); // remote bits
774 f->open_array_section("null bits");
775 for (list<nullbit>::const_iterator
776 iter = dnull.begin(); iter != dnull.end(); ++iter) {
777 f->open_object_section("null bit");
778 (*iter).dump(f);
779 f->close_section(); // null bit
780 }
781 f->close_section(); // null bits
782 }
783
784 void EMetaBlob::dirlump::generate_test_instances(list<dirlump*>& ls)
785 {
786 ls.push_back(new dirlump());
787 }
788
789 /**
790 * EMetaBlob proper
791 */
792 void EMetaBlob::encode(bufferlist& bl, uint64_t features) const
793 {
794 ENCODE_START(8, 5, bl);
795 ::encode(lump_order, bl);
796 ::encode(lump_map, bl, features);
797 ::encode(roots, bl, features);
798 ::encode(table_tids, bl);
799 ::encode(opened_ino, bl);
800 ::encode(allocated_ino, bl);
801 ::encode(used_preallocated_ino, bl);
802 ::encode(preallocated_inos, bl);
803 ::encode(client_name, bl);
804 ::encode(inotablev, bl);
805 ::encode(sessionmapv, bl);
806 ::encode(truncate_start, bl);
807 ::encode(truncate_finish, bl);
808 ::encode(destroyed_inodes, bl);
809 ::encode(client_reqs, bl);
810 ::encode(renamed_dirino, bl);
811 ::encode(renamed_dir_frags, bl);
812 {
813 // make MDSRank use v6 format happy
814 int64_t i = -1;
815 bool b = false;
816 ::encode(i, bl);
817 ::encode(b, bl);
818 }
819 ::encode(client_flushes, bl);
820 ENCODE_FINISH(bl);
821 }
822 void EMetaBlob::decode(bufferlist::iterator &bl)
823 {
824 DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
825 ::decode(lump_order, bl);
826 ::decode(lump_map, bl);
827 if (struct_v >= 4) {
828 ::decode(roots, bl);
829 } else {
830 bufferlist rootbl;
831 ::decode(rootbl, bl);
832 if (rootbl.length()) {
833 bufferlist::iterator p = rootbl.begin();
834 roots.push_back(ceph::shared_ptr<fullbit>(new fullbit(p)));
835 }
836 }
837 ::decode(table_tids, bl);
838 ::decode(opened_ino, bl);
839 ::decode(allocated_ino, bl);
840 ::decode(used_preallocated_ino, bl);
841 ::decode(preallocated_inos, bl);
842 ::decode(client_name, bl);
843 ::decode(inotablev, bl);
844 ::decode(sessionmapv, bl);
845 ::decode(truncate_start, bl);
846 ::decode(truncate_finish, bl);
847 ::decode(destroyed_inodes, bl);
848 if (struct_v >= 2) {
849 ::decode(client_reqs, bl);
850 } else {
851 list<metareqid_t> r;
852 ::decode(r, bl);
853 while (!r.empty()) {
854 client_reqs.push_back(pair<metareqid_t,uint64_t>(r.front(), 0));
855 r.pop_front();
856 }
857 }
858 if (struct_v >= 3) {
859 ::decode(renamed_dirino, bl);
860 ::decode(renamed_dir_frags, bl);
861 }
862 if (struct_v >= 6) {
863 // ignore
864 int64_t i;
865 bool b;
866 ::decode(i, bl);
867 ::decode(b, bl);
868 }
869 if (struct_v >= 8) {
870 ::decode(client_flushes, bl);
871 }
872 DECODE_FINISH(bl);
873 }
874
875
876 /**
877 * Get all inodes touched by this metablob. Includes the 'bits' within
878 * dirlumps, and the inodes of the dirs themselves.
879 */
880 void EMetaBlob::get_inodes(
881 std::set<inodeno_t> &inodes) const
882 {
883 // For all dirlumps in this metablob
884 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
885 // Record inode of dirlump
886 inodeno_t const dir_ino = i->first.ino;
887 inodes.insert(dir_ino);
888
889 // Decode dirlump bits
890 dirlump const &dl = i->second;
891 dl._decode_bits();
892
893 // Record inodes of fullbits
894 list<ceph::shared_ptr<fullbit> > const &fb_list = dl.get_dfull();
895 for (list<ceph::shared_ptr<fullbit> >::const_iterator
896 iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
897 inodes.insert((*iter)->inode.ino);
898 }
899
900 // Record inodes of remotebits
901 list<remotebit> const &rb_list = dl.get_dremote();
902 for (list<remotebit>::const_iterator
903 iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
904 inodes.insert(iter->ino);
905 }
906 }
907 }
908
909
910 /**
911 * Get a map of dirfrag to set of dentries in that dirfrag which are
912 * touched in this operation.
913 */
914 void EMetaBlob::get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const
915 {
916 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
917 dirlump const &dl = i->second;
918 dirfrag_t const &df = i->first;
919
920 // Get all bits
921 dl._decode_bits();
922 list<ceph::shared_ptr<fullbit> > const &fb_list = dl.get_dfull();
923 list<nullbit> const &nb_list = dl.get_dnull();
924 list<remotebit> const &rb_list = dl.get_dremote();
925
926 // For all bits, store dentry
927 for (list<ceph::shared_ptr<fullbit> >::const_iterator
928 iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
929 dentries[df].insert((*iter)->dn);
930
931 }
932 for (list<nullbit>::const_iterator
933 iter = nb_list.begin(); iter != nb_list.end(); ++iter) {
934 dentries[df].insert(iter->dn);
935 }
936 for (list<remotebit>::const_iterator
937 iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
938 dentries[df].insert(iter->dn);
939 }
940 }
941 }
942
943
944
945 /**
946 * Calculate all paths that we can infer are touched by this metablob. Only uses
947 * information local to this metablob so it may only be the path within the
948 * subtree.
949 */
950 void EMetaBlob::get_paths(
951 std::vector<std::string> &paths) const
952 {
953 // Each dentry has a 'location' which is a 2-tuple of parent inode and dentry name
954 typedef std::pair<inodeno_t, std::string> Location;
955
956 // Whenever we see a dentry within a dirlump, we remember it as a child of
957 // the dirlump's inode
958 std::map<inodeno_t, std::list<std::string> > children;
959
960 // Whenever we see a location for an inode, remember it: this allows us to
961 // build a path given an inode
962 std::map<inodeno_t, Location> ino_locations;
963
964 // Special case: operations on root inode populate roots but not dirlumps
965 if (lump_map.empty() && !roots.empty()) {
966 paths.push_back("/");
967 return;
968 }
969
970 // First pass
971 // ==========
972 // Build a tiny local metadata cache for the path structure in this metablob
973 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
974 inodeno_t const dir_ino = i->first.ino;
975 dirlump const &dl = i->second;
976 dl._decode_bits();
977
978 list<ceph::shared_ptr<fullbit> > const &fb_list = dl.get_dfull();
979 list<nullbit> const &nb_list = dl.get_dnull();
980 list<remotebit> const &rb_list = dl.get_dremote();
981
982 for (list<ceph::shared_ptr<fullbit> >::const_iterator
983 iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
984 std::string const &dentry = (*iter)->dn;
985 children[dir_ino].push_back(dentry);
986 ino_locations[(*iter)->inode.ino] = Location(dir_ino, dentry);
987 }
988
989 for (list<nullbit>::const_iterator
990 iter = nb_list.begin(); iter != nb_list.end(); ++iter) {
991 std::string const &dentry = iter->dn;
992 children[dir_ino].push_back(dentry);
993 }
994
995 for (list<remotebit>::const_iterator
996 iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
997 std::string const &dentry = iter->dn;
998 children[dir_ino].push_back(dentry);
999 }
1000 }
1001
1002 std::vector<Location> leaf_locations;
1003
1004 // Second pass
1005 // ===========
1006 // Output paths for all childless nodes in the metablob
1007 for (std::map<dirfrag_t, dirlump>::const_iterator i = lump_map.begin(); i != lump_map.end(); ++i) {
1008 inodeno_t const dir_ino = i->first.ino;
1009 dirlump const &dl = i->second;
1010 dl._decode_bits();
1011
1012 list<ceph::shared_ptr<fullbit> > const &fb_list = dl.get_dfull();
1013 for (list<ceph::shared_ptr<fullbit> >::const_iterator
1014 iter = fb_list.begin(); iter != fb_list.end(); ++iter) {
1015 std::string const &dentry = (*iter)->dn;
1016 children[dir_ino].push_back(dentry);
1017 ino_locations[(*iter)->inode.ino] = Location(dir_ino, dentry);
1018 if (children.find((*iter)->inode.ino) == children.end()) {
1019 leaf_locations.push_back(Location(dir_ino, dentry));
1020
1021 }
1022 }
1023
1024 list<nullbit> const &nb_list = dl.get_dnull();
1025 for (list<nullbit>::const_iterator
1026 iter = nb_list.begin(); iter != nb_list.end(); ++iter) {
1027 std::string const &dentry = iter->dn;
1028 leaf_locations.push_back(Location(dir_ino, dentry));
1029 }
1030
1031 list<remotebit> const &rb_list = dl.get_dremote();
1032 for (list<remotebit>::const_iterator
1033 iter = rb_list.begin(); iter != rb_list.end(); ++iter) {
1034 std::string const &dentry = iter->dn;
1035 leaf_locations.push_back(Location(dir_ino, dentry));
1036 }
1037 }
1038
1039 // For all the leaf locations identified, generate paths
1040 for (std::vector<Location>::iterator i = leaf_locations.begin(); i != leaf_locations.end(); ++i) {
1041 Location const &loc = *i;
1042 std::string path = loc.second;
1043 inodeno_t ino = loc.first;
1044 while(ino_locations.find(ino) != ino_locations.end()) {
1045 Location const &loc = ino_locations[ino];
1046 if (!path.empty()) {
1047 path = loc.second + "/" + path;
1048 } else {
1049 path = loc.second + path;
1050 }
1051 ino = loc.first;
1052 }
1053
1054 paths.push_back(path);
1055 }
1056 }
1057
1058
1059 void EMetaBlob::dump(Formatter *f) const
1060 {
1061 f->open_array_section("lumps");
1062 for (list<dirfrag_t>::const_iterator i = lump_order.begin();
1063 i != lump_order.end(); ++i) {
1064 f->open_object_section("lump");
1065 f->open_object_section("dirfrag");
1066 f->dump_stream("dirfrag") << *i;
1067 f->close_section(); // dirfrag
1068 f->open_object_section("dirlump");
1069 lump_map.at(*i).dump(f);
1070 f->close_section(); // dirlump
1071 f->close_section(); // lump
1072 }
1073 f->close_section(); // lumps
1074
1075 f->open_array_section("roots");
1076 for (list<ceph::shared_ptr<fullbit> >::const_iterator i = roots.begin();
1077 i != roots.end(); ++i) {
1078 f->open_object_section("root");
1079 (*i)->dump(f);
1080 f->close_section(); // root
1081 }
1082 f->close_section(); // roots
1083
1084 f->open_array_section("tableclient tranactions");
1085 for (list<pair<__u8,version_t> >::const_iterator i = table_tids.begin();
1086 i != table_tids.end(); ++i) {
1087 f->open_object_section("transaction");
1088 f->dump_int("tid", i->first);
1089 f->dump_int("version", i->second);
1090 f->close_section(); // transaction
1091 }
1092 f->close_section(); // tableclient transactions
1093
1094 f->dump_int("renamed directory inodeno", renamed_dirino);
1095
1096 f->open_array_section("renamed directory fragments");
1097 for (list<frag_t>::const_iterator i = renamed_dir_frags.begin();
1098 i != renamed_dir_frags.end(); ++i) {
1099 f->dump_int("frag", *i);
1100 }
1101 f->close_section(); // renamed directory fragments
1102
1103 f->dump_int("inotable version", inotablev);
1104 f->dump_int("SessionMap version", sessionmapv);
1105 f->dump_int("allocated ino", allocated_ino);
1106
1107 f->dump_stream("preallocated inos") << preallocated_inos;
1108 f->dump_int("used preallocated ino", used_preallocated_ino);
1109
1110 f->open_object_section("client name");
1111 client_name.dump(f);
1112 f->close_section(); // client name
1113
1114 f->open_array_section("inodes starting a truncate");
1115 for(list<inodeno_t>::const_iterator i = truncate_start.begin();
1116 i != truncate_start.end(); ++i) {
1117 f->dump_int("inodeno", *i);
1118 }
1119 f->close_section(); // truncate inodes
1120 f->open_array_section("inodes finishing a truncated");
1121 for(map<inodeno_t,uint64_t>::const_iterator i = truncate_finish.begin();
1122 i != truncate_finish.end(); ++i) {
1123 f->open_object_section("inode+segment");
1124 f->dump_int("inodeno", i->first);
1125 f->dump_int("truncate starting segment", i->second);
1126 f->close_section(); // truncated inode
1127 }
1128 f->close_section(); // truncate finish inodes
1129
1130 f->open_array_section("destroyed inodes");
1131 for(vector<inodeno_t>::const_iterator i = destroyed_inodes.begin();
1132 i != destroyed_inodes.end(); ++i) {
1133 f->dump_int("inodeno", *i);
1134 }
1135 f->close_section(); // destroyed inodes
1136
1137 f->open_array_section("client requests");
1138 for(list<pair<metareqid_t,uint64_t> >::const_iterator i = client_reqs.begin();
1139 i != client_reqs.end(); ++i) {
1140 f->open_object_section("Client request");
1141 f->dump_stream("request ID") << i->first;
1142 f->dump_int("oldest request on client", i->second);
1143 f->close_section(); // request
1144 }
1145 f->close_section(); // client requests
1146 }
1147
1148 void EMetaBlob::generate_test_instances(list<EMetaBlob*>& ls)
1149 {
1150 ls.push_back(new EMetaBlob());
1151 }
1152
// Replay this metablob into the MDS cache and tables during journal replay.
//
// Visible steps, in order:
//  1. create or update every root inode listed in `roots`;
//  2. if `renamed_dirino` is set, look up the renamed directory inode and
//     assert that the blob carries at most one null dentry;
//  3. walk `lump_order`: open (or create) each dirfrag, apply its fnode and
//     state flags, then replay its full, remote and null dentries while
//     recording which inodes got unlinked and which got (re)linked;
//  4. adjust subtrees after a directory rename;
//  5. hand inodes that stayed unlinked to `slaveup` (when given) or remove
//     them from the cache;
//  6. replay table client transactions, the opened inode, inotable and
//     sessionmap versions, truncates, destroyed inodes and completed client
//     requests/flushes;
//  7. call update_segment(logseg).
//
// mds     - rank whose mdcache, locker, inotable, sessionmap, mdlog and clog
//           are used and modified.
// logseg  - segment that dirty items are attached to; asserted non-null.
// slaveup - may be null; when set, old dirs and unlinked inodes are recorded
//           on it instead of being trimmed/removed here.
//
// The assert(g_conf->mds_kill_journal_replay_at != N) lines fail when that
// config option equals N (presumably a fault-injection hook for testing).
1153 void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
1154 {
1155 dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
1156
1157 assert(logseg);
1158
1159 assert(g_conf->mds_kill_journal_replay_at != 1);
1160
// Root inodes: reuse the cached inode when there is one; otherwise create it,
// fill it in from the journaled bit, and only then add it to the cache.
1161 for (list<ceph::shared_ptr<fullbit> >::iterator p = roots.begin(); p != roots.end(); ++p) {
1162 CInode *in = mds->mdcache->get_inode((*p)->inode.ino);
1163 bool isnew = in ? false:true;
1164 if (!in)
1165 in = new CInode(mds->mdcache, false);
1166 (*p)->update_inode(mds, in);
1167
1168 if (isnew)
1169 mds->mdcache->add_inode(in);
1170 if ((*p)->is_dirty()) in->_mark_dirty(logseg);
1171 dout(10) << "EMetaBlob.replay " << (isnew ? " added root ":" updated root ") << *in << dendl;
1172 }
1173
// Directory rename: the renamed inode may not be cached yet (renamed_diri
// stays null then). The whole blob must hold at most one null dentry.
1174 CInode *renamed_diri = 0;
1175 CDir *olddir = 0;
1176 if (renamed_dirino) {
1177 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1178 if (renamed_diri)
1179 dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl;
1180 else
1181 dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl;
1182
1183 int nnull = 0;
1184 for (list<dirfrag_t>::iterator lp = lump_order.begin(); lp != lump_order.end(); ++lp) {
1185 dirlump &lump = lump_map[*lp];
1186 if (lump.nnull) {
1187 dout(10) << "EMetaBlob.replay found null dentry in dir " << *lp << dendl;
1188 nnull += lump.nnull;
1189 }
1190 }
1191 assert(nnull <= 1);
1192 }
1193
1194 // keep track of any inodes we unlink and don't relink elsewhere
1195 map<CInode*, CDir*> unlinked;
1196 set<CInode*> linked;
1197
1198 // walk through my dirs (in order!)
1199 for (list<dirfrag_t>::iterator lp = lump_order.begin();
1200 lp != lump_order.end();
1201 ++lp) {
1202 dout(10) << "EMetaBlob.replay dir " << *lp << dendl;
1203 dirlump &lump = lump_map[*lp];
1204
1205 // the dir
1206 CDir *dir = mds->mdcache->get_force_dirfrag(*lp, true);
1207 if (!dir) {
1208 // hmm. do i have the inode?
1209 CInode *diri = mds->mdcache->get_inode((*lp).ino);
1210 if (!diri) {
// A missing directory inode is only created on the fly when it is an MDS
// directory inode (asserted not to be this rank's own); any other missing
// directory inode marks the rank damaged.
1211 if (MDS_INO_IS_MDSDIR(lp->ino)) {
1212 assert(MDS_INO_MDSDIR(mds->get_nodeid()) != lp->ino);
1213 diri = mds->mdcache->create_system_inode(lp->ino, S_IFDIR|0755);
1214 diri->state_clear(CInode::STATE_AUTH);
1215 dout(10) << "EMetaBlob.replay created base " << *diri << dendl;
1216 } else {
1217 dout(0) << "EMetaBlob.replay missing dir ino " << (*lp).ino << dendl;
1218 mds->clog->error() << "failure replaying journal (EMetaBlob)";
1219 mds->damaged();
1220 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1221 }
1222 }
1223
1224 // create the dirfrag
1225 dir = diri->get_or_open_dirfrag(mds->mdcache, (*lp).frag);
1226
1227 if (MDS_INO_IS_BASE(lp->ino))
1228 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF);
1229
1230 dout(10) << "EMetaBlob.replay added dir " << *dir << dendl;
1231 }
1232 dir->set_version( lump.fnode.version );
1233 dir->fnode = lump.fnode;
1234
1235 if (lump.is_importing()) {
1236 dir->state_set(CDir::STATE_AUTH);
1237 dir->state_clear(CDir::STATE_COMPLETE);
1238 }
1239 if (lump.is_dirty()) {
1240 dir->_mark_dirty(logseg);
1241
// When rstat/fragstat differ from their accounted_* counterparts, mark the
// matching scatterlock updated and queue the inode on the segment's list.
1242 if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
1243 dout(10) << "EMetaBlob.replay dirty nestinfo on " << *dir << dendl;
1244 mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
1245 logseg->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
1246 } else {
1247 dout(10) << "EMetaBlob.replay clean nestinfo on " << *dir << dendl;
1248 }
1249 if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
1250 dout(10) << "EMetaBlob.replay dirty fragstat on " << *dir << dendl;
1251 mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
1252 logseg->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
1253 } else {
1254 dout(10) << "EMetaBlob.replay clean fragstat on " << *dir << dendl;
1255 }
1256 }
1257 if (lump.is_dirty_dft()) {
1258 dout(10) << "EMetaBlob.replay dirty dirfragtree on " << *dir << dendl;
1259 dir->state_set(CDir::STATE_DIRTYDFT);
1260 mds->locker->mark_updated_scatterlock(&dir->inode->dirfragtreelock);
1261 logseg->dirty_dirfrag_dirfragtree.push_back(&dir->inode->item_dirty_dirfrag_dirfragtree);
1262 }
1263 if (lump.is_new())
1264 dir->mark_new(logseg);
1265 if (lump.is_complete())
1266 dir->mark_complete();
1267
1268 dout(10) << "EMetaBlob.replay updated dir " << *dir << dendl;
1269
1270 // decode bits
1271 lump._decode_bits();
1272
1273 // full dentry+inode pairs
1274 for (list<ceph::shared_ptr<fullbit> >::const_iterator pp = lump.get_dfull().begin();
1275 pp != lump.get_dfull().end();
1276 ++pp) {
1277 ceph::shared_ptr<fullbit> p = *pp;
1278 CDentry *dn = dir->lookup_exact_snap(p->dn, p->dnlast);
1279 if (!dn) {
1280 dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast);
1281 dn->set_version(p->dnv);
1282 if (p->is_dirty()) dn->_mark_dirty(logseg);
1283 dout(10) << "EMetaBlob.replay added (full) " << *dn << dendl;
1284 } else {
1285 dn->set_version(p->dnv);
1286 if (p->is_dirty()) dn->_mark_dirty(logseg);
1287 dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *dn << dendl;
1288 dn->first = p->dnfirst;
1289 assert(dn->last == p->dnlast);
1290 }
1291 if (lump.is_importing())
1292 dn->state_set(CDentry::STATE_AUTH);
1293
1294 CInode *in = mds->mdcache->get_inode(p->inode.ino, p->dnlast);
1295 if (!in) {
// Inode not cached: build it from the journaled bit, detach whatever the
// dentry currently links to, then link the new inode as primary.
1296 in = new CInode(mds->mdcache, dn->is_auth(), p->dnfirst, p->dnlast);
1297 p->update_inode(mds, in);
1298 mds->mdcache->add_inode(in);
1299 if (!dn->get_linkage()->is_null()) {
1300 if (dn->get_linkage()->is_primary()) {
1301 unlinked[dn->get_linkage()->get_inode()] = dir;
1302 stringstream ss;
1303 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1304 << " " << *dn->get_linkage()->get_inode() << " should be " << p->inode.ino;
1305 dout(0) << ss.str() << dendl;
1306 mds->clog->warn(ss);
1307 }
1308 dir->unlink_inode(dn);
1309 mds->mdcache->touch_dentry_bottom(dn);
1310 }
1311 if (unlinked.count(in))
1312 linked.insert(in);
1313 dir->link_primary_inode(dn, in);
1314 dout(10) << "EMetaBlob.replay added " << *in << dendl;
1315 } else {
// Inode already cached: refresh it, unlink it from a different parent dentry
// if it has one, and relink it under dn when dn does not already point at it.
1316 in->first = p->dnfirst;
1317 p->update_inode(mds, in);
1318 if (dn->get_linkage()->get_inode() != in && in->get_parent_dn()) {
1319 dout(10) << "EMetaBlob.replay unlinking " << *in << dendl;
1320 unlinked[in] = in->get_parent_dir();
1321 CDentry *unlinked_dn = in->get_parent_dn();
1322 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
1323 mds->mdcache->touch_dentry_bottom(unlinked_dn);
1324 }
1325 if (dn->get_linkage()->get_inode() != in) {
1326 if (!dn->get_linkage()->is_null()) { // note: might be remote. as with stray reintegration.
1327 if (dn->get_linkage()->is_primary()) {
1328 unlinked[dn->get_linkage()->get_inode()] = dir;
1329 stringstream ss;
1330 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1331 << " " << *dn->get_linkage()->get_inode() << " should be " << p->inode.ino;
1332 dout(0) << ss.str() << dendl;
1333 mds->clog->warn(ss);
1334 }
1335 dir->unlink_inode(dn);
1336 mds->mdcache->touch_dentry_bottom(dn);
1337 }
1338 if (unlinked.count(in))
1339 linked.insert(in);
1340 dir->link_primary_inode(dn, in);
1341 dout(10) << "EMetaBlob.replay linked " << *in << dendl;
1342 } else {
1343 dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *in << dendl;
1344 }
1345 assert(in->first == p->dnfirst ||
1346 (in->is_multiversion() && in->first > p->dnfirst));
1347 }
1348 if (p->is_dirty())
1349 in->_mark_dirty(logseg);
1350 if (p->is_dirty_parent())
1351 in->_mark_dirty_parent(logseg, p->is_dirty_pool());
1352 if (p->need_snapflush())
1353 logseg->open_files.push_back(&in->item_open_file);
// The inode's AUTH state follows the dentry it is linked under.
1354 if (dn->is_auth())
1355 in->state_set(CInode::STATE_AUTH);
1356 else
1357 in->state_clear(CInode::STATE_AUTH);
1358 assert(g_conf->mds_kill_journal_replay_at != 2);
1359 }
1360
1361 // remote dentries
1362 for (list<remotebit>::const_iterator p = lump.get_dremote().begin();
1363 p != lump.get_dremote().end();
1364 ++p) {
1365 CDentry *dn = dir->lookup_exact_snap(p->dn, p->dnlast);
1366 if (!dn) {
1367 dn = dir->add_remote_dentry(p->dn, p->ino, p->d_type, p->dnfirst, p->dnlast);
1368 dn->set_version(p->dnv);
1369 if (p->dirty) dn->_mark_dirty(logseg);
1370 dout(10) << "EMetaBlob.replay added " << *dn << dendl;
1371 } else {
1372 if (!dn->get_linkage()->is_null()) {
1373 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
// Unlike the full-dentry paths above, this mismatch is only written to the
// local log (no mds->clog->warn call here).
1374 if (dn->get_linkage()->is_primary()) {
1375 unlinked[dn->get_linkage()->get_inode()] = dir;
1376 stringstream ss;
1377 ss << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn
1378 << " " << *dn->get_linkage()->get_inode() << " should be remote " << p->ino;
1379 dout(0) << ss.str() << dendl;
1380 }
1381 dir->unlink_inode(dn);
1382 mds->mdcache->touch_dentry_bottom(dn);
1383 }
1384 dir->link_remote_inode(dn, p->ino, p->d_type);
1385 dn->set_version(p->dnv);
1386 if (p->dirty) dn->_mark_dirty(logseg);
1387 dout(10) << "EMetaBlob.replay for [" << p->dnfirst << "," << p->dnlast << "] had " << *dn << dendl;
1388 dn->first = p->dnfirst;
1389 assert(dn->last == p->dnlast);
1390 }
1391 if (lump.is_importing())
1392 dn->state_set(CDentry::STATE_AUTH);
1393 }
1394
1395 // null dentries
1396 for (list<nullbit>::const_iterator p = lump.get_dnull().begin();
1397 p != lump.get_dnull().end();
1398 ++p) {
1399 CDentry *dn = dir->lookup_exact_snap(p->dn, p->dnlast);
1400 if (!dn) {
1401 dn = dir->add_null_dentry(p->dn, p->dnfirst, p->dnlast);
1402 dn->set_version(p->dnv);
1403 if (p->dirty) dn->_mark_dirty(logseg);
1404 dout(10) << "EMetaBlob.replay added (nullbit) " << *dn << dendl;
1405 } else {
1406 dn->first = p->dnfirst;
1407 if (!dn->get_linkage()->is_null()) {
1408 dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
1409 CInode *in = dn->get_linkage()->get_inode();
1410 // For renamed inode, We may call CInode::force_dirfrag() later.
1411 // CInode::force_dirfrag() doesn't work well when inode is detached
1412 // from the hierarchy.
1413 if (!renamed_diri || renamed_diri != in) {
1414 if (dn->get_linkage()->is_primary())
1415 unlinked[in] = dir;
1416 dir->unlink_inode(dn);
1417 mds->mdcache->touch_dentry_bottom(dn);
1418 }
1419 }
1420 dn->set_version(p->dnv);
1421 if (p->dirty) dn->_mark_dirty(logseg);
1422 dout(10) << "EMetaBlob.replay had " << *dn << dendl;
1423 assert(dn->last == p->dnlast);
1424 }
// Remember the dir holding this null dentry; it serves as olddir below when
// the renamed inode was not already in the cache.
1425 olddir = dir;
1426 if (lump.is_importing())
1427 dn->state_set(CDentry::STATE_AUTH);
1428
1429 // Make null dentries the first things we trim
1430 dout(10) << "EMetaBlob.replay pushing to bottom of lru " << *dn << dendl;
1431 mds->mdcache->touch_dentry_bottom(dn);
1432 }
1433 }
1434
1435 assert(g_conf->mds_kill_journal_replay_at != 3);
1436
// Post-rename subtree fix-up.
1437 if (renamed_dirino) {
1438 if (renamed_diri) {
// The inode was cached before this blob: it must have been both unlinked
// and relinked by the dentry loops above; olddir is where it was unlinked.
1439 assert(unlinked.count(renamed_diri));
1440 assert(linked.count(renamed_diri));
1441 olddir = unlinked[renamed_diri];
1442 } else {
1443 // we imported a diri we haven't seen before
1444 renamed_diri = mds->mdcache->get_inode(renamed_dirino);
1445 assert(renamed_diri); // it was in the metablob
1446 }
1447
1448 if (olddir) {
1449 if (olddir->authority() != CDIR_AUTH_UNDEF &&
1450 renamed_diri->authority() == CDIR_AUTH_UNDEF) {
1451 assert(slaveup); // auth to non-auth, must be slave prepare
1452 list<frag_t> leaves;
1453 renamed_diri->dirfragtree.get_leaves(leaves);
1454 for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p) {
1455 CDir *dir = renamed_diri->get_dirfrag(*p);
1456 assert(dir);
1457 if (dir->get_dir_auth() == CDIR_AUTH_UNDEF)
1458 // preserve subtree bound until slave commit
1459 slaveup->olddirs.insert(dir->inode);
1460 else
1461 dir->state_set(CDir::STATE_AUTH);
1462 }
1463 }
1464
1465 mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
1466
1467 // see if we can discard the subtree we renamed out of
1468 CDir *root = mds->mdcache->get_subtree_root(olddir);
1469 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
1470 if (slaveup) // preserve the old dir until slave commit
1471 slaveup->olddirs.insert(olddir->inode);
1472 else
1473 mds->mdcache->try_trim_non_auth_subtree(root);
1474 }
1475 }
1476
1477 // if we are the srci importer, we'll also have some dirfrags we have to open up...
1478 if (renamed_diri->authority() != CDIR_AUTH_UNDEF) {
1479 for (list<frag_t>::iterator p = renamed_dir_frags.begin(); p != renamed_dir_frags.end(); ++p) {
1480 CDir *dir = renamed_diri->get_dirfrag(*p);
1481 if (dir) {
1482 // we already had the inode before, and we already adjusted this subtree accordingly.
1483 dout(10) << " already had+adjusted rename import bound " << *dir << dendl;
1484 assert(olddir);
1485 continue;
1486 }
1487 dir = renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
1488 dout(10) << " creating new rename import bound " << *dir << dendl;
1489 dir->state_clear(CDir::STATE_AUTH);
1490 mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNDEF, false);
1491 }
1492 }
1493
1494 // rename may overwrite an empty directory and move it into stray dir.
1495 unlinked.erase(renamed_diri);
1496 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1497 if (!linked.count(p->first))
1498 continue;
1499 assert(p->first->is_dir());
1500 mds->mdcache->adjust_subtree_after_rename(p->first, p->second, false);
1501 }
1502 }
1503
// Inodes that were unlinked and never relinked within this blob are either
// parked on slaveup or removed from the cache.
1504 if (!unlinked.empty()) {
1505 for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); ++p)
1506 unlinked.erase(*p);
1507 dout(10) << " unlinked set contains " << unlinked << dendl;
1508 for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
1509 if (slaveup) // preserve unlinked inodes until slave commit
1510 slaveup->unlinked.insert(p->first);
1511 else
1512 mds->mdcache->remove_inode_recursive(p->first);
1513 }
1514 }
1515
1516 // table client transactions
1517 for (list<pair<__u8,version_t> >::iterator p = table_tids.begin();
1518 p != table_tids.end();
1519 ++p) {
1520 dout(10) << "EMetaBlob.replay noting " << get_mdstable_name(p->first)
1521 << " transaction " << p->second << dendl;
1522 MDSTableClient *client = mds->get_table_client(p->first);
1523 if (client)
1524 client->got_journaled_agree(p->second, logseg);
1525 }
1526
1527 // opened ino?
1528 if (opened_ino) {
1529 CInode *in = mds->mdcache->get_inode(opened_ino);
1530 assert(in);
1531 dout(10) << "EMetaBlob.replay noting opened inode " << *in << dendl;
1532 logseg->open_files.push_back(&in->item_open_file);
1533 }
1534
1535 // allocated_inos
// Applied only when the journaled inotable version is ahead of the in-memory
// table; afterwards the two versions must match.
1536 if (inotablev) {
1537 if (mds->inotable->get_version() >= inotablev) {
1538 dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
1539 << " <= table " << mds->inotable->get_version() << dendl;
1540 } else {
1541 dout(10) << "EMetaBlob.replay inotable v " << inotablev
1542 << " - 1 == table " << mds->inotable->get_version()
1543 << " allocated+used " << allocated_ino
1544 << " prealloc " << preallocated_inos
1545 << dendl;
1546 if (allocated_ino)
1547 mds->inotable->replay_alloc_id(allocated_ino);
1548 if (preallocated_inos.size())
1549 mds->inotable->replay_alloc_ids(preallocated_inos);
1550
1551 // [repair bad inotable updates]
1552 if (inotablev > mds->inotable->get_version()) {
1553 mds->clog->error() << "journal replay inotablev mismatch "
1554 << mds->inotable->get_version() << " -> " << inotablev;
1555 mds->inotable->force_replay_version(inotablev);
1556 }
1557
1558 assert(inotablev == mds->inotable->get_version());
1559 }
1560 }
// Sessionmap has three cases: already applied; journaled version at most two
// ahead (apply the used/preallocated ino changes to the session, or just
// advance the version when the session is gone); further ahead, which is only
// tolerated with mds_wipe_sessions set, in which case the map is wiped and
// its version forced.
1561 if (sessionmapv) {
1562 if (mds->sessionmap.get_version() >= sessionmapv) {
1563 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1564 << " <= table " << mds->sessionmap.get_version() << dendl;
1565 } else if (mds->sessionmap.get_version() + 2 >= sessionmapv) {
1566 dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
1567 << " -(1|2) == table " << mds->sessionmap.get_version()
1568 << " prealloc " << preallocated_inos
1569 << " used " << used_preallocated_ino
1570 << dendl;
1571 Session *session = mds->sessionmap.get_session(client_name);
1572 if (session) {
1573 dout(20) << " (session prealloc " << session->info.prealloc_inos << ")" << dendl;
1574 if (used_preallocated_ino) {
1575 if (!session->info.prealloc_inos.empty()) {
1576 inodeno_t next = session->next_ino();
1577 inodeno_t i = session->take_ino(used_preallocated_ino);
1578 if (next != i)
1579 mds->clog->warn() << " replayed op " << client_reqs << " used ino " << i
1580 << " but session next is " << next;
1581 assert(i == used_preallocated_ino);
1582 session->info.used_inos.clear();
1583 }
1584 mds->sessionmap.replay_dirty_session(session);
1585 }
1586 if (!preallocated_inos.empty()) {
1587 session->info.prealloc_inos.insert(preallocated_inos);
1588 mds->sessionmap.replay_dirty_session(session);
1589 }
1590
1591 } else {
1592 dout(10) << "EMetaBlob.replay no session for " << client_name << dendl;
1593 if (used_preallocated_ino) {
1594 mds->sessionmap.replay_advance_version();
1595 }
1596 if (!preallocated_inos.empty())
1597 mds->sessionmap.replay_advance_version();
1598 }
1599 assert(sessionmapv == mds->sessionmap.get_version());
1600 } else {
1601 mds->clog->error() << "journal replay sessionmap v " << sessionmapv
1602 << " -(1|2) > table " << mds->sessionmap.get_version();
1603 assert(g_conf->mds_wipe_sessions);
1604 mds->sessionmap.wipe();
1605 mds->sessionmap.set_version(sessionmapv);
1606 }
1607 }
1608
1609 // truncating inodes
1610 for (list<inodeno_t>::iterator p = truncate_start.begin();
1611 p != truncate_start.end();
1612 ++p) {
1613 CInode *in = mds->mdcache->get_inode(*p);
1614 assert(in);
1615 mds->mdcache->add_recovered_truncate(in, logseg);
1616 }
// A finished truncate is only cleared when mdlog still knows the segment in
// which the truncate started.
1617 for (map<inodeno_t,uint64_t>::iterator p = truncate_finish.begin();
1618 p != truncate_finish.end();
1619 ++p) {
1620 LogSegment *ls = mds->mdlog->get_segment(p->second);
1621 if (ls) {
1622 CInode *in = mds->mdcache->get_inode(p->first);
1623 assert(in);
1624 mds->mdcache->remove_recovered_truncate(in, ls);
1625 }
1626 }
1627
1628 // destroyed inodes
1629 for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
1630 p != destroyed_inodes.end();
1631 ++p) {
1632 CInode *in = mds->mdcache->get_inode(*p);
1633 if (in) {
1634 dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
// Fetch the parent dentry before the inode is removed from the cache.
1635 CDentry *parent = in->get_parent_dn();
1636 mds->mdcache->remove_inode(in);
1637 if (parent) {
1638 dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
1639 assert(parent->get_linkage()->is_null());
1640 mds->mdcache->touch_dentry_bottom(parent);
1641 }
1642 } else {
1643 dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
1644 }
1645 }
1646
1647 // client requests
1648 for (list<pair<metareqid_t, uint64_t> >::iterator p = client_reqs.begin();
1649 p != client_reqs.end();
1650 ++p) {
1651 if (p->first.name.is_client()) {
1652 dout(10) << "EMetaBlob.replay request " << p->first << " trim_to " << p->second << dendl;
1653 inodeno_t created = allocated_ino ? allocated_ino : used_preallocated_ino;
1654 // if we allocated an inode, there should be exactly one client request id.
1655 assert(created == inodeno_t() || client_reqs.size() == 1);
1656
1657 Session *session = mds->sessionmap.get_session(p->first.name);
1658 if (session) {
1659 session->add_completed_request(p->first.tid, created);
1660 if (p->second)
1661 session->trim_completed_requests(p->second);
1662 }
1663 }
1664 }
1665
1666 // client flushes
1667 for (list<pair<metareqid_t, uint64_t> >::iterator p = client_flushes.begin();
1668 p != client_flushes.end();
1669 ++p) {
1670 if (p->first.name.is_client()) {
1671 dout(10) << "EMetaBlob.replay flush " << p->first << " trim_to " << p->second << dendl;
1672 Session *session = mds->sessionmap.get_session(p->first.name);
1673 if (session) {
1674 session->add_completed_flush(p->first.tid);
1675 if (p->second)
1676 session->trim_completed_flushes(p->second);
1677 }
1678 }
1679 }
1680
1681 // update segment
1682 update_segment(logseg);
1683
1684 assert(g_conf->mds_kill_journal_replay_at != 4);
1685 }
1686
1687 // -----------------------
1688 // ESession
1689
1690 void ESession::update_segment()
1691 {
1692 _segment->sessionmapv = cmapv;
1693 if (inos.size() && inotablev)
1694 _segment->inotablev = inotablev;
1695 }
1696
1697 void ESession::replay(MDSRank *mds)
1698 {
1699 if (mds->sessionmap.get_version() >= cmapv) {
1700 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1701 << " >= " << cmapv << ", noop" << dendl;
1702 } else {
1703 dout(10) << "ESession.replay sessionmap " << mds->sessionmap.get_version()
1704 << " < " << cmapv << " " << (open ? "open":"close") << " " << client_inst << dendl;
1705 Session *session;
1706 if (open) {
1707 session = mds->sessionmap.get_or_add_session(client_inst);
1708 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1709 session->set_client_metadata(client_metadata);
1710 dout(10) << " opened session " << session->info.inst << dendl;
1711 } else {
1712 session = mds->sessionmap.get_session(client_inst.name);
1713 if (session) { // there always should be a session, but there's a bug
1714 if (session->connection == NULL) {
1715 dout(10) << " removed session " << session->info.inst << dendl;
1716 mds->sessionmap.remove_session(session);
1717 session = NULL;
1718 } else {
1719 session->clear(); // the client has reconnected; keep the Session, but reset
1720 dout(10) << " reset session " << session->info.inst << " (they reconnected)" << dendl;
1721 }
1722 } else {
1723 mds->clog->error() << "replayed stray Session close event for " << client_inst
1724 << " from time " << stamp << ", ignoring";
1725 }
1726 }
1727 if (session) {
1728 mds->sessionmap.replay_dirty_session(session);
1729 } else {
1730 mds->sessionmap.replay_advance_version();
1731 }
1732 assert(mds->sessionmap.get_version() == cmapv);
1733 }
1734
1735 if (inos.size() && inotablev) {
1736 if (mds->inotable->get_version() >= inotablev) {
1737 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1738 << " >= " << inotablev << ", noop" << dendl;
1739 } else {
1740 dout(10) << "ESession.replay inotable " << mds->inotable->get_version()
1741 << " < " << inotablev << " " << (open ? "add":"remove") << dendl;
1742 assert(!open); // for now
1743 mds->inotable->replay_release_ids(inos);
1744 assert(mds->inotable->get_version() == inotablev);
1745 }
1746 }
1747
1748 update_segment();
1749 }
1750
// Serialize this ESession event into bl.
// The fields are written in a fixed order between ENCODE_START(4, 3, bl) and
// ENCODE_FINISH(bl); ESession::decode reads them back in the same order, so
// the sequence below must not be rearranged.
// `features` is passed only to the client_inst encoder.
1751 void ESession::encode(bufferlist &bl, uint64_t features) const
1752 {
1753 ENCODE_START(4, 3, bl);
1754 ::encode(stamp, bl);
1755 ::encode(client_inst, bl, features);
1756 ::encode(open, bl);
1757 ::encode(cmapv, bl);
1758 ::encode(inos, bl);
1759 ::encode(inotablev, bl);
1760 ::encode(client_metadata, bl);
1761 ENCODE_FINISH(bl);
1762 }
1763
// Deserialize an ESession event from bl, mirroring the field order used by
// ESession::encode.
// Two fields are read only for sufficiently new encodings: stamp when
// struct_v >= 2 and client_metadata when struct_v >= 4; otherwise they keep
// whatever value the object already had.
1764 void ESession::decode(bufferlist::iterator &bl)
1765 {
1766 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
1767 if (struct_v >= 2)
1768 ::decode(stamp, bl);
1769 ::decode(client_inst, bl);
1770 ::decode(open, bl);
1771 ::decode(cmapv, bl);
1772 ::decode(inos, bl);
1773 ::decode(inotablev, bl);
1774 if (struct_v >= 4) {
1775 ::decode(client_metadata, bl);
1776 }
1777 DECODE_FINISH(bl);
1778 }
1779
1780 void ESession::dump(Formatter *f) const
1781 {
1782 f->dump_stream("client instance") << client_inst;
1783 f->dump_string("open", open ? "true" : "false");
1784 f->dump_int("client map version", cmapv);
1785 f->dump_stream("inos") << inos;
1786 f->dump_int("inotable version", inotablev);
1787 f->open_object_section("client_metadata");
1788 for (map<string, string>::const_iterator i = client_metadata.begin();
1789 i != client_metadata.end(); ++i) {
1790 f->dump_string(i->first.c_str(), i->second);
1791 }
1792 f->close_section(); // client_metadata
1793 }
1794
1795 void ESession::generate_test_instances(list<ESession*>& ls)
1796 {
1797 ls.push_back(new ESession);
1798 }
1799
1800 // -----------------------
1801 // ESessions
1802
// Serialize this ESessions event into bl: client_map, cmapv, then stamp,
// wrapped in ENCODE_START(1, 1, bl) / ENCODE_FINISH(bl).
// `features` is passed only to the client_map encoder.
// The field order is what the decode_* functions expect; do not rearrange.
1803 void ESessions::encode(bufferlist &bl, uint64_t features) const
1804 {
1805 ENCODE_START(1, 1, bl);
1806 ::encode(client_map, bl, features);
1807 ::encode(cmapv, bl);
1808 ::encode(stamp, bl);
1809 ENCODE_FINISH(bl);
1810 }
1811
// Decode the layout that has no DECODE_START/DECODE_FINISH envelope:
// client_map, cmapv, and then stamp only if bytes remain in the iterator.
1812 void ESessions::decode_old(bufferlist::iterator &bl)
1813 {
1814 ::decode(client_map, bl);
1815 ::decode(cmapv, bl);
1816 if (!bl.end())
1817 ::decode(stamp, bl);
1818 }
1819
// Decode the enveloped layout written by ESessions::encode: client_map,
// cmapv, and then stamp only if the iterator is not yet at its end.
1820 void ESessions::decode_new(bufferlist::iterator &bl)
1821 {
1822 DECODE_START(1, bl);
1823 ::decode(client_map, bl);
1824 ::decode(cmapv, bl);
1825 if (!bl.end())
1826 ::decode(stamp, bl);
1827 DECODE_FINISH(bl);
1828 }
1829
1830 void ESessions::dump(Formatter *f) const
1831 {
1832 f->dump_int("client map version", cmapv);
1833
1834 f->open_array_section("client map");
1835 for (map<client_t,entity_inst_t>::const_iterator i = client_map.begin();
1836 i != client_map.end(); ++i) {
1837 f->open_object_section("client");
1838 f->dump_int("client id", i->first.v);
1839 f->dump_stream("client entity") << i->second;
1840 f->close_section(); // client
1841 }
1842 f->close_section(); // client map
1843 }
1844
1845 void ESessions::generate_test_instances(list<ESessions*>& ls)
1846 {
1847 ls.push_back(new ESessions());
1848 }
1849
1850 void ESessions::update_segment()
1851 {
1852 _segment->sessionmapv = cmapv;
1853 }
1854
1855 void ESessions::replay(MDSRank *mds)
1856 {
1857 if (mds->sessionmap.get_version() >= cmapv) {
1858 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1859 << " >= " << cmapv << ", noop" << dendl;
1860 } else {
1861 dout(10) << "ESessions.replay sessionmap " << mds->sessionmap.get_version()
1862 << " < " << cmapv << dendl;
1863 mds->sessionmap.open_sessions(client_map);
1864 assert(mds->sessionmap.get_version() == cmapv);
1865 mds->sessionmap.set_projected(mds->sessionmap.get_version());
1866 }
1867 update_segment();
1868 }
1869
1870
1871 // -----------------------
1872 // ETableServer
1873
// Serialize this ETableServer event into bl.
// Fields are written in a fixed order between ENCODE_START(3, 3, bl) and
// ENCODE_FINISH(bl); ETableServer::decode reads them back in the same order,
// so the sequence below must not be rearranged.
// `features` is not used by any of the encode calls in this function.
1874 void ETableServer::encode(bufferlist& bl, uint64_t features) const
1875 {
1876 ENCODE_START(3, 3, bl);
1877 ::encode(stamp, bl);
1878 ::encode(table, bl);
1879 ::encode(op, bl);
1880 ::encode(reqid, bl);
1881 ::encode(bymds, bl);
1882 ::encode(mutation, bl);
1883 ::encode(tid, bl);
1884 ::encode(version, bl);
1885 ENCODE_FINISH(bl);
1886 }
1887
// Deserialize an ETableServer event from bl, mirroring the field order used
// by ETableServer::encode.
// stamp is read only when struct_v >= 2; all other fields are always read.
1888 void ETableServer::decode(bufferlist::iterator &bl)
1889 {
1890 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
1891 if (struct_v >= 2)
1892 ::decode(stamp, bl);
1893 ::decode(table, bl);
1894 ::decode(op, bl);
1895 ::decode(reqid, bl);
1896 ::decode(bymds, bl);
1897 ::decode(mutation, bl);
1898 ::decode(tid, bl);
1899 ::decode(version, bl);
1900 DECODE_FINISH(bl);
1901 }
1902
1903 void ETableServer::dump(Formatter *f) const
1904 {
1905 f->dump_int("table id", table);
1906 f->dump_int("op", op);
1907 f->dump_int("request id", reqid);
1908 f->dump_int("by mds", bymds);
1909 f->dump_int("tid", tid);
1910 f->dump_int("version", version);
1911 }
1912
1913 void ETableServer::generate_test_instances(list<ETableServer*>& ls)
1914 {
1915 ls.push_back(new ETableServer());
1916 }
1917
1918
1919 void ETableServer::update_segment()
1920 {
1921 _segment->tablev[table] = version;
1922 }
1923
1924 void ETableServer::replay(MDSRank *mds)
1925 {
1926 MDSTableServer *server = mds->get_table_server(table);
1927 if (!server)
1928 return;
1929
1930 if (server->get_version() >= version) {
1931 dout(10) << "ETableServer.replay " << get_mdstable_name(table)
1932 << " " << get_mdstableserver_opname(op)
1933 << " event " << version
1934 << " <= table " << server->get_version() << dendl;
1935 return;
1936 }
1937
1938 dout(10) << " ETableServer.replay " << get_mdstable_name(table)
1939 << " " << get_mdstableserver_opname(op)
1940 << " event " << version << " - 1 == table " << server->get_version() << dendl;
1941 assert(version-1 == server->get_version());
1942
1943 switch (op) {
1944 case TABLESERVER_OP_PREPARE:
1945 server->_prepare(mutation, reqid, bymds);
1946 server->_note_prepare(bymds, reqid);
1947 break;
1948 case TABLESERVER_OP_COMMIT:
1949 server->_commit(tid);
1950 server->_note_commit(tid);
1951 break;
1952 case TABLESERVER_OP_ROLLBACK:
1953 server->_rollback(tid);
1954 server->_note_rollback(tid);
1955 break;
1956 case TABLESERVER_OP_SERVER_UPDATE:
1957 server->_server_update(mutation);
1958 break;
1959 default:
1960 mds->clog->error() << "invalid tableserver op in ETableServer";
1961 mds->damaged();
1962 ceph_abort(); // Should be unreachable because damaged() calls respawn()
1963 }
1964
1965 assert(version == server->get_version());
1966 update_segment();
1967 }
1968
1969
1970 // ---------------------
1971 // ETableClient
1972
1973 void ETableClient::encode(bufferlist& bl, uint64_t features) const
1974 {
1975 ENCODE_START(3, 3, bl);
1976 ::encode(stamp, bl);
1977 ::encode(table, bl);
1978 ::encode(op, bl);
1979 ::encode(tid, bl);
1980 ENCODE_FINISH(bl);
1981 }
1982
1983 void ETableClient::decode(bufferlist::iterator &bl)
1984 {
1985 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
1986 if (struct_v >= 2)
1987 ::decode(stamp, bl);
1988 ::decode(table, bl);
1989 ::decode(op, bl);
1990 ::decode(tid, bl);
1991 DECODE_FINISH(bl);
1992 }
1993
1994 void ETableClient::dump(Formatter *f) const
1995 {
1996 f->dump_int("table", table);
1997 f->dump_int("op", op);
1998 f->dump_int("tid", tid);
1999 }
2000
2001 void ETableClient::generate_test_instances(list<ETableClient*>& ls)
2002 {
2003 ls.push_back(new ETableClient());
2004 }
2005
2006 void ETableClient::replay(MDSRank *mds)
2007 {
2008 dout(10) << " ETableClient.replay " << get_mdstable_name(table)
2009 << " op " << get_mdstableserver_opname(op)
2010 << " tid " << tid << dendl;
2011
2012 MDSTableClient *client = mds->get_table_client(table);
2013 if (!client)
2014 return;
2015
2016 assert(op == TABLESERVER_OP_ACK);
2017 client->got_journaled_ack(tid);
2018 }
2019
2020
2021 // -----------------------
2022 // ESnap
2023 /*
2024 void ESnap::update_segment()
2025 {
2026 _segment->tablev[TABLE_SNAP] = version;
2027 }
2028
2029 void ESnap::replay(MDSRank *mds)
2030 {
2031 if (mds->snaptable->get_version() >= version) {
2032 dout(10) << "ESnap.replay event " << version
2033 << " <= table " << mds->snaptable->get_version() << dendl;
2034 return;
2035 }
2036
2037 dout(10) << " ESnap.replay event " << version
2038 << " - 1 == table " << mds->snaptable->get_version() << dendl;
2039 assert(version-1 == mds->snaptable->get_version());
2040
2041 if (create) {
2042 version_t v;
2043 snapid_t s = mds->snaptable->create(snap.dirino, snap.name, snap.stamp, &v);
2044 assert(s == snap.snapid);
2045 } else {
2046 mds->snaptable->remove(snap.snapid);
2047 }
2048
2049 assert(version == mds->snaptable->get_version());
2050 }
2051 */
2052
2053
2054
2055 // -----------------------
2056 // EUpdate
2057
2058 void EUpdate::encode(bufferlist &bl, uint64_t features) const
2059 {
2060 ENCODE_START(4, 4, bl);
2061 ::encode(stamp, bl);
2062 ::encode(type, bl);
2063 ::encode(metablob, bl, features);
2064 ::encode(client_map, bl);
2065 ::encode(cmapv, bl);
2066 ::encode(reqid, bl);
2067 ::encode(had_slaves, bl);
2068 ENCODE_FINISH(bl);
2069 }
2070
2071 void EUpdate::decode(bufferlist::iterator &bl)
2072 {
2073 DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
2074 if (struct_v >= 2)
2075 ::decode(stamp, bl);
2076 ::decode(type, bl);
2077 ::decode(metablob, bl);
2078 ::decode(client_map, bl);
2079 if (struct_v >= 3)
2080 ::decode(cmapv, bl);
2081 ::decode(reqid, bl);
2082 ::decode(had_slaves, bl);
2083 DECODE_FINISH(bl);
2084 }
2085
2086 void EUpdate::dump(Formatter *f) const
2087 {
2088 f->open_object_section("metablob");
2089 metablob.dump(f);
2090 f->close_section(); // metablob
2091
2092 f->dump_string("type", type);
2093 f->dump_int("client map length", client_map.length());
2094 f->dump_int("client map version", cmapv);
2095 f->dump_stream("reqid") << reqid;
2096 f->dump_string("had slaves", had_slaves ? "true" : "false");
2097 }
2098
2099 void EUpdate::generate_test_instances(list<EUpdate*>& ls)
2100 {
2101 ls.push_back(new EUpdate());
2102 }
2103
2104
2105 void EUpdate::update_segment()
2106 {
2107 metablob.update_segment(_segment);
2108
2109 if (client_map.length())
2110 _segment->sessionmapv = cmapv;
2111
2112 if (had_slaves)
2113 _segment->uncommitted_masters.insert(reqid);
2114 }
2115
2116 void EUpdate::replay(MDSRank *mds)
2117 {
2118 metablob.replay(mds, _segment);
2119
2120 if (had_slaves) {
2121 dout(10) << "EUpdate.replay " << reqid << " had slaves, expecting a matching ECommitted" << dendl;
2122 _segment->uncommitted_masters.insert(reqid);
2123 set<mds_rank_t> slaves;
2124 mds->mdcache->add_uncommitted_master(reqid, _segment, slaves, true);
2125 }
2126
2127 if (client_map.length()) {
2128 if (mds->sessionmap.get_version() >= cmapv) {
2129 dout(10) << "EUpdate.replay sessionmap v " << cmapv
2130 << " <= table " << mds->sessionmap.get_version() << dendl;
2131 } else {
2132 dout(10) << "EUpdate.replay sessionmap " << mds->sessionmap.get_version()
2133 << " < " << cmapv << dendl;
2134 // open client sessions?
2135 map<client_t,entity_inst_t> cm;
2136 bufferlist::iterator blp = client_map.begin();
2137 ::decode(cm, blp);
2138 mds->sessionmap.open_sessions(cm);
2139
2140 assert(mds->sessionmap.get_version() == cmapv);
2141 mds->sessionmap.set_projected(mds->sessionmap.get_version());
2142 }
2143 }
2144 update_segment();
2145 }
2146
2147
2148 // ------------------------
2149 // EOpen
2150
2151 void EOpen::encode(bufferlist &bl, uint64_t features) const {
2152 ENCODE_START(4, 3, bl);
2153 ::encode(stamp, bl);
2154 ::encode(metablob, bl, features);
2155 ::encode(inos, bl);
2156 ::encode(snap_inos, bl);
2157 ENCODE_FINISH(bl);
2158 }
2159
2160 void EOpen::decode(bufferlist::iterator &bl) {
2161 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2162 if (struct_v >= 2)
2163 ::decode(stamp, bl);
2164 ::decode(metablob, bl);
2165 ::decode(inos, bl);
2166 if (struct_v >= 4)
2167 ::decode(snap_inos, bl);
2168 DECODE_FINISH(bl);
2169 }
2170
2171 void EOpen::dump(Formatter *f) const
2172 {
2173 f->open_object_section("metablob");
2174 metablob.dump(f);
2175 f->close_section(); // metablob
2176 f->open_array_section("inos involved");
2177 for (vector<inodeno_t>::const_iterator i = inos.begin();
2178 i != inos.end(); ++i) {
2179 f->dump_int("ino", *i);
2180 }
2181 f->close_section(); // inos
2182 }
2183
2184 void EOpen::generate_test_instances(list<EOpen*>& ls)
2185 {
2186 ls.push_back(new EOpen());
2187 ls.push_back(new EOpen());
2188 ls.back()->add_ino(0);
2189 }
2190
2191 void EOpen::update_segment()
2192 {
2193 // ??
2194 }
2195
2196 void EOpen::replay(MDSRank *mds)
2197 {
2198 dout(10) << "EOpen.replay " << dendl;
2199 metablob.replay(mds, _segment);
2200
2201 // note which segments inodes belong to, so we don't have to start rejournaling them
2202 for (const auto &ino : inos) {
2203 CInode *in = mds->mdcache->get_inode(ino);
2204 if (!in) {
2205 dout(0) << "EOpen.replay ino " << ino << " not in metablob" << dendl;
2206 assert(in);
2207 }
2208 _segment->open_files.push_back(&in->item_open_file);
2209 }
2210 for (const auto &vino : snap_inos) {
2211 CInode *in = mds->mdcache->get_inode(vino);
2212 if (!in) {
2213 dout(0) << "EOpen.replay ino " << vino << " not in metablob" << dendl;
2214 assert(in);
2215 }
2216 _segment->open_files.push_back(&in->item_open_file);
2217 }
2218 }
2219
2220
2221 // -----------------------
2222 // ECommitted
2223
2224 void ECommitted::replay(MDSRank *mds)
2225 {
2226 if (mds->mdcache->uncommitted_masters.count(reqid)) {
2227 dout(10) << "ECommitted.replay " << reqid << dendl;
2228 mds->mdcache->uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2229 mds->mdcache->uncommitted_masters.erase(reqid);
2230 } else {
2231 dout(10) << "ECommitted.replay " << reqid << " -- didn't see original op" << dendl;
2232 }
2233 }
2234
2235 void ECommitted::encode(bufferlist& bl, uint64_t features) const
2236 {
2237 ENCODE_START(3, 3, bl);
2238 ::encode(stamp, bl);
2239 ::encode(reqid, bl);
2240 ENCODE_FINISH(bl);
2241 }
2242
2243 void ECommitted::decode(bufferlist::iterator& bl)
2244 {
2245 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2246 if (struct_v >= 2)
2247 ::decode(stamp, bl);
2248 ::decode(reqid, bl);
2249 DECODE_FINISH(bl);
2250 }
2251
2252 void ECommitted::dump(Formatter *f) const {
2253 f->dump_stream("stamp") << stamp;
2254 f->dump_stream("reqid") << reqid;
2255 }
2256
2257 void ECommitted::generate_test_instances(list<ECommitted*>& ls)
2258 {
2259 ls.push_back(new ECommitted);
2260 ls.push_back(new ECommitted);
2261 ls.back()->stamp = utime_t(1, 2);
2262 ls.back()->reqid = metareqid_t(entity_name_t::CLIENT(123), 456);
2263 }
2264
2265 // -----------------------
2266 // ESlaveUpdate
2267
2268 void link_rollback::encode(bufferlist &bl) const
2269 {
2270 ENCODE_START(2, 2, bl);
2271 ::encode(reqid, bl);
2272 ::encode(ino, bl);
2273 ::encode(was_inc, bl);
2274 ::encode(old_ctime, bl);
2275 ::encode(old_dir_mtime, bl);
2276 ::encode(old_dir_rctime, bl);
2277 ENCODE_FINISH(bl);
2278 }
2279
2280 void link_rollback::decode(bufferlist::iterator &bl)
2281 {
2282 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2283 ::decode(reqid, bl);
2284 ::decode(ino, bl);
2285 ::decode(was_inc, bl);
2286 ::decode(old_ctime, bl);
2287 ::decode(old_dir_mtime, bl);
2288 ::decode(old_dir_rctime, bl);
2289 DECODE_FINISH(bl);
2290 }
2291
2292 void link_rollback::dump(Formatter *f) const
2293 {
2294 f->dump_stream("metareqid") << reqid;
2295 f->dump_int("ino", ino);
2296 f->dump_string("was incremented", was_inc ? "true" : "false");
2297 f->dump_stream("old_ctime") << old_ctime;
2298 f->dump_stream("old_dir_mtime") << old_dir_mtime;
2299 f->dump_stream("old_dir_rctime") << old_dir_rctime;
2300 }
2301
2302 void link_rollback::generate_test_instances(list<link_rollback*>& ls)
2303 {
2304 ls.push_back(new link_rollback());
2305 }
2306
2307 void rmdir_rollback::encode(bufferlist& bl) const
2308 {
2309 ENCODE_START(2, 2, bl);
2310 ::encode(reqid, bl);
2311 ::encode(src_dir, bl);
2312 ::encode(src_dname, bl);
2313 ::encode(dest_dir, bl);
2314 ::encode(dest_dname, bl);
2315 ENCODE_FINISH(bl);
2316 }
2317
2318 void rmdir_rollback::decode(bufferlist::iterator& bl)
2319 {
2320 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2321 ::decode(reqid, bl);
2322 ::decode(src_dir, bl);
2323 ::decode(src_dname, bl);
2324 ::decode(dest_dir, bl);
2325 ::decode(dest_dname, bl);
2326 DECODE_FINISH(bl);
2327 }
2328
2329 void rmdir_rollback::dump(Formatter *f) const
2330 {
2331 f->dump_stream("metareqid") << reqid;
2332 f->dump_stream("source directory") << src_dir;
2333 f->dump_string("source dname", src_dname);
2334 f->dump_stream("destination directory") << dest_dir;
2335 f->dump_string("destination dname", dest_dname);
2336 }
2337
2338 void rmdir_rollback::generate_test_instances(list<rmdir_rollback*>& ls)
2339 {
2340 ls.push_back(new rmdir_rollback());
2341 }
2342
2343 void rename_rollback::drec::encode(bufferlist &bl) const
2344 {
2345 ENCODE_START(2, 2, bl);
2346 ::encode(dirfrag, bl);
2347 ::encode(dirfrag_old_mtime, bl);
2348 ::encode(dirfrag_old_rctime, bl);
2349 ::encode(ino, bl);
2350 ::encode(remote_ino, bl);
2351 ::encode(dname, bl);
2352 ::encode(remote_d_type, bl);
2353 ::encode(old_ctime, bl);
2354 ENCODE_FINISH(bl);
2355 }
2356
2357 void rename_rollback::drec::decode(bufferlist::iterator &bl)
2358 {
2359 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2360 ::decode(dirfrag, bl);
2361 ::decode(dirfrag_old_mtime, bl);
2362 ::decode(dirfrag_old_rctime, bl);
2363 ::decode(ino, bl);
2364 ::decode(remote_ino, bl);
2365 ::decode(dname, bl);
2366 ::decode(remote_d_type, bl);
2367 ::decode(old_ctime, bl);
2368 DECODE_FINISH(bl);
2369 }
2370
2371 void rename_rollback::drec::dump(Formatter *f) const
2372 {
2373 f->dump_stream("directory fragment") << dirfrag;
2374 f->dump_stream("directory old mtime") << dirfrag_old_mtime;
2375 f->dump_stream("directory old rctime") << dirfrag_old_rctime;
2376 f->dump_int("ino", ino);
2377 f->dump_int("remote ino", remote_ino);
2378 f->dump_string("dname", dname);
2379 uint32_t type = DTTOIF(remote_d_type) & S_IFMT; // convert to type entries
2380 string type_string;
2381 switch(type) {
2382 case S_IFREG:
2383 type_string = "file"; break;
2384 case S_IFLNK:
2385 type_string = "symlink"; break;
2386 case S_IFDIR:
2387 type_string = "directory"; break;
2388 default:
2389 type_string = "UNKNOWN-" + stringify((int)type); break;
2390 }
2391 f->dump_string("remote dtype", type_string);
2392 f->dump_stream("old ctime") << old_ctime;
2393 }
2394
2395 void rename_rollback::drec::generate_test_instances(list<drec*>& ls)
2396 {
2397 ls.push_back(new drec());
2398 ls.back()->remote_d_type = IFTODT(S_IFREG);
2399 }
2400
2401 void rename_rollback::encode(bufferlist &bl) const
2402 {
2403 ENCODE_START(2, 2, bl);
2404 ::encode(reqid, bl);
2405 encode(orig_src, bl);
2406 encode(orig_dest, bl);
2407 encode(stray, bl);
2408 ::encode(ctime, bl);
2409 ENCODE_FINISH(bl);
2410 }
2411
2412 void rename_rollback::decode(bufferlist::iterator &bl)
2413 {
2414 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2415 ::decode(reqid, bl);
2416 decode(orig_src, bl);
2417 decode(orig_dest, bl);
2418 decode(stray, bl);
2419 ::decode(ctime, bl);
2420 DECODE_FINISH(bl);
2421 }
2422
2423 void rename_rollback::dump(Formatter *f) const
2424 {
2425 f->dump_stream("request id") << reqid;
2426 f->open_object_section("original src drec");
2427 orig_src.dump(f);
2428 f->close_section(); // original src drec
2429 f->open_object_section("original dest drec");
2430 orig_dest.dump(f);
2431 f->close_section(); // original dest drec
2432 f->open_object_section("stray drec");
2433 stray.dump(f);
2434 f->close_section(); // stray drec
2435 f->dump_stream("ctime") << ctime;
2436 }
2437
2438 void rename_rollback::generate_test_instances(list<rename_rollback*>& ls)
2439 {
2440 ls.push_back(new rename_rollback());
2441 ls.back()->orig_src.remote_d_type = IFTODT(S_IFREG);
2442 ls.back()->orig_dest.remote_d_type = IFTODT(S_IFREG);
2443 ls.back()->stray.remote_d_type = IFTODT(S_IFREG);
2444 }
2445
2446 void ESlaveUpdate::encode(bufferlist &bl, uint64_t features) const
2447 {
2448 ENCODE_START(3, 3, bl);
2449 ::encode(stamp, bl);
2450 ::encode(type, bl);
2451 ::encode(reqid, bl);
2452 ::encode(master, bl);
2453 ::encode(op, bl);
2454 ::encode(origop, bl);
2455 ::encode(commit, bl, features);
2456 ::encode(rollback, bl);
2457 ENCODE_FINISH(bl);
2458 }
2459
2460 void ESlaveUpdate::decode(bufferlist::iterator &bl)
2461 {
2462 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2463 if (struct_v >= 2)
2464 ::decode(stamp, bl);
2465 ::decode(type, bl);
2466 ::decode(reqid, bl);
2467 ::decode(master, bl);
2468 ::decode(op, bl);
2469 ::decode(origop, bl);
2470 ::decode(commit, bl);
2471 ::decode(rollback, bl);
2472 DECODE_FINISH(bl);
2473 }
2474
2475 void ESlaveUpdate::dump(Formatter *f) const
2476 {
2477 f->open_object_section("metablob");
2478 commit.dump(f);
2479 f->close_section(); // metablob
2480
2481 f->dump_int("rollback length", rollback.length());
2482 f->dump_string("type", type);
2483 f->dump_stream("metareqid") << reqid;
2484 f->dump_int("master", master);
2485 f->dump_int("op", op);
2486 f->dump_int("original op", origop);
2487 }
2488
2489 void ESlaveUpdate::generate_test_instances(list<ESlaveUpdate*>& ls)
2490 {
2491 ls.push_back(new ESlaveUpdate());
2492 }
2493
2494
2495 void ESlaveUpdate::replay(MDSRank *mds)
2496 {
2497 MDSlaveUpdate *su;
2498 switch (op) {
2499 case ESlaveUpdate::OP_PREPARE:
2500 dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master
2501 << ": applying commit, saving rollback info" << dendl;
2502 su = new MDSlaveUpdate(origop, rollback, _segment->slave_updates);
2503 commit.replay(mds, _segment, su);
2504 mds->mdcache->add_uncommitted_slave_update(reqid, master, su);
2505 break;
2506
2507 case ESlaveUpdate::OP_COMMIT:
2508 su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
2509 if (su) {
2510 dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl;
2511 mds->mdcache->finish_uncommitted_slave_update(reqid, master);
2512 } else {
2513 dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master
2514 << ": ignoring, no previously saved prepare" << dendl;
2515 }
2516 break;
2517
2518 case ESlaveUpdate::OP_ROLLBACK:
2519 dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
2520 << ": applying rollback commit blob" << dendl;
2521 commit.replay(mds, _segment);
2522 su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
2523 if (su)
2524 mds->mdcache->finish_uncommitted_slave_update(reqid, master);
2525 break;
2526
2527 default:
2528 mds->clog->error() << "invalid op in ESlaveUpdate";
2529 mds->damaged();
2530 ceph_abort(); // Should be unreachable because damaged() calls respawn()
2531 }
2532 }
2533
2534
2535 // -----------------------
2536 // ESubtreeMap
2537
2538 void ESubtreeMap::encode(bufferlist& bl, uint64_t features) const
2539 {
2540 ENCODE_START(6, 5, bl);
2541 ::encode(stamp, bl);
2542 ::encode(metablob, bl, features);
2543 ::encode(subtrees, bl);
2544 ::encode(ambiguous_subtrees, bl);
2545 ::encode(expire_pos, bl);
2546 ::encode(event_seq, bl);
2547 ENCODE_FINISH(bl);
2548 }
2549
2550 void ESubtreeMap::decode(bufferlist::iterator &bl)
2551 {
2552 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2553 if (struct_v >= 2)
2554 ::decode(stamp, bl);
2555 ::decode(metablob, bl);
2556 ::decode(subtrees, bl);
2557 if (struct_v >= 4)
2558 ::decode(ambiguous_subtrees, bl);
2559 if (struct_v >= 3)
2560 ::decode(expire_pos, bl);
2561 if (struct_v >= 6)
2562 ::decode(event_seq, bl);
2563 DECODE_FINISH(bl);
2564 }
2565
2566 void ESubtreeMap::dump(Formatter *f) const
2567 {
2568 f->open_object_section("metablob");
2569 metablob.dump(f);
2570 f->close_section(); // metablob
2571
2572 f->open_array_section("subtrees");
2573 for(map<dirfrag_t,vector<dirfrag_t> >::const_iterator i = subtrees.begin();
2574 i != subtrees.end(); ++i) {
2575 f->open_object_section("tree");
2576 f->dump_stream("root dirfrag") << i->first;
2577 for (vector<dirfrag_t>::const_iterator j = i->second.begin();
2578 j != i->second.end(); ++j) {
2579 f->dump_stream("bound dirfrag") << *j;
2580 }
2581 f->close_section(); // tree
2582 }
2583 f->close_section(); // subtrees
2584
2585 f->open_array_section("ambiguous subtrees");
2586 for(set<dirfrag_t>::const_iterator i = ambiguous_subtrees.begin();
2587 i != ambiguous_subtrees.end(); ++i) {
2588 f->dump_stream("dirfrag") << *i;
2589 }
2590 f->close_section(); // ambiguous subtrees
2591
2592 f->dump_int("expire position", expire_pos);
2593 }
2594
2595 void ESubtreeMap::generate_test_instances(list<ESubtreeMap*>& ls)
2596 {
2597 ls.push_back(new ESubtreeMap());
2598 }
2599
2600 void ESubtreeMap::replay(MDSRank *mds)
2601 {
2602 if (expire_pos && expire_pos > mds->mdlog->journaler->get_expire_pos())
2603 mds->mdlog->journaler->set_expire_pos(expire_pos);
2604
2605 // suck up the subtree map?
2606 if (mds->mdcache->is_subtrees()) {
2607 dout(10) << "ESubtreeMap.replay -- i already have import map; verifying" << dendl;
2608 int errors = 0;
2609
2610 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2611 p != subtrees.end();
2612 ++p) {
2613 CDir *dir = mds->mdcache->get_dirfrag(p->first);
2614 if (!dir) {
2615 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2616 << " subtree root " << p->first << " not in cache";
2617 ++errors;
2618 continue;
2619 }
2620
2621 if (!mds->mdcache->is_subtree(dir)) {
2622 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2623 << " subtree root " << p->first << " not a subtree in cache";
2624 ++errors;
2625 continue;
2626 }
2627 if (dir->get_dir_auth().first != mds->get_nodeid()) {
2628 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2629 << " subtree root " << p->first
2630 << " is not mine in cache (it's " << dir->get_dir_auth() << ")";
2631 ++errors;
2632 continue;
2633 }
2634
2635 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
2636 mds->mdcache->get_force_dirfrag(*q, true);
2637
2638 set<CDir*> bounds;
2639 mds->mdcache->get_subtree_bounds(dir, bounds);
2640 for (vector<dirfrag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
2641 CDir *b = mds->mdcache->get_dirfrag(*q);
2642 if (!b) {
2643 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2644 << " subtree " << p->first << " bound " << *q << " not in cache";
2645 ++errors;
2646 continue;
2647 }
2648 if (bounds.count(b) == 0) {
2649 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2650 << " subtree " << p->first << " bound " << *q << " not a bound in cache";
2651 ++errors;
2652 continue;
2653 }
2654 bounds.erase(b);
2655 }
2656 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q) {
2657 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2658 << " subtree " << p->first << " has extra bound in cache " << (*q)->dirfrag();
2659 ++errors;
2660 }
2661
2662 if (ambiguous_subtrees.count(p->first)) {
2663 if (!mds->mdcache->have_ambiguous_import(p->first)) {
2664 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2665 << " subtree " << p->first << " is ambiguous but is not in our cache";
2666 ++errors;
2667 }
2668 } else {
2669 if (mds->mdcache->have_ambiguous_import(p->first)) {
2670 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2671 << " subtree " << p->first << " is not ambiguous but is in our cache";
2672 ++errors;
2673 }
2674 }
2675 }
2676
2677 list<CDir*> subs;
2678 mds->mdcache->list_subtrees(subs);
2679 for (list<CDir*>::iterator p = subs.begin(); p != subs.end(); ++p) {
2680 CDir *dir = *p;
2681 if (dir->get_dir_auth().first != mds->get_nodeid())
2682 continue;
2683 if (subtrees.count(dir->dirfrag()) == 0) {
2684 mds->clog->error() << " replayed ESubtreeMap at " << get_start_off()
2685 << " does not include cache subtree " << dir->dirfrag();
2686 ++errors;
2687 }
2688 }
2689
2690 if (errors) {
2691 dout(0) << "journal subtrees: " << subtrees << dendl;
2692 dout(0) << "journal ambig_subtrees: " << ambiguous_subtrees << dendl;
2693 mds->mdcache->show_subtrees();
2694 assert(!g_conf->mds_debug_subtrees || errors == 0);
2695 }
2696 return;
2697 }
2698
2699 dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl;
2700
2701 // first, stick the spanning tree in my cache
2702 //metablob.print(*_dout);
2703 metablob.replay(mds, _segment);
2704
2705 // restore import/export maps
2706 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
2707 p != subtrees.end();
2708 ++p) {
2709 CDir *dir = mds->mdcache->get_dirfrag(p->first);
2710 assert(dir);
2711 if (ambiguous_subtrees.count(p->first)) {
2712 // ambiguous!
2713 mds->mdcache->add_ambiguous_import(p->first, p->second);
2714 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second,
2715 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2716 } else {
2717 // not ambiguous
2718 mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid());
2719 }
2720 }
2721
2722 mds->mdcache->recalc_auth_bits(true);
2723
2724 mds->mdcache->show_subtrees();
2725 }
2726
2727
2728
2729 // -----------------------
2730 // EFragment
2731
2732 void EFragment::replay(MDSRank *mds)
2733 {
2734 dout(10) << "EFragment.replay " << op_name(op) << " " << ino << " " << basefrag << " by " << bits << dendl;
2735
2736 list<CDir*> resultfrags;
2737 list<MDSInternalContextBase*> waiters;
2738 list<frag_t> old_frags;
2739
2740 // in may be NULL if it wasn't in our cache yet. if it's a prepare
2741 // it will be once we replay the metablob , but first we need to
2742 // refragment anything we already have in the cache.
2743 CInode *in = mds->mdcache->get_inode(ino);
2744
2745 switch (op) {
2746 case OP_PREPARE:
2747 mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, _segment, &rollback);
2748
2749 if (in)
2750 mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters, true);
2751 break;
2752
2753 case OP_ROLLBACK:
2754 if (in) {
2755 in->dirfragtree.get_leaves_under(basefrag, old_frags);
2756 if (orig_frags.empty()) {
2757 // old format EFragment
2758 mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true);
2759 } else {
2760 for (list<frag_t>::iterator p = orig_frags.begin(); p != orig_frags.end(); ++p)
2761 mds->mdcache->force_dir_fragment(in, *p);
2762 }
2763 }
2764 mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), old_frags);
2765 break;
2766
2767 case OP_COMMIT:
2768 case OP_FINISH:
2769 mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
2770 break;
2771
2772 default:
2773 ceph_abort();
2774 }
2775
2776 metablob.replay(mds, _segment);
2777 if (in && g_conf->mds_debug_frag)
2778 in->verify_dirfrags();
2779 }
2780
2781 void EFragment::encode(bufferlist &bl, uint64_t features) const {
2782 ENCODE_START(5, 4, bl);
2783 ::encode(stamp, bl);
2784 ::encode(op, bl);
2785 ::encode(ino, bl);
2786 ::encode(basefrag, bl);
2787 ::encode(bits, bl);
2788 ::encode(metablob, bl, features);
2789 ::encode(orig_frags, bl);
2790 ::encode(rollback, bl);
2791 ENCODE_FINISH(bl);
2792 }
2793
2794 void EFragment::decode(bufferlist::iterator &bl) {
2795 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
2796 if (struct_v >= 2)
2797 ::decode(stamp, bl);
2798 if (struct_v >= 3)
2799 ::decode(op, bl);
2800 ::decode(ino, bl);
2801 ::decode(basefrag, bl);
2802 ::decode(bits, bl);
2803 ::decode(metablob, bl);
2804 if (struct_v >= 5) {
2805 ::decode(orig_frags, bl);
2806 ::decode(rollback, bl);
2807 }
2808 DECODE_FINISH(bl);
2809 }
2810
2811 void EFragment::dump(Formatter *f) const
2812 {
2813 /*f->open_object_section("Metablob");
2814 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2815 f->close_section();*/
2816 f->dump_string("op", op_name(op));
2817 f->dump_stream("ino") << ino;
2818 f->dump_stream("base frag") << basefrag;
2819 f->dump_int("bits", bits);
2820 }
2821
2822 void EFragment::generate_test_instances(list<EFragment*>& ls)
2823 {
2824 ls.push_back(new EFragment);
2825 ls.push_back(new EFragment);
2826 ls.back()->op = OP_PREPARE;
2827 ls.back()->ino = 1;
2828 ls.back()->bits = 5;
2829 }
2830
2831 void dirfrag_rollback::encode(bufferlist &bl) const
2832 {
2833 ENCODE_START(1, 1, bl);
2834 ::encode(fnode, bl);
2835 ENCODE_FINISH(bl);
2836 }
2837
2838 void dirfrag_rollback::decode(bufferlist::iterator &bl)
2839 {
2840 DECODE_START(1, bl);
2841 ::decode(fnode, bl);
2842 DECODE_FINISH(bl);
2843 }
2844
2845
2846
2847 // =========================================================================
2848
2849 // -----------------------
2850 // EExport
2851
2852 void EExport::replay(MDSRank *mds)
2853 {
2854 dout(10) << "EExport.replay " << base << dendl;
2855 metablob.replay(mds, _segment);
2856
2857 CDir *dir = mds->mdcache->get_dirfrag(base);
2858 assert(dir);
2859
2860 set<CDir*> realbounds;
2861 for (set<dirfrag_t>::iterator p = bounds.begin();
2862 p != bounds.end();
2863 ++p) {
2864 CDir *bd = mds->mdcache->get_dirfrag(*p);
2865 assert(bd);
2866 realbounds.insert(bd);
2867 }
2868
2869 // adjust auth away
2870 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, CDIR_AUTH_UNDEF);
2871
2872 mds->mdcache->try_trim_non_auth_subtree(dir);
2873 }
2874
2875 void EExport::encode(bufferlist& bl, uint64_t features) const
2876 {
2877 ENCODE_START(3, 3, bl);
2878 ::encode(stamp, bl);
2879 ::encode(metablob, bl, features);
2880 ::encode(base, bl);
2881 ::encode(bounds, bl);
2882 ENCODE_FINISH(bl);
2883 }
2884
2885 void EExport::decode(bufferlist::iterator &bl)
2886 {
2887 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2888 if (struct_v >= 2)
2889 ::decode(stamp, bl);
2890 ::decode(metablob, bl);
2891 ::decode(base, bl);
2892 ::decode(bounds, bl);
2893 DECODE_FINISH(bl);
2894 }
2895
2896 void EExport::dump(Formatter *f) const
2897 {
2898 f->dump_float("stamp", (double)stamp);
2899 /*f->open_object_section("Metablob");
2900 metablob.dump(f); // sadly we don't have this; dunno if we'll get it
2901 f->close_section();*/
2902 f->dump_stream("base dirfrag") << base;
2903 f->open_array_section("bounds dirfrags");
2904 for (set<dirfrag_t>::const_iterator i = bounds.begin();
2905 i != bounds.end(); ++i) {
2906 f->dump_stream("dirfrag") << *i;
2907 }
2908 f->close_section(); // bounds dirfrags
2909 }
2910
2911 void EExport::generate_test_instances(list<EExport*>& ls)
2912 {
2913 EExport *sample = new EExport();
2914 ls.push_back(sample);
2915 }
2916
2917
2918 // -----------------------
2919 // EImportStart
2920
2921 void EImportStart::update_segment()
2922 {
2923 _segment->sessionmapv = cmapv;
2924 }
2925
2926 void EImportStart::replay(MDSRank *mds)
2927 {
2928 dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
2929 //metablob.print(*_dout);
2930 metablob.replay(mds, _segment);
2931
2932 // put in ambiguous import list
2933 mds->mdcache->add_ambiguous_import(base, bounds);
2934
2935 // set auth partially to us so we don't trim it
2936 CDir *dir = mds->mdcache->get_dirfrag(base);
2937 assert(dir);
2938
2939 set<CDir*> realbounds;
2940 for (vector<dirfrag_t>::iterator p = bounds.begin();
2941 p != bounds.end();
2942 ++p) {
2943 CDir *bd = mds->mdcache->get_dirfrag(*p);
2944 assert(bd);
2945 if (!bd->is_subtree_root())
2946 bd->state_clear(CDir::STATE_AUTH);
2947 realbounds.insert(bd);
2948 }
2949
2950 mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds,
2951 mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
2952
2953 // open client sessions?
2954 if (mds->sessionmap.get_version() >= cmapv) {
2955 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
2956 << " >= " << cmapv << ", noop" << dendl;
2957 } else {
2958 dout(10) << "EImportStart.replay sessionmap " << mds->sessionmap.get_version()
2959 << " < " << cmapv << dendl;
2960 map<client_t,entity_inst_t> cm;
2961 bufferlist::iterator blp = client_map.begin();
2962 ::decode(cm, blp);
2963 mds->sessionmap.open_sessions(cm);
2964 assert(mds->sessionmap.get_version() == cmapv);
2965 mds->sessionmap.set_projected(mds->sessionmap.get_version());
2966 }
2967 update_segment();
2968 }
2969
2970 void EImportStart::encode(bufferlist &bl, uint64_t features) const {
2971 ENCODE_START(3, 3, bl);
2972 ::encode(stamp, bl);
2973 ::encode(base, bl);
2974 ::encode(metablob, bl, features);
2975 ::encode(bounds, bl);
2976 ::encode(cmapv, bl);
2977 ::encode(client_map, bl);
2978 ENCODE_FINISH(bl);
2979 }
2980
2981 void EImportStart::decode(bufferlist::iterator &bl) {
2982 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
2983 if (struct_v >= 2)
2984 ::decode(stamp, bl);
2985 ::decode(base, bl);
2986 ::decode(metablob, bl);
2987 ::decode(bounds, bl);
2988 ::decode(cmapv, bl);
2989 ::decode(client_map, bl);
2990 DECODE_FINISH(bl);
2991 }
2992
2993 void EImportStart::dump(Formatter *f) const
2994 {
2995 f->dump_stream("base dirfrag") << base;
2996 f->open_array_section("boundary dirfrags");
2997 for (vector<dirfrag_t>::const_iterator iter = bounds.begin();
2998 iter != bounds.end(); ++iter) {
2999 f->dump_stream("frag") << *iter;
3000 }
3001 f->close_section();
3002 }
3003
3004 void EImportStart::generate_test_instances(list<EImportStart*>& ls)
3005 {
3006 ls.push_back(new EImportStart);
3007 }
3008
3009 // -----------------------
3010 // EImportFinish
3011
3012 void EImportFinish::replay(MDSRank *mds)
3013 {
3014 if (mds->mdcache->have_ambiguous_import(base)) {
3015 dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl;
3016 if (success) {
3017 mds->mdcache->finish_ambiguous_import(base);
3018 } else {
3019 CDir *dir = mds->mdcache->get_dirfrag(base);
3020 assert(dir);
3021 vector<dirfrag_t> bounds;
3022 mds->mdcache->get_ambiguous_import_bounds(base, bounds);
3023 mds->mdcache->adjust_bounded_subtree_auth(dir, bounds, CDIR_AUTH_UNDEF);
3024 mds->mdcache->cancel_ambiguous_import(dir);
3025 mds->mdcache->try_trim_non_auth_subtree(dir);
3026 }
3027 } else {
3028 // this shouldn't happen unless this is an old journal
3029 dout(10) << "EImportFinish.replay " << base << " success=" << success
3030 << " on subtree not marked as ambiguous"
3031 << dendl;
3032 mds->clog->error() << "failure replaying journal (EImportFinish)";
3033 mds->damaged();
3034 ceph_abort(); // Should be unreachable because damaged() calls respawn()
3035 }
3036 }
3037
3038 void EImportFinish::encode(bufferlist& bl, uint64_t features) const
3039 {
3040 ENCODE_START(3, 3, bl);
3041 ::encode(stamp, bl);
3042 ::encode(base, bl);
3043 ::encode(success, bl);
3044 ENCODE_FINISH(bl);
3045 }
3046
3047 void EImportFinish::decode(bufferlist::iterator &bl)
3048 {
3049 DECODE_START_LEGACY_COMPAT_LEN(3, 3, 3, bl);
3050 if (struct_v >= 2)
3051 ::decode(stamp, bl);
3052 ::decode(base, bl);
3053 ::decode(success, bl);
3054 DECODE_FINISH(bl);
3055 }
3056
3057 void EImportFinish::dump(Formatter *f) const
3058 {
3059 f->dump_stream("base dirfrag") << base;
3060 f->dump_string("success", success ? "true" : "false");
3061 }
3062 void EImportFinish::generate_test_instances(list<EImportFinish*>& ls)
3063 {
3064 ls.push_back(new EImportFinish);
3065 ls.push_back(new EImportFinish);
3066 ls.back()->success = true;
3067 }
3068
3069
3070 // ------------------------
3071 // EResetJournal
3072
3073 void EResetJournal::encode(bufferlist& bl, uint64_t features) const
3074 {
3075 ENCODE_START(2, 2, bl);
3076 ::encode(stamp, bl);
3077 ENCODE_FINISH(bl);
3078 }
3079
3080 void EResetJournal::decode(bufferlist::iterator &bl)
3081 {
3082 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
3083 ::decode(stamp, bl);
3084 DECODE_FINISH(bl);
3085 }
3086
3087 void EResetJournal::dump(Formatter *f) const
3088 {
3089 f->dump_stream("timestamp") << stamp;
3090 }
3091
3092 void EResetJournal::generate_test_instances(list<EResetJournal*>& ls)
3093 {
3094 ls.push_back(new EResetJournal());
3095 }
3096
3097 void EResetJournal::replay(MDSRank *mds)
3098 {
3099 dout(1) << "EResetJournal" << dendl;
3100
3101 mds->sessionmap.wipe();
3102 mds->inotable->replay_reset();
3103
3104 if (mds->mdsmap->get_root() == mds->get_nodeid()) {
3105 CDir *rootdir = mds->mdcache->get_root()->get_or_open_dirfrag(mds->mdcache, frag_t());
3106 mds->mdcache->adjust_subtree_auth(rootdir, mds->get_nodeid());
3107 }
3108
3109 CDir *mydir = mds->mdcache->get_myin()->get_or_open_dirfrag(mds->mdcache, frag_t());
3110 mds->mdcache->adjust_subtree_auth(mydir, mds->get_nodeid());
3111
3112 mds->mdcache->recalc_auth_bits(true);
3113
3114 mds->mdcache->show_subtrees();
3115 }
3116
3117
3118 void ENoOp::encode(bufferlist &bl, uint64_t features) const
3119 {
3120 ENCODE_START(2, 2, bl);
3121 ::encode(pad_size, bl);
3122 uint8_t const pad = 0xff;
3123 for (unsigned int i = 0; i < pad_size; ++i) {
3124 ::encode(pad, bl);
3125 }
3126 ENCODE_FINISH(bl);
3127 }
3128
3129
3130 void ENoOp::decode(bufferlist::iterator &bl)
3131 {
3132 DECODE_START(2, bl);
3133 ::decode(pad_size, bl);
3134 if (bl.get_remaining() != pad_size) {
3135 // This is spiritually an assertion, but expressing in a way that will let
3136 // journal debug tools catch it and recognise a malformed entry.
3137 throw buffer::end_of_buffer();
3138 } else {
3139 bl.advance(pad_size);
3140 }
3141 DECODE_FINISH(bl);
3142 }
3143
3144
3145 void ENoOp::replay(MDSRank *mds)
3146 {
3147 dout(4) << "ENoOp::replay, " << pad_size << " bytes skipped in journal" << dendl;
3148 }
3149
3150 /**
3151 * If re-formatting an old journal that used absolute log position
3152 * references as segment sequence numbers, use this function to update
3153 * it.
3154 *
3155 * @param mds
3156 * MDSRank instance, just used for logging
3157 * @param old_to_new
3158 * Map of old journal segment sequence numbers to new journal segment sequence numbers
3159 *
3160 * @return
3161 * True if the event was modified.
3162 */
3163 bool EMetaBlob::rewrite_truncate_finish(MDSRank const *mds,
3164 std::map<log_segment_seq_t, log_segment_seq_t> const &old_to_new)
3165 {
3166 bool modified = false;
3167 map<inodeno_t, log_segment_seq_t> new_trunc_finish;
3168 for (std::map<inodeno_t, log_segment_seq_t>::iterator i = truncate_finish.begin();
3169 i != truncate_finish.end(); ++i) {
3170 if (old_to_new.count(i->second)) {
3171 dout(20) << __func__ << " applying segment seq mapping "
3172 << i->second << " -> " << old_to_new.find(i->second)->second << dendl;
3173 new_trunc_finish[i->first] = old_to_new.find(i->second)->second;
3174 modified = true;
3175 } else {
3176 dout(20) << __func__ << " no segment seq mapping found for "
3177 << i->second << dendl;
3178 new_trunc_finish[i->first] = i->second;
3179 }
3180 }
3181 truncate_finish = new_trunc_finish;
3182
3183 return modified;
3184 }