1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef CEPH_MDS_EMETABLOB_H
16 #define CEPH_MDS_EMETABLOB_H
20 #include "../CInode.h"
22 #include "../CDentry.h"
23 #include "../LogSegment.h"
25 #include "include/interval_set.h"
33 * a bunch of metadata in the journal
38 * - make sure you adjust the inode.version for any modified inode you
39 * journal. CDir and CDentry maintain a projected_version, but CInode
40 * doesn't, since the journaled inode usually has to be modified
41 * manually anyway (to delay the change in the MDS's cache until after
50 /* fullbit - a regular dentry + inode
52 * We encode this one a bit weirdly, just because (also, it's marginally faster
53 * on multiple encodes, which I think can happen):
54 * Encode a bufferlist on struct creation with all data members, without a struct_v.
55 * When encode is called, encode struct_v and then append the bufferlist.
56 * Decode straight into the appropriate variables.
58 * So, if you add members, encode them in the constructor and then change
59 * the struct_v in the encode function!
62 static const int STATE_DIRTY
= (1<<0);
63 static const int STATE_DIRTYPARENT
= (1<<1);
64 static const int STATE_DIRTYPOOL
= (1<<2);
65 static const int STATE_NEED_SNAPFLUSH
= (1<<3);
66 typedef compact_map
<snapid_t
, old_inode_t
> old_inodes_t
;
68 snapid_t dnfirst
, dnlast
;
70 inode_t inode
; // if it's not
71 fragtree_t dirfragtree
;
72 map
<string
,bufferptr
> xattrs
;
77 old_inodes_t old_inodes
;
79 fullbit(const fullbit
& o
);
80 const fullbit
& operator=(const fullbit
& o
);
82 fullbit(const string
& d
, snapid_t df
, snapid_t dl
,
83 version_t v
, const inode_t
& i
, const fragtree_t
&dft
,
84 const map
<string
,bufferptr
> &xa
, const string
& sym
,
85 snapid_t os
, const bufferlist
&sbl
, __u8 st
,
86 const old_inodes_t
*oi
= NULL
) :
87 dn(d
), dnfirst(df
), dnlast(dl
), dnv(v
), inode(i
), xattrs(xa
),
88 oldest_snap(os
), state(st
)
98 explicit fullbit(bufferlist::iterator
&p
) {
104 void encode(bufferlist
& bl
, uint64_t features
) const;
105 void decode(bufferlist::iterator
&bl
);
106 void dump(Formatter
*f
) const;
107 static void generate_test_instances(list
<EMetaBlob::fullbit
*>& ls
);
109 void update_inode(MDSRank
*mds
, CInode
*in
);
110 bool is_dirty() const { return (state
& STATE_DIRTY
); }
111 bool is_dirty_parent() const { return (state
& STATE_DIRTYPARENT
); }
112 bool is_dirty_pool() const { return (state
& STATE_DIRTYPOOL
); }
113 bool need_snapflush() const { return (state
& STATE_NEED_SNAPFLUSH
); }
115 void print(ostream
& out
) const {
116 out
<< " fullbit dn " << dn
<< " [" << dnfirst
<< "," << dnlast
<< "] dnv " << dnv
117 << " inode " << inode
.ino
118 << " state=" << state
<< std::endl
;
120 string
state_string() const {
122 bool marked_already
= false;
124 state_string
.append("dirty");
125 marked_already
= true;
127 if (is_dirty_parent()) {
128 state_string
.append(marked_already
? "+dirty_parent" : "dirty_parent");
130 state_string
.append("+dirty_pool");
135 WRITE_CLASS_ENCODER_FEATURES(fullbit
)
137 /* remotebit - a dentry + remote inode link (i.e. just an ino)
141 snapid_t dnfirst
, dnlast
;
144 unsigned char d_type
;
147 remotebit(const string
& d
, snapid_t df
, snapid_t dl
, version_t v
, inodeno_t i
, unsigned char dt
, bool dr
) :
148 dn(d
), dnfirst(df
), dnlast(dl
), dnv(v
), ino(i
), d_type(dt
), dirty(dr
) { }
149 explicit remotebit(bufferlist::iterator
&p
) { decode(p
); }
150 remotebit(): dnfirst(0), dnlast(0), dnv(0), ino(0),
151 d_type('\0'), dirty(false) {}
153 void encode(bufferlist
& bl
) const;
154 void decode(bufferlist::iterator
&bl
);
155 void print(ostream
& out
) const {
156 out
<< " remotebit dn " << dn
<< " [" << dnfirst
<< "," << dnlast
<< "] dnv " << dnv
158 << " dirty=" << dirty
<< std::endl
;
160 void dump(Formatter
*f
) const;
161 static void generate_test_instances(list
<remotebit
*>& ls
);
163 WRITE_CLASS_ENCODER(remotebit
)
166 * nullbit - a null dentry
170 snapid_t dnfirst
, dnlast
;
174 nullbit(const string
& d
, snapid_t df
, snapid_t dl
, version_t v
, bool dr
) :
175 dn(d
), dnfirst(df
), dnlast(dl
), dnv(v
), dirty(dr
) { }
176 explicit nullbit(bufferlist::iterator
&p
) { decode(p
); }
177 nullbit(): dnfirst(0), dnlast(0), dnv(0), dirty(false) {}
179 void encode(bufferlist
& bl
) const;
180 void decode(bufferlist::iterator
&bl
);
181 void dump(Formatter
*f
) const;
182 static void generate_test_instances(list
<nullbit
*>& ls
);
183 void print(ostream
& out
) {
184 out
<< " nullbit dn " << dn
<< " [" << dnfirst
<< "," << dnlast
<< "] dnv " << dnv
185 << " dirty=" << dirty
<< std::endl
;
188 WRITE_CLASS_ENCODER(nullbit
)
191 /* dirlump - contains metadata for any dir we have contents for.
195 static const int STATE_COMPLETE
= (1<<1);
196 static const int STATE_DIRTY
= (1<<2); // dirty due to THIS journal item, that is!
197 static const int STATE_NEW
= (1<<3); // new directory
198 static const int STATE_IMPORTING
= (1<<4); // importing directory
199 static const int STATE_DIRTYDFT
= (1<<5); // dirty dirfragtree
204 __u32 nfull
, nremote
, nnull
;
207 mutable bufferlist dnbl
;
208 mutable bool dn_decoded
;
209 mutable list
<ceph::shared_ptr
<fullbit
> > dfull
;
210 mutable list
<remotebit
> dremote
;
211 mutable list
<nullbit
> dnull
;
214 dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
216 bool is_complete() const { return state
& STATE_COMPLETE
; }
217 void mark_complete() { state
|= STATE_COMPLETE
; }
218 bool is_dirty() const { return state
& STATE_DIRTY
; }
219 void mark_dirty() { state
|= STATE_DIRTY
; }
220 bool is_new() const { return state
& STATE_NEW
; }
221 void mark_new() { state
|= STATE_NEW
; }
222 bool is_importing() { return state
& STATE_IMPORTING
; }
223 void mark_importing() { state
|= STATE_IMPORTING
; }
224 bool is_dirty_dft() { return state
& STATE_DIRTYDFT
; }
225 void mark_dirty_dft() { state
|= STATE_DIRTYDFT
; }
227 const list
<ceph::shared_ptr
<fullbit
> > &get_dfull() const { return dfull
; }
228 const list
<remotebit
> &get_dremote() const { return dremote
; }
229 const list
<nullbit
> &get_dnull() const { return dnull
; }
231 void add_dnull(nullbit
const &n
) { dnull
.push_back(n
); };
232 void add_dfull(ceph::shared_ptr
<fullbit
> const &p
) { dfull
.push_back(p
); };
233 void add_dremote(remotebit
const &r
) { dremote
.push_back(r
); };
235 void print(dirfrag_t dirfrag
, ostream
& out
) {
236 out
<< "dirlump " << dirfrag
<< " v " << fnode
.version
237 << " state " << state
238 << " num " << nfull
<< "/" << nremote
<< "/" << nnull
241 for (list
<ceph::shared_ptr
<fullbit
> >::iterator p
= dfull
.begin(); p
!= dfull
.end(); ++p
)
243 for (list
<remotebit
>::iterator p
= dremote
.begin(); p
!= dremote
.end(); ++p
)
245 for (list
<nullbit
>::iterator p
= dnull
.begin(); p
!= dnull
.end(); ++p
)
249 string
state_string() const {
251 bool marked_already
= false;
253 state_string
.append("complete");
254 marked_already
= true;
257 state_string
.append(marked_already
? "+dirty" : "dirty");
258 marked_already
= true;
261 state_string
.append(marked_already
? "+new" : "new");
266 // if this changes, update the versioning in encode for it!
267 void _encode_bits(uint64_t features
) const {
268 if (!dn_decoded
) return;
269 ::encode(dfull
, dnbl
, features
);
270 ::encode(dremote
, dnbl
);
271 ::encode(dnull
, dnbl
);
273 void _decode_bits() const {
274 if (dn_decoded
) return;
275 bufferlist::iterator p
= dnbl
.begin();
277 ::decode(dremote
, p
);
282 void encode(bufferlist
& bl
, uint64_t features
) const;
283 void decode(bufferlist::iterator
&bl
);
284 void dump(Formatter
*f
) const;
285 static void generate_test_instances(list
<dirlump
*>& ls
);
287 WRITE_CLASS_ENCODER_FEATURES(dirlump
)
289 // my lumps. preserve the order we added them in a list.
290 list
<dirfrag_t
> lump_order
;
291 map
<dirfrag_t
, dirlump
> lump_map
;
292 list
<ceph::shared_ptr
<fullbit
> > roots
;
294 list
<pair
<__u8
,version_t
> > table_tids
; // tableclient transactions
296 inodeno_t opened_ino
;
298 inodeno_t renamed_dirino
;
299 list
<frag_t
> renamed_dir_frags
;
302 // ino (pre)allocation. may involve both inotable AND session state.
303 version_t inotablev
, sessionmapv
;
304 inodeno_t allocated_ino
; // inotable
305 interval_set
<inodeno_t
> preallocated_inos
; // inotable + session
306 inodeno_t used_preallocated_ino
; // session
307 entity_name_t client_name
; // session
309 // inodes i've truncated
310 list
<inodeno_t
> truncate_start
; // start truncate
311 map
<inodeno_t
, log_segment_seq_t
> truncate_finish
; // finished truncate (started in segment blah)
314 vector
<inodeno_t
> destroyed_inodes
;
318 list
<pair
<metareqid_t
,uint64_t> > client_reqs
;
319 list
<pair
<metareqid_t
,uint64_t> > client_flushes
;
322 void encode(bufferlist
& bl
, uint64_t features
) const;
323 void decode(bufferlist::iterator
& bl
);
324 void get_inodes(std::set
<inodeno_t
> &inodes
) const;
325 void get_paths(std::vector
<std::string
> &paths
) const;
326 void get_dentries(std::map
<dirfrag_t
, std::set
<std::string
> > &dentries
) const;
327 entity_name_t
get_client_name() const {return client_name
;}
329 void dump(Formatter
*f
) const;
330 static void generate_test_instances(list
<EMetaBlob
*>& ls
);
332 uint64_t last_subtree_map
;
335 // for replay, in certain cases
336 //LogSegment *_segment;
338 explicit EMetaBlob(MDLog
*mdl
= 0); // defined in journal.cc
341 void print(ostream
& out
) {
342 for (list
<dirfrag_t
>::iterator p
= lump_order
.begin();
343 p
!= lump_order
.end();
345 lump_map
[*p
].print(*p
, out
);
349 void add_client_req(metareqid_t r
, uint64_t tid
=0) {
350 client_reqs
.push_back(pair
<metareqid_t
,uint64_t>(r
, tid
));
352 void add_client_flush(metareqid_t r
, uint64_t tid
=0) {
353 client_flushes
.push_back(pair
<metareqid_t
,uint64_t>(r
, tid
));
356 void add_table_transaction(int table
, version_t tid
) {
357 table_tids
.push_back(pair
<__u8
, version_t
>(table
, tid
));
360 void add_opened_ino(inodeno_t ino
) {
365 void set_ino_alloc(inodeno_t alloc
,
366 inodeno_t used_prealloc
,
367 interval_set
<inodeno_t
>& prealloc
,
368 entity_name_t client
,
369 version_t sv
, version_t iv
) {
370 allocated_ino
= alloc
;
371 used_preallocated_ino
= used_prealloc
;
372 preallocated_inos
= prealloc
;
373 client_name
= client
;
378 void add_truncate_start(inodeno_t ino
) {
379 truncate_start
.push_back(ino
);
381 void add_truncate_finish(inodeno_t ino
, uint64_t segoff
) {
382 truncate_finish
[ino
] = segoff
;
385 bool rewrite_truncate_finish(MDSRank
const *mds
, std::map
<uint64_t, uint64_t> const &old_to_new
);
387 void add_destroyed_inode(inodeno_t ino
) {
388 destroyed_inodes
.push_back(ino
);
391 void add_null_dentry(CDentry
*dn
, bool dirty
) {
392 add_null_dentry(add_dir(dn
->get_dir(), false), dn
, dirty
);
394 void add_null_dentry(dirlump
& lump
, CDentry
*dn
, bool dirty
) {
397 lump
.add_dnull(nullbit(dn
->get_name(),
399 dn
->get_projected_version(),
403 void add_remote_dentry(CDentry
*dn
, bool dirty
) {
404 add_remote_dentry(add_dir(dn
->get_dir(), false), dn
, dirty
, 0, 0);
406 void add_remote_dentry(CDentry
*dn
, bool dirty
, inodeno_t rino
, int rdt
) {
407 add_remote_dentry(add_dir(dn
->get_dir(), false), dn
, dirty
, rino
, rdt
);
409 void add_remote_dentry(dirlump
& lump
, CDentry
*dn
, bool dirty
,
410 inodeno_t rino
=0, unsigned char rdt
=0) {
412 rino
= dn
->get_projected_linkage()->get_remote_ino();
413 rdt
= dn
->get_projected_linkage()->get_remote_d_type();
416 lump
.add_dremote(remotebit(dn
->get_name(),
418 dn
->get_projected_version(),
423 // return remote pointer to to-be-journaled inode
424 void add_primary_dentry(CDentry
*dn
, CInode
*in
, bool dirty
,
425 bool dirty_parent
=false, bool dirty_pool
=false,
426 bool need_snapflush
=false) {
428 if (dirty
) state
|= fullbit::STATE_DIRTY
;
429 if (dirty_parent
) state
|= fullbit::STATE_DIRTYPARENT
;
430 if (dirty_pool
) state
|= fullbit::STATE_DIRTYPOOL
;
431 if (need_snapflush
) state
|= fullbit::STATE_NEED_SNAPFLUSH
;
432 add_primary_dentry(add_dir(dn
->get_dir(), false), dn
, in
, state
);
434 void add_primary_dentry(dirlump
& lump
, CDentry
*dn
, CInode
*in
, __u8 state
) {
436 in
= dn
->get_projected_linkage()->get_inode();
438 // make note of where this inode was last journaled
439 in
->last_journaled
= event_seq
;
440 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
442 const inode_t
*pi
= in
->get_projected_inode();
443 if ((state
& fullbit::STATE_DIRTY
) && pi
->is_backtrace_updated())
444 state
|= fullbit::STATE_DIRTYPARENT
;
447 const sr_t
*sr
= in
->get_projected_srnode();
452 lump
.add_dfull(ceph::shared_ptr
<fullbit
>(new fullbit(dn
->get_name(),
454 dn
->get_projected_version(),
455 *pi
, in
->dirfragtree
,
456 *in
->get_projected_xattrs(),
458 in
->oldest_snap
, snapbl
,
459 state
, &in
->old_inodes
)));
462 // convenience: primary or remote? figure it out.
463 void add_dentry(CDentry
*dn
, bool dirty
) {
464 dirlump
& lump
= add_dir(dn
->get_dir(), false);
465 add_dentry(lump
, dn
, dirty
, false, false);
467 void add_import_dentry(CDentry
*dn
) {
468 bool dirty_parent
= false;
469 bool dirty_pool
= false;
470 if (dn
->get_linkage()->is_primary()) {
471 dirty_parent
= dn
->get_linkage()->get_inode()->is_dirty_parent();
472 dirty_pool
= dn
->get_linkage()->get_inode()->is_dirty_pool();
474 dirlump
& lump
= add_dir(dn
->get_dir(), false);
475 add_dentry(lump
, dn
, dn
->is_dirty(), dirty_parent
, dirty_pool
);
477 void add_dentry(dirlump
& lump
, CDentry
*dn
, bool dirty
, bool dirty_parent
, bool dirty_pool
) {
479 if (dn
->get_projected_linkage()->is_remote()) {
480 add_remote_dentry(dn
, dirty
);
482 } else if (dn
->get_projected_linkage()->is_null()) {
483 add_null_dentry(dn
, dirty
);
486 assert(dn
->get_projected_linkage()->is_primary());
487 add_primary_dentry(dn
, 0, dirty
, dirty_parent
, dirty_pool
);
490 void add_root(bool dirty
, CInode
*in
, const inode_t
*pi
=0, fragtree_t
*pdft
=0, bufferlist
*psnapbl
=0,
491 map
<string
,bufferptr
> *px
=0) {
492 in
->last_journaled
= event_seq
;
493 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
495 if (!pi
) pi
= in
->get_projected_inode();
496 if (!pdft
) pdft
= &in
->dirfragtree
;
497 if (!px
) px
= in
->get_projected_xattrs();
503 in
->encode_snap_blob(snapbl
);
505 for (list
<ceph::shared_ptr
<fullbit
> >::iterator p
= roots
.begin(); p
!= roots
.end(); ++p
) {
506 if ((*p
)->inode
.ino
== in
->ino()) {
513 roots
.push_back(ceph::shared_ptr
<fullbit
>(new fullbit(empty
, in
->first
, in
->last
, 0, *pi
,
514 *pdft
, *px
, in
->symlink
,
515 in
->oldest_snap
, snapbl
,
516 dirty
? fullbit::STATE_DIRTY
: 0,
520 dirlump
& add_dir(CDir
*dir
, bool dirty
, bool complete
=false) {
521 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(), dir
->get_projected_version(),
524 dirlump
& add_new_dir(CDir
*dir
) {
525 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(), dir
->get_projected_version(),
526 true, true, true); // dirty AND complete AND new
528 dirlump
& add_import_dir(CDir
*dir
) {
529 // dirty=false would be okay in some cases
530 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(), dir
->get_projected_version(),
531 dir
->is_dirty(), dir
->is_complete(), false, true, dir
->is_dirty_dft());
533 dirlump
& add_fragmented_dir(CDir
*dir
, bool dirty
, bool dirtydft
) {
534 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(), dir
->get_projected_version(),
535 dirty
, false, false, false, dirtydft
);
537 dirlump
& add_dir(dirfrag_t df
, const fnode_t
*pf
, version_t pv
, bool dirty
,
538 bool complete
=false, bool isnew
=false,
539 bool importing
=false, bool dirty_dft
=false) {
540 if (lump_map
.count(df
) == 0)
541 lump_order
.push_back(df
);
543 dirlump
& l
= lump_map
[df
];
545 l
.fnode
.version
= pv
;
546 if (complete
) l
.mark_complete();
547 if (dirty
) l
.mark_dirty();
548 if (isnew
) l
.mark_new();
549 if (importing
) l
.mark_importing();
550 if (dirty_dft
) l
.mark_dirty_dft();
554 static const int TO_AUTH_SUBTREE_ROOT
= 0; // default.
555 static const int TO_ROOT
= 1;
557 void add_dir_context(CDir
*dir
, int mode
= TO_AUTH_SUBTREE_ROOT
);
560 return roots
.empty() && lump_order
.empty() && table_tids
.empty() &&
561 truncate_start
.empty() && truncate_finish
.empty() &&
562 destroyed_inodes
.empty() && client_reqs
.empty() &&
563 opened_ino
== 0 && inotablev
== 0 && sessionmapv
== 0;
566 void print(ostream
& out
) const {
568 if (!lump_order
.empty())
569 out
<< " " << lump_order
.front() << ", " << lump_map
.size() << " dirs";
570 if (!table_tids
.empty())
571 out
<< " table_tids=" << table_tids
;
572 if (allocated_ino
|| preallocated_inos
.size()) {
574 out
<< " alloc_ino=" << allocated_ino
;
575 if (preallocated_inos
.size())
576 out
<< " prealloc_ino=" << preallocated_inos
;
577 if (used_preallocated_ino
)
578 out
<< " used_prealloc_ino=" << used_preallocated_ino
;
579 out
<< " v" << inotablev
;
584 void update_segment(LogSegment
*ls
);
585 void replay(MDSRank
*mds
, LogSegment
*ls
, MDSlaveUpdate
*su
=NULL
);
587 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob
)
588 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::fullbit
)
589 WRITE_CLASS_ENCODER(EMetaBlob::remotebit
)
590 WRITE_CLASS_ENCODER(EMetaBlob::nullbit
)
591 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::dirlump
)
593 inline ostream
& operator<<(ostream
& out
, const EMetaBlob
& t
) {