1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef CEPH_MDS_EMETABLOB_H
16 #define CEPH_MDS_EMETABLOB_H
18 #include <string_view>
20 #include "../CInode.h"
22 #include "../CDentry.h"
23 #include "../LogSegment.h"
25 #include "include/interval_set.h"
26 #include "common/strescape.h"
34 * a bunch of metadata in the journal
39 * - make sure you adjust the inode.version for any modified inode you
40 * journal. CDir and CDentry maintain a projected_version, but CInode
41 * doesn't, since the journaled inode usually has to be modified
42 * manually anyway (to delay the change in the MDS's cache until after
51 /* fullbit - a regular dentry + inode
53 * We encode this one a bit weirdly, just because (also, it's marginally faster
54 * on multiple encodes, which I think can happen):
55 * Encode a bufferlist on struct creation with all data members, without a struct_v.
56 * When encode is called, encode struct_v and then append the bufferlist.
57 * Decode straight into the appropriate variables.
59 * So, if you add members, encode them in the constructor and then change
60 * the struct_v in the encode function!
// Bit flags kept in fullbit::state; each flag is tested by the matching
// predicate further down (is_dirty(), is_dirty_parent(), ...).
// Declared constexpr (implicitly inline since C++17) so that odr-using a
// flag, e.g. binding it to a const reference, never requires a separate
// out-of-class definition.
static constexpr int STATE_DIRTY = (1<<0);
static constexpr int STATE_DIRTYPARENT = (1<<1);
static constexpr int STATE_DIRTYPOOL = (1<<2);
static constexpr int STATE_NEED_SNAPFLUSH = (1<<3);
static constexpr int STATE_EPHEMERAL_RANDOM = (1<<4);
68 std::string dn
; // dentry
69 std::string alternate_name
;
70 snapid_t dnfirst
, dnlast
;
72 CInode::inode_const_ptr inode
; // if it's not XXX should not be part of mempool; wait for std::pmr to simplify
73 CInode::xattr_map_const_ptr xattrs
;
74 fragtree_t dirfragtree
;
79 CInode::old_inode_map_const_ptr old_inodes
; // XXX should not be part of mempool; wait for std::pmr to simplify
81 fullbit(std::string_view d
, std::string_view an
, snapid_t df
, snapid_t dl
,
82 version_t v
, const CInode::inode_const_ptr
& i
, const fragtree_t
&dft
,
83 const CInode::xattr_map_const_ptr
& xa
, std::string_view sym
,
84 snapid_t os
, const bufferlist
&sbl
, __u8 st
,
85 const CInode::old_inode_map_const_ptr
& oi
) :
86 dn(d
), alternate_name(an
), dnfirst(df
), dnlast(dl
), dnv(v
), inode(i
), xattrs(xa
),
87 oldest_snap(os
), state(st
), old_inodes(oi
)
95 explicit fullbit(bufferlist::const_iterator
&p
) {
99 fullbit(const fullbit
&) = delete;
101 fullbit
& operator=(const fullbit
&) = delete;
103 void encode(bufferlist
& bl
, uint64_t features
) const;
104 void decode(bufferlist::const_iterator
&bl
);
105 void dump(Formatter
*f
) const;
106 static void generate_test_instances(std::list
<EMetaBlob::fullbit
*>& ls
);
108 void update_inode(MDSRank
*mds
, CInode
*in
);
109 bool is_dirty() const { return (state
& STATE_DIRTY
); }
110 bool is_dirty_parent() const { return (state
& STATE_DIRTYPARENT
); }
111 bool is_dirty_pool() const { return (state
& STATE_DIRTYPOOL
); }
112 bool need_snapflush() const { return (state
& STATE_NEED_SNAPFLUSH
); }
113 bool is_export_ephemeral_random() const { return (state
& STATE_EPHEMERAL_RANDOM
); }
115 void print(ostream
& out
) const {
116 out
<< " fullbit dn " << dn
<< " [" << dnfirst
<< "," << dnlast
<< "] dnv " << dnv
117 << " inode " << inode
->ino
118 << " state=" << state
;
119 if (!alternate_name
.empty()) {
120 out
<< " altn " << binstrprint(alternate_name
, 8);
124 string
state_string() const {
126 bool marked_already
= false;
128 state_string
.append("dirty");
129 marked_already
= true;
131 if (is_dirty_parent()) {
132 state_string
.append(marked_already
? "+dirty_parent" : "dirty_parent");
134 state_string
.append("+dirty_pool");
139 WRITE_CLASS_ENCODER_FEATURES(fullbit
)
141 /* remotebit - a dentry + remote inode link (i.e. just an ino)
145 std::string alternate_name
;
146 snapid_t dnfirst
= 0, dnlast
= 0;
149 unsigned char d_type
= '\0';
152 remotebit(std::string_view d
, std::string_view an
, snapid_t df
, snapid_t dl
, version_t v
, inodeno_t i
, unsigned char dt
, bool dr
) :
153 dn(d
), alternate_name(an
), dnfirst(df
), dnlast(dl
), dnv(v
), ino(i
), d_type(dt
), dirty(dr
) { }
154 explicit remotebit(bufferlist::const_iterator
&p
) { decode(p
); }
155 remotebit() = default;
157 void encode(bufferlist
& bl
) const;
158 void decode(bufferlist::const_iterator
&bl
);
159 void print(ostream
& out
) const {
160 out
<< " remotebit dn " << dn
<< " [" << dnfirst
<< "," << dnlast
<< "] dnv " << dnv
162 << " dirty=" << dirty
;
163 if (!alternate_name
.empty()) {
164 out
<< " altn " << binstrprint(alternate_name
, 8);
168 void dump(Formatter
*f
) const;
169 static void generate_test_instances(std::list
<remotebit
*>& ls
);
171 WRITE_CLASS_ENCODER(remotebit
)
174 * nullbit - a null dentry
178 snapid_t dnfirst
, dnlast
;
182 nullbit(std::string_view d
, snapid_t df
, snapid_t dl
, version_t v
, bool dr
) :
183 dn(d
), dnfirst(df
), dnlast(dl
), dnv(v
), dirty(dr
) { }
184 explicit nullbit(bufferlist::const_iterator
&p
) { decode(p
); }
185 nullbit(): dnfirst(0), dnlast(0), dnv(0), dirty(false) {}
187 void encode(bufferlist
& bl
) const;
188 void decode(bufferlist::const_iterator
&bl
);
189 void dump(Formatter
*f
) const;
190 static void generate_test_instances(std::list
<nullbit
*>& ls
);
191 void print(ostream
& out
) const {
192 out
<< " nullbit dn " << dn
<< " [" << dnfirst
<< "," << dnlast
<< "] dnv " << dnv
193 << " dirty=" << dirty
<< std::endl
;
196 WRITE_CLASS_ENCODER(nullbit
)
199 /* dirlump - contains metadata for any dir we have contents for.
// Bit flags kept in dirlump::state; set through the mark_*() helpers and
// queried through the is_*() predicates below.
// Declared constexpr (implicitly inline since C++17) so that odr-using a
// flag never requires a separate out-of-class definition.
static constexpr int STATE_COMPLETE = (1<<1);
static constexpr int STATE_DIRTY = (1<<2);  // dirty due to THIS journal item, that is!
static constexpr int STATE_NEW = (1<<3);  // new directory
static constexpr int STATE_IMPORTING = (1<<4);  // importing directory
static constexpr int STATE_DIRTYDFT = (1<<5);  // dirty dirfragtree
210 CDir::fnode_const_ptr fnode
;
212 __u32 nfull
, nremote
, nnull
;
215 mutable bufferlist dnbl
;
216 mutable bool dn_decoded
;
217 mutable list
<fullbit
> dfull
;
218 mutable vector
<remotebit
> dremote
;
219 mutable vector
<nullbit
> dnull
;
222 dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
223 dirlump(const dirlump
&) = delete;
224 dirlump
& operator=(const dirlump
&) = delete;
226 bool is_complete() const { return state
& STATE_COMPLETE
; }
227 void mark_complete() { state
|= STATE_COMPLETE
; }
228 bool is_dirty() const { return state
& STATE_DIRTY
; }
229 void mark_dirty() { state
|= STATE_DIRTY
; }
230 bool is_new() const { return state
& STATE_NEW
; }
231 void mark_new() { state
|= STATE_NEW
; }
232 bool is_importing() { return state
& STATE_IMPORTING
; }
233 void mark_importing() { state
|= STATE_IMPORTING
; }
234 bool is_dirty_dft() { return state
& STATE_DIRTYDFT
; }
235 void mark_dirty_dft() { state
|= STATE_DIRTYDFT
; }
237 const list
<fullbit
> &get_dfull() const { return dfull
; }
238 list
<fullbit
> &_get_dfull() { return dfull
; }
239 const vector
<remotebit
> &get_dremote() const { return dremote
; }
240 const vector
<nullbit
> &get_dnull() const { return dnull
; }
242 template< class... Args
>
243 void add_dfull(Args
&&... args
) {
244 dfull
.emplace_back(std::forward
<Args
>(args
)...);
246 template< class... Args
>
247 void add_dremote(Args
&&... args
) {
248 dremote
.emplace_back(std::forward
<Args
>(args
)...);
250 template< class... Args
>
251 void add_dnull(Args
&&... args
) {
252 dnull
.emplace_back(std::forward
<Args
>(args
)...);
255 void print(dirfrag_t dirfrag
, ostream
& out
) const {
256 out
<< "dirlump " << dirfrag
<< " v " << fnode
->version
257 << " state " << state
258 << " num " << nfull
<< "/" << nremote
<< "/" << nnull
261 for (const auto& p
: dfull
)
263 for (const auto& p
: dremote
)
265 for (const auto& p
: dnull
)
269 string
state_string() const {
271 bool marked_already
= false;
273 state_string
.append("complete");
274 marked_already
= true;
277 state_string
.append(marked_already
? "+dirty" : "dirty");
278 marked_already
= true;
281 state_string
.append(marked_already
? "+new" : "new");
286 // if this changes, update the versioning in encode for it!
287 void _encode_bits(uint64_t features
) const {
289 if (!dn_decoded
) return;
290 encode(dfull
, dnbl
, features
);
291 encode(dremote
, dnbl
);
294 void _decode_bits() const {
296 if (dn_decoded
) return;
297 auto p
= dnbl
.cbegin();
304 void encode(bufferlist
& bl
, uint64_t features
) const;
305 void decode(bufferlist::const_iterator
&bl
);
306 void dump(Formatter
*f
) const;
307 static void generate_test_instances(std::list
<dirlump
*>& ls
);
309 WRITE_CLASS_ENCODER_FEATURES(dirlump
)
311 // my lumps. preserve the order we added them in a list.
312 vector
<dirfrag_t
> lump_order
;
313 map
<dirfrag_t
, dirlump
> lump_map
;
316 vector
<pair
<__u8
,version_t
> > table_tids
; // tableclient transactions
318 inodeno_t opened_ino
;
320 inodeno_t renamed_dirino
;
321 vector
<frag_t
> renamed_dir_frags
;
324 // ino (pre)allocation. may involve both inotable AND session state.
325 version_t inotablev
, sessionmapv
;
326 inodeno_t allocated_ino
; // inotable
327 interval_set
<inodeno_t
> preallocated_inos
; // inotable + session
328 inodeno_t used_preallocated_ino
; // session
329 entity_name_t client_name
; // session
331 // inodes i've truncated
332 vector
<inodeno_t
> truncate_start
; // start truncate
333 map
<inodeno_t
, LogSegment::seq_t
> truncate_finish
; // finished truncate (started in segment blah)
336 vector
<inodeno_t
> destroyed_inodes
;
340 vector
<pair
<metareqid_t
,uint64_t> > client_reqs
;
341 vector
<pair
<metareqid_t
,uint64_t> > client_flushes
;
344 void encode(bufferlist
& bl
, uint64_t features
) const;
345 void decode(bufferlist::const_iterator
& bl
);
346 void get_inodes(std::set
<inodeno_t
> &inodes
) const;
347 void get_paths(std::vector
<std::string
> &paths
) const;
348 void get_dentries(std::map
<dirfrag_t
, std::set
<std::string
> > &dentries
) const;
349 entity_name_t
get_client_name() const {return client_name
;}
351 void dump(Formatter
*f
) const;
352 static void generate_test_instances(std::list
<EMetaBlob
*>& ls
);
354 uint64_t last_subtree_map
;
357 // for replay, in certain cases
358 //LogSegment *_segment;
360 EMetaBlob() : opened_ino(0), renamed_dirino(0),
361 inotablev(0), sessionmapv(0), allocated_ino(0),
362 last_subtree_map(0), event_seq(0)
364 EMetaBlob(const EMetaBlob
&) = delete;
366 EMetaBlob
& operator=(const EMetaBlob
&) = delete;
368 void print(ostream
& out
) {
369 for (const auto &p
: lump_order
)
370 lump_map
[p
].print(p
, out
);
373 void add_client_req(metareqid_t r
, uint64_t tid
=0) {
374 client_reqs
.push_back(pair
<metareqid_t
,uint64_t>(r
, tid
));
376 void add_client_flush(metareqid_t r
, uint64_t tid
=0) {
377 client_flushes
.push_back(pair
<metareqid_t
,uint64_t>(r
, tid
));
380 void add_table_transaction(int table
, version_t tid
) {
381 table_tids
.push_back(pair
<__u8
, version_t
>(table
, tid
));
384 void add_opened_ino(inodeno_t ino
) {
385 ceph_assert(!opened_ino
);
389 void set_ino_alloc(inodeno_t alloc
,
390 inodeno_t used_prealloc
,
391 interval_set
<inodeno_t
>& prealloc
,
392 entity_name_t client
,
393 version_t sv
, version_t iv
) {
394 allocated_ino
= alloc
;
395 used_preallocated_ino
= used_prealloc
;
396 preallocated_inos
= prealloc
;
397 client_name
= client
;
402 void add_truncate_start(inodeno_t ino
) {
403 truncate_start
.push_back(ino
);
405 void add_truncate_finish(inodeno_t ino
, uint64_t segoff
) {
406 truncate_finish
[ino
] = segoff
;
409 bool rewrite_truncate_finish(MDSRank
const *mds
, std::map
<uint64_t, uint64_t> const &old_to_new
);
411 void add_destroyed_inode(inodeno_t ino
) {
412 destroyed_inodes
.push_back(ino
);
415 void add_null_dentry(CDentry
*dn
, bool dirty
) {
416 add_null_dentry(add_dir(dn
->get_dir(), false), dn
, dirty
);
418 void add_null_dentry(dirlump
& lump
, CDentry
*dn
, bool dirty
) {
421 lump
.add_dnull(dn
->get_name(), dn
->first
, dn
->last
,
422 dn
->get_projected_version(), dirty
);
425 void add_remote_dentry(CDentry
*dn
, bool dirty
) {
426 add_remote_dentry(add_dir(dn
->get_dir(), false), dn
, dirty
, 0, 0);
428 void add_remote_dentry(CDentry
*dn
, bool dirty
, inodeno_t rino
, int rdt
) {
429 add_remote_dentry(add_dir(dn
->get_dir(), false), dn
, dirty
, rino
, rdt
);
431 void add_remote_dentry(dirlump
& lump
, CDentry
*dn
, bool dirty
,
432 inodeno_t rino
=0, unsigned char rdt
=0) {
434 rino
= dn
->get_projected_linkage()->get_remote_ino();
435 rdt
= dn
->get_projected_linkage()->get_remote_d_type();
438 lump
.add_dremote(dn
->get_name(), dn
->get_alternate_name(), dn
->first
, dn
->last
,
439 dn
->get_projected_version(), rino
, rdt
, dirty
);
// journal a primary dentry together with its inode (returns nothing)
443 void add_primary_dentry(CDentry
*dn
, CInode
*in
, bool dirty
,
444 bool dirty_parent
=false, bool dirty_pool
=false,
445 bool need_snapflush
=false) {
447 if (dirty
) state
|= fullbit::STATE_DIRTY
;
448 if (dirty_parent
) state
|= fullbit::STATE_DIRTYPARENT
;
449 if (dirty_pool
) state
|= fullbit::STATE_DIRTYPOOL
;
450 if (need_snapflush
) state
|= fullbit::STATE_NEED_SNAPFLUSH
;
451 add_primary_dentry(add_dir(dn
->get_dir(), false), dn
, in
, state
);
453 void add_primary_dentry(dirlump
& lump
, CDentry
*dn
, CInode
*in
, __u8 state
) {
455 in
= dn
->get_projected_linkage()->get_inode();
457 if (in
->is_ephemeral_rand()) {
458 state
|= fullbit::STATE_EPHEMERAL_RANDOM
;
461 const auto& pi
= in
->get_projected_inode();
462 ceph_assert(pi
->version
> 0);
464 if ((state
& fullbit::STATE_DIRTY
) && pi
->is_backtrace_updated())
465 state
|= fullbit::STATE_DIRTYPARENT
;
468 const sr_t
*sr
= in
->get_projected_srnode();
473 lump
.add_dfull(dn
->get_name(), dn
->get_alternate_name(), dn
->first
, dn
->last
, dn
->get_projected_version(),
474 pi
, in
->dirfragtree
, in
->get_projected_xattrs(), in
->symlink
,
475 in
->oldest_snap
, snapbl
, state
, in
->get_old_inodes());
477 // make note of where this inode was last journaled
478 in
->last_journaled
= event_seq
;
479 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
482 // convenience: primary or remote? figure it out.
483 void add_dentry(CDentry
*dn
, bool dirty
) {
484 dirlump
& lump
= add_dir(dn
->get_dir(), false);
485 add_dentry(lump
, dn
, dirty
, false, false);
487 void add_import_dentry(CDentry
*dn
) {
488 bool dirty_parent
= false;
489 bool dirty_pool
= false;
490 if (dn
->get_linkage()->is_primary()) {
491 dirty_parent
= dn
->get_linkage()->get_inode()->is_dirty_parent();
492 dirty_pool
= dn
->get_linkage()->get_inode()->is_dirty_pool();
494 dirlump
& lump
= add_dir(dn
->get_dir(), false);
495 add_dentry(lump
, dn
, dn
->is_dirty(), dirty_parent
, dirty_pool
);
497 void add_dentry(dirlump
& lump
, CDentry
*dn
, bool dirty
, bool dirty_parent
, bool dirty_pool
) {
499 if (dn
->get_projected_linkage()->is_remote()) {
500 add_remote_dentry(dn
, dirty
);
502 } else if (dn
->get_projected_linkage()->is_null()) {
503 add_null_dentry(dn
, dirty
);
506 ceph_assert(dn
->get_projected_linkage()->is_primary());
507 add_primary_dentry(dn
, 0, dirty
, dirty_parent
, dirty_pool
);
510 void add_root(bool dirty
, CInode
*in
) {
511 in
->last_journaled
= event_seq
;
512 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
514 const auto& pi
= in
->get_projected_inode();
515 const auto& px
= in
->get_projected_xattrs();
516 const auto& pdft
= in
->dirfragtree
;
519 const sr_t
*sr
= in
->get_projected_srnode();
523 for (auto p
= roots
.begin(); p
!= roots
.end(); ++p
) {
524 if (p
->inode
->ino
== in
->ino()) {
531 roots
.emplace_back(empty
, "", in
->first
, in
->last
, 0, pi
, pdft
, px
, in
->symlink
,
532 in
->oldest_snap
, snapbl
, (dirty
? fullbit::STATE_DIRTY
: 0),
533 in
->get_old_inodes());
536 dirlump
& add_dir(CDir
*dir
, bool dirty
, bool complete
=false) {
537 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(),
540 dirlump
& add_new_dir(CDir
*dir
) {
541 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(),
542 true, true, true); // dirty AND complete AND new
544 dirlump
& add_import_dir(CDir
*dir
) {
545 // dirty=false would be okay in some cases
546 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(),
547 dir
->is_dirty(), dir
->is_complete(), false, true, dir
->is_dirty_dft());
549 dirlump
& add_fragmented_dir(CDir
*dir
, bool dirty
, bool dirtydft
) {
550 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(),
551 dirty
, false, false, false, dirtydft
);
553 dirlump
& add_dir(dirfrag_t df
, const CDir::fnode_const_ptr
& pf
, bool dirty
,
554 bool complete
=false, bool isnew
=false,
555 bool importing
=false, bool dirty_dft
=false) {
556 if (lump_map
.count(df
) == 0)
557 lump_order
.push_back(df
);
559 dirlump
& l
= lump_map
[df
];
561 if (complete
) l
.mark_complete();
562 if (dirty
) l
.mark_dirty();
563 if (isnew
) l
.mark_new();
564 if (importing
) l
.mark_importing();
565 if (dirty_dft
) l
.mark_dirty_dft();
// Mode values accepted by add_dir_context().
// Declared constexpr (implicitly inline since C++17) so that odr-using a
// value never requires a separate out-of-class definition.
static constexpr int TO_AUTH_SUBTREE_ROOT = 0;  // default.
static constexpr int TO_ROOT = 1;
572 void add_dir_context(CDir
*dir
, int mode
= TO_AUTH_SUBTREE_ROOT
);
575 return roots
.empty() && lump_order
.empty() && table_tids
.empty() &&
576 truncate_start
.empty() && truncate_finish
.empty() &&
577 destroyed_inodes
.empty() && client_reqs
.empty() &&
578 opened_ino
== 0 && inotablev
== 0 && sessionmapv
== 0;
581 void print(ostream
& out
) const {
583 if (!lump_order
.empty())
584 out
<< " " << lump_order
.front() << ", " << lump_map
.size() << " dirs";
585 if (!table_tids
.empty())
586 out
<< " table_tids=" << table_tids
;
587 if (allocated_ino
|| preallocated_inos
.size()) {
589 out
<< " alloc_ino=" << allocated_ino
;
590 if (preallocated_inos
.size())
591 out
<< " prealloc_ino=" << preallocated_inos
;
592 if (used_preallocated_ino
)
593 out
<< " used_prealloc_ino=" << used_preallocated_ino
;
594 out
<< " v" << inotablev
;
599 void update_segment(LogSegment
*ls
);
600 void replay(MDSRank
*mds
, LogSegment
*ls
, MDPeerUpdate
*su
=NULL
);
602 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob
)
603 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::fullbit
)
604 WRITE_CLASS_ENCODER(EMetaBlob::remotebit
)
605 WRITE_CLASS_ENCODER(EMetaBlob::nullbit
)
606 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::dirlump
)
608 inline ostream
& operator<<(ostream
& out
, const EMetaBlob
& t
) {