]>
git.proxmox.com Git - ceph.git/blob - ceph/src/include/frag.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
24 #include "compact_map.h"
26 #include "ceph_frag.h"
27 #include "include/assert.h"
31 * the goal here is to use a binary split strategy to partition a namespace.
32 * frag_t represents a particular fragment. bits() tells you the size of the
33 * fragment, and value() it's name. this is roughly analogous to an ip address
36 * fragtree_t represents an entire namespace and it's partition. it essentially
37 * tells you where fragments are split into other fragments, and by how much
38 * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
40 * this vaguely resembles a btree, in that when a fragment becomes large or small
41 * we can split or merge, except that there is no guarantee of being balanced.
43 * presumably we are partitioning the output of a (perhaps specialized) hash
50 * description of an individual fragment. that is, a particular piece
51 * of the overall namespace.
53 * this is conceptually analogous to an ip address and netmask.
55 * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
57 * we write it as v/b, where v is a value and b is the number of bits.
58 * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that,
59 * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on.
61 * this makes the right most bit of v the "most significant", which is the
62 * opposite of what we usually see.
67 * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial)
68 * iteration efficient (see, e.g., try_assimilate_children()
69 * - rework frag_t so that we mask the left-most (most significant) bits instead of
70 * the right-most (least significant) bits. just because it's more intutive, and
71 * matches the network/netmask concept.
74 typedef uint32_t _frag_t
;
78 * encoding is dictated by frag_* functions in ceph_fs.h. use those
79 * helpers _exclusively_.
84 frag_t() : _enc(0) { }
85 frag_t(unsigned v
, unsigned b
) : _enc(ceph_frag_make(b
, v
)) { }
86 frag_t(_frag_t e
) : _enc(e
) { }
89 void from_unsigned(unsigned e
) { _enc
= e
; }
92 unsigned value() const { return ceph_frag_value(_enc
); }
93 unsigned bits() const { return ceph_frag_bits(_enc
); }
94 unsigned mask() const { return ceph_frag_mask(_enc
); }
95 unsigned mask_shift() const { return ceph_frag_mask_shift(_enc
); }
97 operator _frag_t() const { return _enc
; }
100 bool contains(unsigned v
) const { return ceph_frag_contains_value(_enc
, v
); }
101 bool contains(frag_t sub
) const { return ceph_frag_contains_frag(_enc
, sub
._enc
); }
102 bool is_root() const { return bits() == 0; }
103 frag_t
parent() const {
105 return frag_t(ceph_frag_parent(_enc
));
109 frag_t
make_child(int i
, int nb
) const {
111 return frag_t(ceph_frag_make_child(_enc
, nb
, i
));
113 void split(int nb
, std::list
<frag_t
>& fragments
) const {
115 unsigned nway
= 1 << nb
;
116 for (unsigned i
=0; i
<nway
; i
++)
117 fragments
.push_back(make_child(i
, nb
));
121 frag_t
left_child() const { return frag_t(ceph_frag_left_child(_enc
)); }
122 frag_t
right_child() const { return frag_t(ceph_frag_right_child(_enc
)); }
124 bool is_left() const { return ceph_frag_is_left_child(_enc
); }
125 bool is_right() const { return ceph_frag_is_right_child(_enc
); }
126 frag_t
get_sibling() const {
128 return frag_t(ceph_frag_sibling(_enc
));
132 bool is_leftmost() const { return ceph_frag_is_leftmost(_enc
); }
133 bool is_rightmost() const { return ceph_frag_is_rightmost(_enc
); }
134 frag_t
next() const {
135 assert(!is_rightmost());
136 return frag_t(ceph_frag_next(_enc
));
140 bool parse(const char *s
) {
142 int r
= sscanf(s
, "%x/%d", &pvalue
, &pbits
);
144 *this = frag_t(pvalue
, pbits
);
151 inline std::ostream
& operator<<(std::ostream
& out
, const frag_t
& hb
)
153 //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '=';
154 unsigned num
= hb
.bits();
156 unsigned val
= hb
.value();
157 for (unsigned bit
= 23; num
; num
--, bit
--)
158 out
<< ((val
& (1<<bit
)) ? '1':'0');
163 inline void encode(const frag_t
&f
, bufferlist
& bl
) { encode_raw(f
._enc
, bl
); }
164 inline void decode(frag_t
&f
, bufferlist::iterator
& p
) {
173 * fragtree_t -- partition an entire namespace into one or more frag_t's.
177 // frag_t f is split by b bits.
178 // if child frag_t does not appear, it is not split.
180 compact_map
<frag_t
,int32_t> _splits
;
185 void swap(fragtree_t
& other
) {
186 _splits
.swap(other
._splits
);
195 return _splits
.empty();
197 int get_split(const frag_t hb
) const {
198 compact_map
<frag_t
,int32_t>::const_iterator p
= _splits
.find(hb
);
199 if (p
== _splits
.end())
206 bool is_leaf(frag_t x
) const {
207 std::list
<frag_t
> ls
;
208 get_leaves_under(x
, ls
);
209 //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
218 * get_leaves -- list all leaves
220 void get_leaves(std::list
<frag_t
>& ls
) const {
221 return get_leaves_under_split(frag_t(), ls
);
225 * get_leaves_under_split -- list all leaves under a known split point (or root)
227 void get_leaves_under_split(frag_t under
, std::list
<frag_t
>& ls
) const {
233 int nb
= get_split(t
);
235 t
.split(nb
, q
); // queue up children
237 ls
.push_front(t
); // not spit, it's a leaf.
242 * get_branch -- get branch point at OR above frag @a x
243 * - may be @a x itself, if @a x is a split
244 * - may be root (frag_t())
246 frag_t
get_branch(frag_t x
) const {
248 if (x
== frag_t()) return x
; // root
249 if (get_split(x
)) return x
; // found it!
255 * get_branch_above -- get a branch point above frag @a x
256 * - may be root (frag_t())
257 * - may NOT be @a x, even if @a x is a split.
259 frag_t
get_branch_above(frag_t x
) const {
261 if (x
== frag_t()) return x
; // root
263 if (get_split(x
)) return x
; // found it!
269 * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
270 * - may be @a x itself, if @a x is a split or leaf
271 * - may be root (frag_t())
273 frag_t
get_branch_or_leaf(frag_t x
) const {
274 frag_t branch
= get_branch(x
);
275 int nb
= get_split(branch
);
276 if (nb
> 0 && // if branch is a split, and
277 branch
.bits() + nb
<= x
.bits()) // one of the children is or contains x
278 return frag_t(x
.value(), branch
.bits()+nb
); // then return that child (it's a leaf)
284 * get_leaves_under(x, ls) -- search for any leaves fully contained by x
286 void get_leaves_under(frag_t x
, std::list
<frag_t
>& ls
) const {
288 q
.push_back(get_branch_or_leaf(x
));
290 frag_t t
= q
.front();
292 if (t
.bits() >= x
.bits() && // if t is more specific than x, and
293 !x
.contains(t
)) // x does not contain t,
294 continue; // then skip
295 int nb
= get_split(t
);
297 t
.split(nb
, q
); // queue up children
298 else if (x
.contains(t
))
299 ls
.push_back(t
); // not spit, it's a leaf.
304 * contains(fg) -- does fragtree contain the specific frag @a x
306 bool contains(frag_t x
) const {
308 q
.push_back(get_branch(x
));
310 frag_t t
= q
.front();
312 if (t
.bits() >= x
.bits() && // if t is more specific than x, and
313 !x
.contains(t
)) // x does not contain t,
314 continue; // then skip
315 int nb
= get_split(t
);
317 if (t
== x
) return false; // it's split.
318 t
.split(nb
, q
); // queue up children
320 if (t
== x
) return true; // it's there.
327 * operator[] -- map a (hash?) value to a frag
329 frag_t
operator[](unsigned v
) const {
332 assert(t
.contains(v
));
333 int nb
= get_split(t
);
336 if (nb
== 0) return t
; // done.
338 // pick appropriate child fragment.
339 unsigned nway
= 1 << nb
;
341 for (i
=0; i
<nway
; i
++) {
342 frag_t n
= t
.make_child(i
, nb
);
355 void split(frag_t x
, int b
, bool simplify
=true) {
360 try_assimilate_children(get_branch_above(x
));
362 void merge(frag_t x
, int b
, bool simplify
=true) {
364 assert(_splits
[x
] == b
);
368 try_assimilate_children(get_branch_above(x
));
372 * if all of a given split's children are identically split,
373 * then the children can be assimilated.
375 void try_assimilate_children(frag_t x
) {
376 int nb
= get_split(x
);
378 std::list
<frag_t
> children
;
379 x
.split(nb
, children
);
381 for (std::list
<frag_t
>::iterator p
= children
.begin();
384 int cb
= get_split(*p
);
385 if (!cb
) return; // nope.
386 if (childbits
&& cb
!= childbits
) return; // not the same
389 // all children are split with childbits!
390 for (std::list
<frag_t
>::iterator p
= children
.begin();
394 _splits
[x
] += childbits
;
397 bool force_to_leaf(CephContext
*cct
, frag_t x
) {
401 lgeneric_dout(cct
, 10) << "force_to_leaf " << x
<< " on " << _splits
<< dendl
;
403 frag_t parent
= get_branch_or_leaf(x
);
404 assert(parent
.bits() <= x
.bits());
405 lgeneric_dout(cct
, 10) << "parent is " << parent
<< dendl
;
407 // do we need to split from parent to x?
408 if (parent
.bits() < x
.bits()) {
409 int spread
= x
.bits() - parent
.bits();
410 int nb
= get_split(parent
);
411 lgeneric_dout(cct
, 10) << "spread " << spread
<< ", parent splits by " << nb
<< dendl
;
413 // easy: split parent (a leaf) by the difference
414 lgeneric_dout(cct
, 10) << "splitting parent " << parent
<< " by spread " << spread
<< dendl
;
415 split(parent
, spread
);
421 // add an intermediary split
422 merge(parent
, nb
, false);
423 split(parent
, spread
, false);
425 std::list
<frag_t
> subs
;
426 parent
.split(spread
, subs
);
427 for (std::list
<frag_t
>::iterator p
= subs
.begin();
430 lgeneric_dout(cct
, 10) << "splitting intermediate " << *p
<< " by " << (nb
-spread
) << dendl
;
431 split(*p
, nb
- spread
, false);
435 // x is now a leaf or split.
436 // hoover up any children.
440 frag_t t
= q
.front();
442 int nb
= get_split(t
);
444 lgeneric_dout(cct
, 10) << "merging child " << t
<< " by " << nb
<< dendl
;
445 merge(t
, nb
, false); // merge this point, and
446 t
.split(nb
, q
); // queue up children
450 lgeneric_dout(cct
, 10) << "force_to_leaf done" << dendl
;
456 void encode(bufferlist
& bl
) const {
457 ::encode(_splits
, bl
);
459 void decode(bufferlist::iterator
& p
) {
460 ::decode(_splits
, p
);
462 void encode_nohead(bufferlist
& bl
) const {
463 for (compact_map
<frag_t
,int32_t>::const_iterator p
= _splits
.begin();
466 ::encode(p
->first
, bl
);
467 ::encode(p
->second
, bl
);
470 void decode_nohead(int n
, bufferlist::iterator
& p
) {
475 ::decode(_splits
[f
], p
);
479 void print(std::ostream
& out
) {
480 out
<< "fragtree_t(";
482 q
.push_back(frag_t());
484 frag_t t
= q
.front();
489 for (unsigned i
=0; i
<t
.bits(); i
++) out
<< ' ';
491 int nb
= get_split(t
);
493 out
<< t
<< " %" << nb
;
494 t
.split(nb
, q
); // queue up children
502 void dump(Formatter
*f
) const {
503 f
->open_array_section("splits");
504 for (compact_map
<frag_t
,int32_t>::const_iterator p
= _splits
.begin();
507 f
->open_object_section("split");
508 std::ostringstream frag_str
;
509 frag_str
<< p
->first
;
510 f
->dump_string("frag", frag_str
.str());
511 f
->dump_int("children", p
->second
);
512 f
->close_section(); // split
514 f
->close_section(); // splits
517 WRITE_CLASS_ENCODER(fragtree_t
)
519 inline bool operator==(const fragtree_t
& l
, const fragtree_t
& r
) {
520 return l
._splits
== r
._splits
;
522 inline bool operator!=(const fragtree_t
& l
, const fragtree_t
& r
) {
523 return l
._splits
!= r
._splits
;
526 inline std::ostream
& operator<<(std::ostream
& out
, const fragtree_t
& ft
)
528 out
<< "fragtree_t(";
530 for (compact_map
<frag_t
,int32_t>::const_iterator p
= ft
._splits
.begin();
531 p
!= ft
._splits
.end();
533 if (p
!= ft
._splits
.begin())
535 out
<< p
->first
<< "^" << p
->second
;
542 * fragset_t -- a set of fragments
545 std::set
<frag_t
> _set
;
548 const std::set
<frag_t
> &get() const { return _set
; }
549 std::set
<frag_t
>::iterator
begin() { return _set
.begin(); }
550 std::set
<frag_t
>::iterator
end() { return _set
.end(); }
552 bool empty() const { return _set
.empty(); }
554 bool contains(frag_t f
) const {
556 if (_set
.count(f
)) return true;
557 if (f
.bits() == 0) return false;
562 void insert(frag_t f
) {
570 std::set
<frag_t
>::iterator p
= _set
.begin();
571 while (p
!= _set
.end()) {
573 _set
.count(p
->get_sibling())) {
574 _set
.erase(p
->get_sibling());
575 _set
.insert(p
->parent());
588 inline std::ostream
& operator<<(std::ostream
& out
, const fragset_t
& fs
)
590 return out
<< "fragset_t(" << fs
.get() << ")";