]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef CEPH_FRAG_H | |
16 | #define CEPH_FRAG_H | |
17 | ||
18 | #include <stdint.h> | |
19 | #include <list> | |
20 | #include <iostream> | |
21 | #include <stdio.h> | |
22 | ||
23 | #include "buffer.h" | |
24 | #include "compact_map.h" | |
25 | ||
26 | #include "ceph_frag.h" | |
27 | #include "include/assert.h" | |
28 | ||
29 | /* | |
30 | * | |
31 | * the goal here is to use a binary split strategy to partition a namespace. | |
32 | * frag_t represents a particular fragment. bits() tells you the size of the | |
33 | * fragment, and value() it's name. this is roughly analogous to an ip address | |
34 | * and netmask. | |
35 | * | |
36 | * fragtree_t represents an entire namespace and it's partition. it essentially | |
37 | * tells you where fragments are split into other fragments, and by how much | |
38 | * (i.e. by how many bits, resulting in a power of 2 number of child fragments). | |
39 | * | |
40 | * this vaguely resembles a btree, in that when a fragment becomes large or small | |
41 | * we can split or merge, except that there is no guarantee of being balanced. | |
42 | * | |
43 | * presumably we are partitioning the output of a (perhaps specialized) hash | |
44 | * function. | |
45 | */ | |
46 | ||
47 | /** | |
48 | * frag_t | |
49 | * | |
50 | * description of an individual fragment. that is, a particular piece | |
51 | * of the overall namespace. | |
52 | * | |
53 | * this is conceptually analogous to an ip address and netmask. | |
54 | * | |
55 | * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). | |
56 | * | |
57 | * we write it as v/b, where v is a value and b is the number of bits. | |
58 | * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, | |
59 | * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. | |
60 | * | |
61 | * this makes the right most bit of v the "most significant", which is the | |
62 | * opposite of what we usually see. | |
63 | */ | |
64 | ||
65 | /* | |
66 | * TODO: | |
67 | * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) | |
68 | * iteration efficient (see, e.g., try_assimilate_children() | |
69 | * - rework frag_t so that we mask the left-most (most significant) bits instead of | |
70 | * the right-most (least significant) bits. just because it's more intutive, and | |
71 | * matches the network/netmask concept. | |
72 | */ | |
73 | ||
74 | typedef uint32_t _frag_t; | |
75 | ||
76 | class frag_t { | |
77 | /* | |
78 | * encoding is dictated by frag_* functions in ceph_fs.h. use those | |
79 | * helpers _exclusively_. | |
80 | */ | |
81 | public: | |
82 | _frag_t _enc; | |
83 | ||
84 | frag_t() : _enc(0) { } | |
85 | frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { } | |
86 | frag_t(_frag_t e) : _enc(e) { } | |
87 | ||
88 | // constructors | |
89 | void from_unsigned(unsigned e) { _enc = e; } | |
90 | ||
91 | // accessors | |
92 | unsigned value() const { return ceph_frag_value(_enc); } | |
93 | unsigned bits() const { return ceph_frag_bits(_enc); } | |
94 | unsigned mask() const { return ceph_frag_mask(_enc); } | |
95 | unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); } | |
96 | ||
97 | operator _frag_t() const { return _enc; } | |
98 | ||
99 | // tests | |
100 | bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); } | |
101 | bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); } | |
102 | bool is_root() const { return bits() == 0; } | |
103 | frag_t parent() const { | |
104 | assert(bits() > 0); | |
105 | return frag_t(ceph_frag_parent(_enc)); | |
106 | } | |
107 | ||
108 | // splitting | |
109 | frag_t make_child(int i, int nb) const { | |
110 | assert(i < (1<<nb)); | |
111 | return frag_t(ceph_frag_make_child(_enc, nb, i)); | |
112 | } | |
113 | void split(int nb, std::list<frag_t>& fragments) const { | |
114 | assert(nb > 0); | |
115 | unsigned nway = 1 << nb; | |
116 | for (unsigned i=0; i<nway; i++) | |
117 | fragments.push_back(make_child(i, nb)); | |
118 | } | |
119 | ||
120 | // binary splitting | |
121 | frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); } | |
122 | frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); } | |
123 | ||
124 | bool is_left() const { return ceph_frag_is_left_child(_enc); } | |
125 | bool is_right() const { return ceph_frag_is_right_child(_enc); } | |
126 | frag_t get_sibling() const { | |
127 | assert(!is_root()); | |
128 | return frag_t(ceph_frag_sibling(_enc)); | |
129 | } | |
130 | ||
131 | // sequencing | |
132 | bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); } | |
133 | bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); } | |
134 | frag_t next() const { | |
135 | assert(!is_rightmost()); | |
136 | return frag_t(ceph_frag_next(_enc)); | |
137 | } | |
138 | ||
139 | // parse | |
140 | bool parse(const char *s) { | |
141 | int pvalue, pbits; | |
142 | int r = sscanf(s, "%x/%d", &pvalue, &pbits); | |
143 | if (r == 2) { | |
144 | *this = frag_t(pvalue, pbits); | |
145 | return true; | |
146 | } | |
147 | return false; | |
148 | } | |
149 | }; | |
150 | ||
31f18b77 | 151 | inline std::ostream& operator<<(std::ostream& out, const frag_t& hb) |
7c673cae FG |
152 | { |
153 | //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '='; | |
154 | unsigned num = hb.bits(); | |
155 | if (num) { | |
156 | unsigned val = hb.value(); | |
157 | for (unsigned bit = 23; num; num--, bit--) | |
158 | out << ((val & (1<<bit)) ? '1':'0'); | |
159 | } | |
160 | return out << '*'; | |
161 | } | |
162 | ||
163 | inline void encode(frag_t f, bufferlist& bl) { encode_raw(f._enc, bl); } | |
164 | inline void decode(frag_t &f, bufferlist::iterator& p) { | |
165 | __u32 v; | |
166 | decode_raw(v, p); | |
167 | f._enc = v; | |
168 | } | |
169 | ||
170 | ||
171 | ||
172 | /** | |
173 | * fragtree_t -- partition an entire namespace into one or more frag_t's. | |
174 | */ | |
175 | class fragtree_t { | |
176 | // pairs <f, b>: | |
177 | // frag_t f is split by b bits. | |
178 | // if child frag_t does not appear, it is not split. | |
179 | public: | |
180 | compact_map<frag_t,int32_t> _splits; | |
181 | ||
182 | public: | |
183 | // ------------- | |
184 | // basics | |
185 | void swap(fragtree_t& other) { | |
186 | _splits.swap(other._splits); | |
187 | } | |
188 | void clear() { | |
189 | _splits.clear(); | |
190 | } | |
191 | ||
192 | // ------------- | |
193 | // accessors | |
194 | bool empty() const { | |
195 | return _splits.empty(); | |
196 | } | |
197 | int get_split(const frag_t hb) const { | |
198 | compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb); | |
199 | if (p == _splits.end()) | |
200 | return 0; | |
201 | else | |
202 | return p->second; | |
203 | } | |
204 | ||
205 | ||
206 | bool is_leaf(frag_t x) const { | |
207 | std::list<frag_t> ls; | |
208 | get_leaves_under(x, ls); | |
209 | //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl; | |
210 | if (!ls.empty() && | |
211 | ls.front() == x && | |
212 | ls.size() == 1) | |
213 | return true; | |
214 | return false; | |
215 | } | |
216 | ||
217 | /** | |
218 | * get_leaves -- list all leaves | |
219 | */ | |
220 | void get_leaves(std::list<frag_t>& ls) const { | |
221 | return get_leaves_under_split(frag_t(), ls); | |
222 | } | |
223 | ||
224 | /** | |
225 | * get_leaves_under_split -- list all leaves under a known split point (or root) | |
226 | */ | |
227 | void get_leaves_under_split(frag_t under, std::list<frag_t>& ls) const { | |
228 | std::list<frag_t> q; | |
229 | q.push_back(under); | |
230 | while (!q.empty()) { | |
231 | frag_t t = q.back(); | |
232 | q.pop_back(); | |
233 | int nb = get_split(t); | |
234 | if (nb) | |
235 | t.split(nb, q); // queue up children | |
236 | else | |
237 | ls.push_front(t); // not spit, it's a leaf. | |
238 | } | |
239 | } | |
240 | ||
241 | /** | |
242 | * get_branch -- get branch point at OR above frag @a x | |
243 | * - may be @a x itself, if @a x is a split | |
244 | * - may be root (frag_t()) | |
245 | */ | |
246 | frag_t get_branch(frag_t x) const { | |
247 | while (1) { | |
248 | if (x == frag_t()) return x; // root | |
249 | if (get_split(x)) return x; // found it! | |
250 | x = x.parent(); | |
251 | } | |
252 | } | |
253 | ||
254 | /** | |
255 | * get_branch_above -- get a branch point above frag @a x | |
256 | * - may be root (frag_t()) | |
257 | * - may NOT be @a x, even if @a x is a split. | |
258 | */ | |
259 | frag_t get_branch_above(frag_t x) const { | |
260 | while (1) { | |
261 | if (x == frag_t()) return x; // root | |
262 | x = x.parent(); | |
263 | if (get_split(x)) return x; // found it! | |
264 | } | |
265 | } | |
266 | ||
267 | ||
268 | /** | |
269 | * get_branch_or_leaf -- get branch or leaf point parent for frag @a x | |
270 | * - may be @a x itself, if @a x is a split or leaf | |
271 | * - may be root (frag_t()) | |
272 | */ | |
273 | frag_t get_branch_or_leaf(frag_t x) const { | |
274 | frag_t branch = get_branch(x); | |
275 | int nb = get_split(branch); | |
276 | if (nb > 0 && // if branch is a split, and | |
277 | branch.bits() + nb <= x.bits()) // one of the children is or contains x | |
278 | return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) | |
279 | else | |
280 | return branch; | |
281 | } | |
282 | ||
283 | /** | |
284 | * get_leaves_under(x, ls) -- search for any leaves fully contained by x | |
285 | */ | |
286 | void get_leaves_under(frag_t x, std::list<frag_t>& ls) const { | |
287 | std::list<frag_t> q; | |
288 | q.push_back(get_branch_or_leaf(x)); | |
289 | while (!q.empty()) { | |
290 | frag_t t = q.front(); | |
291 | q.pop_front(); | |
292 | if (t.bits() >= x.bits() && // if t is more specific than x, and | |
293 | !x.contains(t)) // x does not contain t, | |
294 | continue; // then skip | |
295 | int nb = get_split(t); | |
296 | if (nb) | |
297 | t.split(nb, q); // queue up children | |
298 | else if (x.contains(t)) | |
299 | ls.push_back(t); // not spit, it's a leaf. | |
300 | } | |
301 | } | |
302 | ||
303 | /** | |
304 | * contains(fg) -- does fragtree contain the specific frag @a x | |
305 | */ | |
306 | bool contains(frag_t x) const { | |
307 | std::list<frag_t> q; | |
308 | q.push_back(get_branch(x)); | |
309 | while (!q.empty()) { | |
310 | frag_t t = q.front(); | |
311 | q.pop_front(); | |
312 | if (t.bits() >= x.bits() && // if t is more specific than x, and | |
313 | !x.contains(t)) // x does not contain t, | |
314 | continue; // then skip | |
315 | int nb = get_split(t); | |
316 | if (nb) { | |
317 | if (t == x) return false; // it's split. | |
318 | t.split(nb, q); // queue up children | |
319 | } else { | |
320 | if (t == x) return true; // it's there. | |
321 | } | |
322 | } | |
323 | return false; | |
324 | } | |
325 | ||
326 | /** | |
327 | * operator[] -- map a (hash?) value to a frag | |
328 | */ | |
329 | frag_t operator[](unsigned v) const { | |
330 | frag_t t; | |
331 | while (1) { | |
332 | assert(t.contains(v)); | |
333 | int nb = get_split(t); | |
334 | ||
335 | // is this a leaf? | |
336 | if (nb == 0) return t; // done. | |
337 | ||
338 | // pick appropriate child fragment. | |
339 | unsigned nway = 1 << nb; | |
340 | unsigned i; | |
341 | for (i=0; i<nway; i++) { | |
342 | frag_t n = t.make_child(i, nb); | |
343 | if (n.contains(v)) { | |
344 | t = n; | |
345 | break; | |
346 | } | |
347 | } | |
348 | assert(i < nway); | |
349 | } | |
350 | } | |
351 | ||
352 | ||
353 | // --------------- | |
354 | // modifiers | |
355 | void split(frag_t x, int b, bool simplify=true) { | |
356 | assert(is_leaf(x)); | |
357 | _splits[x] = b; | |
358 | ||
359 | if (simplify) | |
360 | try_assimilate_children(get_branch_above(x)); | |
361 | } | |
362 | void merge(frag_t x, int b, bool simplify=true) { | |
363 | assert(!is_leaf(x)); | |
364 | assert(_splits[x] == b); | |
365 | _splits.erase(x); | |
366 | ||
367 | if (simplify) | |
368 | try_assimilate_children(get_branch_above(x)); | |
369 | } | |
370 | ||
371 | /* | |
372 | * if all of a given split's children are identically split, | |
373 | * then the children can be assimilated. | |
374 | */ | |
375 | void try_assimilate_children(frag_t x) { | |
376 | int nb = get_split(x); | |
377 | if (!nb) return; | |
378 | std::list<frag_t> children; | |
379 | x.split(nb, children); | |
380 | int childbits = 0; | |
381 | for (std::list<frag_t>::iterator p = children.begin(); | |
382 | p != children.end(); | |
383 | ++p) { | |
384 | int cb = get_split(*p); | |
385 | if (!cb) return; // nope. | |
386 | if (childbits && cb != childbits) return; // not the same | |
387 | childbits = cb; | |
388 | } | |
389 | // all children are split with childbits! | |
390 | for (std::list<frag_t>::iterator p = children.begin(); | |
391 | p != children.end(); | |
392 | ++p) | |
393 | _splits.erase(*p); | |
394 | _splits[x] += childbits; | |
395 | } | |
396 | ||
397 | bool force_to_leaf(CephContext *cct, frag_t x) { | |
398 | if (is_leaf(x)) | |
399 | return false; | |
400 | ||
401 | lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl; | |
402 | ||
403 | frag_t parent = get_branch_or_leaf(x); | |
404 | assert(parent.bits() <= x.bits()); | |
405 | lgeneric_dout(cct, 10) << "parent is " << parent << dendl; | |
406 | ||
407 | // do we need to split from parent to x? | |
408 | if (parent.bits() < x.bits()) { | |
409 | int spread = x.bits() - parent.bits(); | |
410 | int nb = get_split(parent); | |
411 | lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl; | |
412 | if (nb == 0) { | |
413 | // easy: split parent (a leaf) by the difference | |
414 | lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl; | |
415 | split(parent, spread); | |
416 | assert(is_leaf(x)); | |
417 | return true; | |
418 | } | |
419 | assert(nb > spread); | |
420 | ||
421 | // add an intermediary split | |
422 | merge(parent, nb, false); | |
423 | split(parent, spread, false); | |
424 | ||
425 | std::list<frag_t> subs; | |
426 | parent.split(spread, subs); | |
427 | for (std::list<frag_t>::iterator p = subs.begin(); | |
428 | p != subs.end(); | |
429 | ++p) { | |
430 | lgeneric_dout(cct, 10) << "splitting intermediate " << *p << " by " << (nb-spread) << dendl; | |
431 | split(*p, nb - spread, false); | |
432 | } | |
433 | } | |
434 | ||
435 | // x is now a leaf or split. | |
436 | // hoover up any children. | |
437 | std::list<frag_t> q; | |
438 | q.push_back(x); | |
439 | while (!q.empty()) { | |
440 | frag_t t = q.front(); | |
441 | q.pop_front(); | |
442 | int nb = get_split(t); | |
443 | if (nb) { | |
444 | lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl; | |
445 | merge(t, nb, false); // merge this point, and | |
446 | t.split(nb, q); // queue up children | |
447 | } | |
448 | } | |
449 | ||
450 | lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl; | |
451 | assert(is_leaf(x)); | |
452 | return true; | |
453 | } | |
454 | ||
455 | // encoding | |
456 | void encode(bufferlist& bl) const { | |
457 | ::encode(_splits, bl); | |
458 | } | |
459 | void decode(bufferlist::iterator& p) { | |
460 | ::decode(_splits, p); | |
461 | } | |
462 | void encode_nohead(bufferlist& bl) const { | |
463 | for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin(); | |
464 | p != _splits.end(); | |
465 | ++p) { | |
466 | ::encode(p->first, bl); | |
467 | ::encode(p->second, bl); | |
468 | } | |
469 | } | |
470 | void decode_nohead(int n, bufferlist::iterator& p) { | |
471 | _splits.clear(); | |
472 | while (n-- > 0) { | |
473 | frag_t f; | |
474 | ::decode(f, p); | |
475 | ::decode(_splits[f], p); | |
476 | } | |
477 | } | |
478 | ||
479 | void print(std::ostream& out) { | |
480 | out << "fragtree_t("; | |
481 | std::list<frag_t> q; | |
482 | q.push_back(frag_t()); | |
483 | while (!q.empty()) { | |
484 | frag_t t = q.front(); | |
485 | q.pop_front(); | |
486 | // newline + indent? | |
487 | if (t.bits()) { | |
488 | out << std::endl; | |
489 | for (unsigned i=0; i<t.bits(); i++) out << ' '; | |
490 | } | |
491 | int nb = get_split(t); | |
492 | if (nb) { | |
493 | out << t << " %" << nb; | |
494 | t.split(nb, q); // queue up children | |
495 | } else { | |
496 | out << t; | |
497 | } | |
498 | } | |
499 | out << ")"; | |
500 | } | |
501 | ||
502 | void dump(Formatter *f) const { | |
503 | f->open_array_section("splits"); | |
504 | for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin(); | |
505 | p != _splits.end(); | |
506 | ++p) { | |
507 | f->open_object_section("split"); | |
508 | std::ostringstream frag_str; | |
509 | frag_str << p->first; | |
510 | f->dump_string("frag", frag_str.str()); | |
511 | f->dump_int("children", p->second); | |
512 | f->close_section(); // split | |
513 | } | |
514 | f->close_section(); // splits | |
515 | } | |
516 | }; | |
517 | WRITE_CLASS_ENCODER(fragtree_t) | |
518 | ||
519 | inline bool operator==(const fragtree_t& l, const fragtree_t& r) { | |
520 | return l._splits == r._splits; | |
521 | } | |
522 | inline bool operator!=(const fragtree_t& l, const fragtree_t& r) { | |
523 | return l._splits != r._splits; | |
524 | } | |
525 | ||
526 | inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft) | |
527 | { | |
528 | out << "fragtree_t("; | |
529 | ||
530 | for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin(); | |
531 | p != ft._splits.end(); | |
532 | ++p) { | |
533 | if (p != ft._splits.begin()) | |
534 | out << " "; | |
535 | out << p->first << "^" << p->second; | |
536 | } | |
537 | return out << ")"; | |
538 | } | |
539 | ||
540 | ||
541 | /** | |
542 | * fragset_t -- a set of fragments | |
543 | */ | |
544 | class fragset_t { | |
545 | std::set<frag_t> _set; | |
546 | ||
547 | public: | |
548 | const std::set<frag_t> &get() const { return _set; } | |
549 | std::set<frag_t>::iterator begin() { return _set.begin(); } | |
550 | std::set<frag_t>::iterator end() { return _set.end(); } | |
551 | ||
552 | bool empty() const { return _set.empty(); } | |
553 | ||
554 | bool contains(frag_t f) const { | |
555 | while (1) { | |
556 | if (_set.count(f)) return true; | |
557 | if (f.bits() == 0) return false; | |
558 | f = f.parent(); | |
559 | } | |
560 | } | |
561 | ||
562 | void insert(frag_t f) { | |
563 | _set.insert(f); | |
564 | simplify(); | |
565 | } | |
566 | ||
567 | void simplify() { | |
568 | while (1) { | |
569 | bool clean = true; | |
570 | std::set<frag_t>::iterator p = _set.begin(); | |
571 | while (p != _set.end()) { | |
572 | if (!p->is_root() && | |
573 | _set.count(p->get_sibling())) { | |
574 | _set.erase(p->get_sibling()); | |
575 | _set.insert(p->parent()); | |
576 | _set.erase(p++); | |
577 | clean = false; | |
578 | } else { | |
579 | p++; | |
580 | } | |
581 | } | |
582 | if (clean) | |
583 | break; | |
584 | } | |
585 | } | |
586 | }; | |
587 | ||
588 | inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs) | |
589 | { | |
590 | return out << "fragset_t(" << fs.get() << ")"; | |
591 | } | |
592 | ||
593 | #endif |